The warp_exchange class is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp. More...

#include <warp_exchange.hpp>

Public Types
using	storage_type = detail::raw_storage< storage_type_ >
	Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive. More...

Public Member Functions
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void	blocked_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
	Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage. More...

template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void	blocked_to_striped_shuffle (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
	Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations. More...

template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void	striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
	Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage. More...

template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void	striped_to_blocked_shuffle (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
	Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations. More...

template<class U , class OffsetT >
ROCPRIM_DEVICE ROCPRIM_INLINE void	scatter_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const OffsetT(&ranks)[ItemsPerThread], storage_type &storage)
	Orders `input` values according to ranks using temporary storage, then writes the values to `output` in a striped manner. More...

Detailed Description

template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>
class warp_exchange< T, ItemsPerThread, WarpSize >

The warp_exchange class is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.

Template Parameters

T	- the input type.
ItemsPerThread	- the number of items contributed by each thread.
WarpSize	- the number of threads in a warp.

Overview

The warp_exchange class supports the following rearrangement methods:
- Transposing a blocked arrangement to a striped arrangement.
- Transposing a striped arrangement to a blocked arrangement.

Examples: In the example an exchange operation is performed on a warp of 8 threads, using type int with 4 items per thread.

__global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
// allocate storage in shared memory
__shared__ warp_exchange_int::storage_type storage[warps_per_block];
int items[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.blocked_to_striped(items, items, storage[warp_id]);
...
}

Member Typedef Documentation

◆ storage_type

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

using warp_exchange< T, ItemsPerThread, WarpSize >::storage_type = detail::raw_storage<storage_type_>

Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive.

Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords __shared__. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.

Member Function Documentation

◆ blocked_to_striped()

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

template<class U >

ROCPRIM_DEVICE ROCPRIM_INLINE void warp_exchange< T, ItemsPerThread, WarpSize >::blocked_to_striped	(	const T(&)	input[ItemsPerThread],
		U(&)	output[ItemsPerThread],
		storage_type &	storage
	)

inline

Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage.

Template Parameters

U	- [inferred] the output type.

Parameters

[in]	input	- array that data is loaded from.
[out]	output	- array that data is loaded to.
[in]	storage	- reference to a temporary storage object of type storage_type.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.: __global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
// allocate storage in shared memory
__shared__ warp_exchange_int::storage_type storage[warps_per_block];
int items[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.blocked_to_striped(items, items, storage[warp_id]);
...
}

◆ blocked_to_striped_shuffle()

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

template<class U >

ROCPRIM_DEVICE ROCPRIM_INLINE void warp_exchange< T, ItemsPerThread, WarpSize >::blocked_to_striped_shuffle	(	const T(&)	input[ItemsPerThread],
		U(&)	output[ItemsPerThread]
	)

inline

Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations.

Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.

Template Parameters

U	- [inferred] the output type.

Parameters

[in]	input	- array that data is loaded from.
[out]	output	- array that data is loaded to.

Example.: __global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
int items[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.blocked_to_striped_shuffle(items, items);
...
}

◆ scatter_to_striped()

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

template<class U , class OffsetT >

ROCPRIM_DEVICE ROCPRIM_INLINE void warp_exchange< T, ItemsPerThread, WarpSize >::scatter_to_striped	(	const T(&)	input[ItemsPerThread],
		U(&)	output[ItemsPerThread],
		const OffsetT(&)	ranks[ItemsPerThread],
		storage_type &	storage
	)

inline

Orders input values according to ranks using temporary storage, then writes the values to output in a striped manner.

No values in ranks should exists that exceed WarpSize*ItemsPerThread-1 .

Template Parameters

U	- [inferred] the output type.

Parameters

[in]	input	- array that data is loaded from.
[out]	output	- array that data is loaded to.
[in]	ranks	- array containing the positions.
[in]	storage	- reference to a temporary storage object of type storage_type.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.: __global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
// allocate storage in shared memory
__shared__ warp_exchange_int::storage_type storage[warps_per_block];
int items[items_per_thread];
// data-type of `ranks` should be able to contain warp_size*items_per_thread unique elements
// unsigned short is sufficient for up to 1024*64 elements
unsigned short ranks[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.scatter_to_striped(items, items, ranks, storage[warp_id]);
...
}

◆ striped_to_blocked()

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

template<class U >

ROCPRIM_DEVICE ROCPRIM_INLINE void warp_exchange< T, ItemsPerThread, WarpSize >::striped_to_blocked	(	const T(&)	input[ItemsPerThread],
		U(&)	output[ItemsPerThread],
		storage_type &	storage
	)

inline

Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage.

Template Parameters

U	- [inferred] the output type.

Parameters

[in]	input	- array that data is loaded from.
[out]	output	- array that data is loaded to.
[in]	storage	- reference to a temporary storage object of type storage_type.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Example.: __global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, threads_per_warp, items_per_thread>;
// allocate storage in shared memory
__shared__ warp_exchange_int::storage_type storage[warps_per_block];
int items[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.striped_to_blocked(items, items, storage[warp_id]);
...
}

◆ striped_to_blocked_shuffle()

template<class T , unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>

template<class U >

ROCPRIM_DEVICE ROCPRIM_INLINE void warp_exchange< T, ItemsPerThread, WarpSize >::striped_to_blocked_shuffle	(	const T(&)	input[ItemsPerThread],
		U(&)	output[ItemsPerThread]
	)

inline

Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations.

Caution: this API is experimental. Performance might not be consistent. ItemsPerThread must be a divisor of WarpSize.

Template Parameters

U	- [inferred] the output type.

Parameters

[in]	input	- array that data is loaded from.
[out]	output	- array that data is loaded to.

Example.: __global__ void example_kernel(...)
{
constexpr unsigned int threads_per_block = 128;
constexpr unsigned int threads_per_warp = 8;
constexpr unsigned int items_per_thread = 4;
constexpr unsigned int warps_per_block = threads_per_block / threads_per_warp;
const unsigned int warp_id = hipThreadIdx_x / threads_per_warp;
// specialize warp_exchange for int, warp of 8 threads and 4 items per thread
using warp_exchange_int = rocprim::warp_exchange<int, items_per_thread, threads_per_warp>;
int items[items_per_thread];
...
warp_exchange_int w_exchange;
w_exchange.striped_to_blocked_shuffle(items, items);
...
}

The documentation for this class was generated from the following file:

rocprim/include/rocprim/warp/warp_exchange.hpp

Public Types

Public Member Functions

Detailed Description

template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()> class warp_exchange< T, ItemsPerThread, WarpSize >

Member Typedef Documentation

◆ storage_type

Member Function Documentation

◆ blocked_to_striped()

◆ blocked_to_striped_shuffle()

◆ scatter_to_striped()

◆ striped_to_blocked()

◆ striped_to_blocked_shuffle()

template<class T, unsigned int ItemsPerThread, unsigned int WarpSize = ::rocprim::device_warp_size()>
class warp_exchange< T, ItemsPerThread, WarpSize >