rocPRIM
|
The block_exchange
class is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block.
More...
#include <block_exchange.hpp>
Public Types | |
using | storage_type = detail::raw_storage< storage_type_ > |
Struct used to allocate temporary memory that is required for thread communication during operations provided by the related parallel primitive. More... | |
Public Member Functions | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | blocked_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread]) |
Transposes a blocked arrangement of items to a striped arrangement across the thread block. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | blocked_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage) |
Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread]) |
Transposes a striped arrangement of items to a blocked arrangement across the thread block. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage) |
Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | blocked_to_warp_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread]) |
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | blocked_to_warp_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage) |
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | warp_striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread]) |
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block. More... | |
template<class U > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | warp_striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage) |
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | scatter_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread]) |
Scatters items to a blocked arrangement based on their ranks across the thread block. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | gather_from_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread]) |
Gathers items from a striped arrangement based on their ranks across the thread block. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | scatter_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage) |
Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | gather_from_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage) |
Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | scatter_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread]) |
Scatters items to a striped arrangement based on their ranks across the thread block. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | scatter_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage) |
Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | scatter_to_striped_guarded (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread]) |
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank. More... | |
template<class U , class Offset > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | scatter_to_striped_guarded (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage) |
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage. More... | |
template<class U , class Offset , class ValidFlag > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | scatter_to_striped_flagged (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread]) |
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity. More... | |
template<class U , class Offset , class ValidFlag > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | scatter_to_striped_flagged (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread], storage_type &storage) |
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage. More... | |
The block_exchange
class is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block.
T | - the input type. |
BlockSize | - the number of threads in a block. |
ItemsPerThread | - the number of items contributed by each thread. |
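The block_exchange class supports the following rearrangement methods: blocked_to_striped(), striped_to_blocked(), blocked_to_warp_striped(), warp_striped_to_blocked(), scatter_to_blocked(), gather_from_striped(), scatter_to_striped(), scatter_to_striped_guarded(), and scatter_to_striped_flagged().
In the examples, the exchange operation is performed on a block of 128 threads, using type int with 8 items per thread.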
using block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::storage_type = detail::raw_storage<storage_type_> |
Struct used to allocate temporary memory that is required for thread communication during operations provided by the related parallel primitive.
Depending on the implementation, the operations exposed by the parallel primitive may require temporary storage for thread communication. The storage should be allocated using the __shared__ keyword. It can be aliased to externally allocated memory, or be part of a union type with other storage types to increase shared memory reusability.
|
inline |
Transposes a blocked arrangement of items to a striped arrangement across the thread block.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
|
inline |
Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
|
inline |
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Gathers items from a striped arrangement based on their ranks across the thread block.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
|
inline |
Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
|
inline |
Scatters items to a blocked arrangement based on their ranks across the thread block.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
|
inline |
Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that holds the rank of each item. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
ValidFlag | - [inferred] the validity flag type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that has rank of data. |
[in] | is_valid | - array that has flags to denote validity. |
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
ValidFlag | - [inferred] the validity flag type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that has rank of data. |
[in] | is_valid | - array that has flags to denote validity. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that has rank of data. |
|
inline |
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage.
U | - [inferred] the output type. |
Offset | - [inferred] the rank type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | ranks | - array that has rank of data. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Transposes a striped arrangement of items to a blocked arrangement across the thread block.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
|
inline |
Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
|
inline |
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.
U | - [inferred] the output type. |
[in] | input | - array that data is loaded from. |
[out] | output | - array that data is loaded to. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().