rocPRIM
|
The warp_scan class is a warp level parallel primitive which provides methods for performing inclusive and exclusive scan operations of items partitioned across threads in a hardware warp. More...
#include <warp_scan.hpp>
Public Types | |
using | storage_type = typename base_type::storage_type |
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. More... | |
Public Member Functions | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | inclusive_scan (T input, T &output, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive scan across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | inclusive_scan (T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive scan across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | inclusive_scan (T input, T &output, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive scan and reduction across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | inclusive_scan (T, T &, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive scan and reduction across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | exclusive_scan (T input, T &output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs exclusive scan across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | exclusive_scan (T, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs exclusive scan across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | exclusive_scan (T input, T &output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs exclusive scan and reduction across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | exclusive_scan (T, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs exclusive scan and reduction across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | scan (T input, T &inclusive_output, T &exclusive_output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive and exclusive scan operations across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | scan (T, T &, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive and exclusive scan operations across threads Invalid Warp Size. | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | scan (T input, T &inclusive_output, T &exclusive_output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp. More... | |
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | scan (T, T &, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
Performs inclusive and exclusive scan operations across threads Invalid Warp Size. | |
template<unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | broadcast (T input, const unsigned int src_lane, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), T >::type |
Broadcasts value from one thread to all threads in logical warp. More... | |
template<unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | broadcast (T, const unsigned int, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T >::type |
Broadcasts value from one thread to all threads in logical warp. More... | |
Protected Member Functions | |
template<unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | to_exclusive (T inclusive_input, T &exclusive_output, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type |
template<unsigned int FunctionWarpSize = WarpSize> | |
ROCPRIM_DEVICE ROCPRIM_INLINE auto | to_exclusive (T, T &, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type |
The warp_scan class is a warp level parallel primitive which provides methods for performing inclusive and exclusive scan operations of items partitioned across threads in a hardware warp.
T | - the input/output type. |
WarpSize | - the size of logical warp size, which can be equal to or less than the size of hardware warp (see rocprim::device_warp_size()). Scan operations are performed separately within groups determined by WarpSize. |
WarpSize
must be equal to or less than the size of hardware warp (see rocprim::device_warp_size()). If it is less, scan is performed separately within groups determined by WarpSize. WarpSize
is 4, hardware warp is 64, scan will be performed in logical warps grouped like this: { {0, 1, 2, 3}, {4, 5, 6, 7 }, ..., {60, 61, 62, 63} }
(thread is represented here by its id within hardware warp).WarpSize
consecutive threads from the same hardware warp.WarpSize
;In the examples scan operation is performed on groups of 16 threads, each provides one int
value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 64.
using warp_scan< T, WarpSize >::storage_type = typename base_type::storage_type |
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.
Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords __shared__
. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
|
inline |
Broadcasts value from one thread to all threads in logical warp.
[in] | input | - value to broadcast. |
[in] | src_lane | - id of the thread whose value should be broadcasted |
[in] | storage | - reference to a temporary storage object of type storage_type. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.
|
inline |
Broadcasts value from one thread to all threads in logical warp.
Invalid Warp Size
|
inline |
Performs exclusive scan across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | output | - reference to a thread output value. May be aliased with input . |
[in] | init | - initial value used to start the exclusive scan. Should be the same for all threads in a logical warp. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present exclusive min scan operations performed on groups of 32 threads, each provides one float
value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.
If the initial value is 100
and input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}
, then output values in the first logical warp will be {100, 1, -2, -2, -4, ..., -30},
in the second: {100, 33, -34, -34, -36, ..., -62}
etc.
|
inline |
Performs exclusive scan across threads in a logical warp.
Invalid Warp Size
|
inline |
Performs exclusive scan and reduction across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | output | - reference to a thread output value. May be aliased with input . |
[in] | init | - initial value used to start the exclusive scan. Should be the same for all threads in a logical warp. |
[out] | reduction | - result of reducing of all input values in logical warp. init value is not included in the reduction. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present exclusive prefix sum operations performed on groups of 64 threads, each thread provides one int
value. Hardware warp size is 64. Block (tile) size is 256.
If the initial value is 10
and input
values across threads in a block/tile are {1, 1, ..., 1, 1}
, then output
values in every logical warp will be {10, 11, 12, 13, ..., 73}
. The reduction
will be 64.
|
inline |
Performs exclusive scan and reduction across threads in a logical warp.
Invalid Warp Size
|
inline |
Performs inclusive scan across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | output | - reference to a thread output value. May be aliased with input . |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present inclusive min scan operations performed on groups of 32 threads, each provides one float
value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.
If the input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}
, then output values in the first logical warp will be {1, -2, -2, -4, ..., -32},
in the second: {33, -34, -34, -36, ..., -64}
etc.
|
inline |
Performs inclusive scan across threads in a logical warp.
Invalid Warp Size
|
inline |
Performs inclusive scan and reduction across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | output | - reference to a thread output value. May be aliased with input . |
[out] | reduction | - result of reducing of all input values in logical warp. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present inclusive prefix sum operations performed on groups of 64 threads, each thread provides one int
value. Hardware warp size is 64. Block (tile) size is 256.
If the input
values across threads in a block/tile are {1, 1, 1, 1, ..., 1, 1}
, then output
values in the every logical warp will be {1, 2, 3, 4, ..., 64}
. The reduction
will be equal 64
.
|
inline |
Performs inclusive scan and reduction across threads in a logical warp.
Invalid Warp Size
|
inline |
Performs inclusive and exclusive scan operations across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | inclusive_output | - reference to a thread inclusive-scan output value. |
[out] | exclusive_output | - reference to a thread exclusive-scan output value. |
[in] | init | - initial value used to start the exclusive scan. Should be the same for all threads in a logical warp. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present min inclusive and exclusive scan operations performed on groups of 32 threads, each provides one float
value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.
If the initial value is 100
and input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}
, then in_output
values in the first logical warp will be {1, -2, -2, -4, ..., -32},
in the second: {33, -34, -34, -36, ..., -64}
and so forth, ex_output
values in the first logical warp will be {100, 1, -2, -2, -4, ..., -30},
in the second: {100, 33, -34, -34, -36, ..., -62}
etc.
|
inline |
Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp.
BinaryFunction | - type of binary function used for scan. Default type is rocprim::plus<T>. |
[in] | input | - thread input value. |
[out] | inclusive_output | - reference to a thread inclusive-scan output value. |
[out] | exclusive_output | - reference to a thread exclusive-scan output value. |
[in] | init | - initial value used to start the exclusive scan. Should be the same for all threads in a logical warp. |
[out] | reduction | - result of reducing of all input values in logical warp. init value is not included in the reduction. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | scan_op | - binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.The examples present inclusive and exclusive prefix sum operations performed on groups of 64 threads, each thread provides one int
value. Hardware warp size is 64. Block (tile) size is 256.
If the initial value is 10
and input
values across threads in a block/tile are {1, 1, ..., 1, 1}
, then in_output
values in every logical warp will be {1, 2, 3, 4, ..., 63, 64}
, and ex_output
values in every logical warp will be {10, 11, 12, 13, ..., 73}
. The reduction
will be 64.