- Public Member Functions

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs reduction across threads in a logical warp.

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, storage_type &, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs reduction across threads in a logical warp.

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, int valid_items, storage_type &storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs reduction across threads in a logical warp.

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, int, storage_type &, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs reduction across threads in a logical warp.

template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs head-segmented reduction across threads in a logical warp.

template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs head-segmented reduction across threads in a logical warp.

template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs tail-segmented reduction across threads in a logical warp.

template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op = BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
    Performs tail-segmented reduction across threads in a logical warp.
template<class T, unsigned int WarpSize = device_warp_size(), bool UseAllReduce = false>
class warp_reduce< T, WarpSize, UseAllReduce >
The warp_reduce class is a warp-level parallel primitive that provides methods for performing reduction operations on items partitioned across threads in a hardware warp.
- Template Parameters
T - the input/output type.
WarpSize - the size of the logical warp, which must be equal to or less than the size of the hardware warp (see rocprim::device_warp_size()). Reduce operations are performed separately within groups determined by WarpSize.
UseAllReduce - whether to broadcast the final reduction value to all threads (default is false).
- Overview
- WarpSize must be equal to or less than the size of the hardware warp (see rocprim::device_warp_size()). If it is less, the reduction is performed separately within groups determined by WarpSize. For example, if WarpSize is 4 and the hardware warp is 64, the reduction is performed in logical warps grouped like this: { {0, 1, 2, 3}, {4, 5, 6, 7}, ..., {60, 61, 62, 63} } (each thread is represented here by its id within the hardware warp).
- A logical warp is a group of WarpSize consecutive threads from the same hardware warp.
- Supports non-commutative reduce operators. However, the reduce operator should be associative; when used with non-associative functions, the results may be non-deterministic and/or vary in precision.
- The number of threads executing warp_reduce's functions must be a multiple of WarpSize.
- All threads from a logical warp must be in the same hardware warp.
- Examples
In the examples below, the reduce operation is performed on groups of 16 threads; each thread provides one int value, and the result is returned using the same variable as the input. The hardware warp size is 64 and the block (tile) size is 64.
__global__ void example_kernel(...)
{
    // specialize warp_reduce for int and a logical warp of 16 threads
    using warp_reduce_int = rocprim::warp_reduce<int, 16>;
    // allocate storage in shared memory, one entry per logical warp (64/16 = 4)
    __shared__ warp_reduce_int::storage_type temp[4];

    int logical_warp_id = threadIdx.x/16;
    int value = ...;
    // perform the reduction; the result is returned to the first thread of
    // each logical warp (or to all of its threads if UseAllReduce is true)
    warp_reduce_int().reduce(
        value,                 // input
        value,                 // output
        temp[logical_warp_id]
    );
    ...
}
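
When only the first part of each logical warp holds meaningful input, the reduce overload that takes a valid_items count can be used instead. The following is a minimal sketch under the same launch configuration; the kernel name and the way valid_items reaches the kernel are illustrative, not part of the library.
__global__ void example_kernel_partial(int valid_items, ...)
{
    // specialize warp_reduce for int and a logical warp of 16 threads
    using warp_reduce_int = rocprim::warp_reduce<int, 16>;
    // allocate storage in shared memory, one entry per logical warp
    __shared__ warp_reduce_int::storage_type temp[4];

    int logical_warp_id = threadIdx.x/16;
    int value = ...;
    // reduce only the first valid_items values of each logical warp
    warp_reduce_int().reduce(
        value,                 // input
        value,                 // output
        valid_items,
        temp[logical_warp_id]
    );
    ...
}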
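
head_segmented_reduce (and its tail counterpart) reduces flagged segments independently within each logical warp: with head flags, a set flag marks the first item of a segment; with tail flags, a set flag marks the last. Below is a minimal sketch assuming each thread supplies a head flag for its item; how the flags are produced is illustrative.
__global__ void example_kernel_segmented(...)
{
    // specialize warp_reduce for int and a logical warp of 16 threads
    using warp_reduce_int = rocprim::warp_reduce<int, 16>;
    // allocate storage in shared memory, one entry per logical warp
    __shared__ warp_reduce_int::storage_type temp[4];

    int logical_warp_id = threadIdx.x/16;
    int value = ...;
    // a set flag marks the first item of a segment
    bool flag = ...;
    // reduce each flagged segment independently within the logical warp
    warp_reduce_int().head_segmented_reduce(
        value,                 // input
        value,                 // output
        flag,
        temp[logical_warp_id]
    );
    ...
}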