21 #ifndef ROCPRIM_WARP_WARP_REDUCE_HPP_ 22 #define ROCPRIM_WARP_WARP_REDUCE_HPP_ 24 #include <type_traits> 26 #include "../config.hpp" 27 #include "../detail/various.hpp" 29 #include "../intrinsics.hpp" 30 #include "../functional.hpp" 31 #include "../types.hpp" 33 #include "detail/warp_reduce_crosslane.hpp" 34 #include "detail/warp_reduce_shared_mem.hpp" 39 BEGIN_ROCPRIM_NAMESPACE
45 template<
class T,
unsigned int WarpSize,
bool UseAllReduce>
48 typedef typename std::conditional<
51 detail::warp_reduce_crosslane<T, WarpSize, UseAllReduce>,
112 bool UseAllReduce =
false 115 #ifndef DOXYGEN_SHOULD_SKIP_THIS
119 using base_type =
typename detail::select_warp_reduce_impl<T, WarpSize, UseAllReduce>::type;
122 static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE,
"WarpSize can't be greater than hardware warp size.");
179 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
180 ROCPRIM_DEVICE ROCPRIM_INLINE
184 BinaryFunction reduce_op = BinaryFunction())
185 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
192 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
193 ROCPRIM_DEVICE ROCPRIM_INLINE
197 BinaryFunction reduce_op = BinaryFunction())
198 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
251 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
252 ROCPRIM_DEVICE ROCPRIM_INLINE
257 BinaryFunction reduce_op = BinaryFunction())
258 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
265 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
266 ROCPRIM_DEVICE ROCPRIM_INLINE
271 BinaryFunction reduce_op = BinaryFunction())
272 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
297 template<
class Flag,
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
298 ROCPRIM_DEVICE ROCPRIM_INLINE
303 BinaryFunction reduce_op = BinaryFunction())
304 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
306 base_type::head_segmented_reduce(input, output, flag, storage, reduce_op);
311 template<
class Flag,
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
312 ROCPRIM_DEVICE ROCPRIM_INLINE
317 BinaryFunction reduce_op = BinaryFunction())
318 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
343 template<
class Flag,
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
344 ROCPRIM_DEVICE ROCPRIM_INLINE
349 BinaryFunction reduce_op = BinaryFunction())
350 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
352 base_type::tail_segmented_reduce(input, output, flag, storage, reduce_op);
357 template<
class Flag,
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
358 ROCPRIM_DEVICE ROCPRIM_INLINE
363 BinaryFunction reduce_op = BinaryFunction())
364 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
372 END_ROCPRIM_NAMESPACE
377 #endif // ROCPRIM_WARP_WARP_REDUCE_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs head-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:313
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:194
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
typename base_type::storage_type storage_type
Struct used to allocate temporary memory that is required for thread communication during operation...
Definition: warp_reduce.hpp:133
hipError_t reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduction primitive for device level.
Definition: device_reduce.hpp:374
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: warp_reduce.hpp:46
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, int valid_items, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:253
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs tail-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:359
#define ROCPRIM_PRINT_ERROR_ONCE(message)
Prints the supplied error message only once (using only one of the active threads).
Definition: functional.hpp:42
The warp_reduce class is a warp-level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, int, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:267
Definition: warp_reduce_shared_mem.hpp:43
Definition: various.hpp:108
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs tail-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:345
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs head-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:299
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:181