21 #ifndef ROCPRIM_WARP_WARP_SCAN_HPP_ 22 #define ROCPRIM_WARP_WARP_SCAN_HPP_ 24 #include <type_traits> 26 #include "../config.hpp" 27 #include "../detail/various.hpp" 29 #include "../intrinsics.hpp" 30 #include "../functional.hpp" 31 #include "../types.hpp" 33 #include "detail/warp_scan_crosslane.hpp" 34 #include "detail/warp_scan_shared_mem.hpp" 39 BEGIN_ROCPRIM_NAMESPACE
45 template<
class T,
unsigned int WarpSize>
48 typedef typename std::conditional<
51 detail::warp_scan_crosslane<T, WarpSize>,
112 #ifndef DOXYGEN_SHOULD_SKIP_THIS
116 using base_type =
typename detail::select_warp_scan_impl<T, WarpSize>::type;
119 static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE,
"WarpSize can't be greater than hardware warp size.");
180 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
181 ROCPRIM_DEVICE ROCPRIM_INLINE
185 BinaryFunction scan_op = BinaryFunction())
186 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
193 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
194 ROCPRIM_DEVICE ROCPRIM_INLINE
198 BinaryFunction scan_op = BinaryFunction())
199 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
255 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
256 ROCPRIM_DEVICE ROCPRIM_INLINE
261 BinaryFunction scan_op = BinaryFunction())
262 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
269 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
270 ROCPRIM_DEVICE ROCPRIM_INLINE
275 BinaryFunction scan_op = BinaryFunction())
276 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
335 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
336 ROCPRIM_DEVICE ROCPRIM_INLINE
341 BinaryFunction scan_op = BinaryFunction())
342 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
349 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
350 ROCPRIM_DEVICE ROCPRIM_INLINE
355 BinaryFunction scan_op = BinaryFunction())
356 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
416 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
417 ROCPRIM_DEVICE ROCPRIM_INLINE
423 BinaryFunction scan_op = BinaryFunction())
424 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
431 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
432 ROCPRIM_DEVICE ROCPRIM_INLINE
438 BinaryFunction scan_op = BinaryFunction())
439 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
504 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
505 ROCPRIM_DEVICE ROCPRIM_INLINE
511 BinaryFunction scan_op = BinaryFunction())
512 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
514 base_type::scan(input, inclusive_output, exclusive_output, init, storage, scan_op);
519 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
520 ROCPRIM_DEVICE ROCPRIM_INLINE
526 BinaryFunction scan_op = BinaryFunction())
527 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
592 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
593 ROCPRIM_DEVICE ROCPRIM_INLINE
600 BinaryFunction scan_op = BinaryFunction())
601 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
604 input, inclusive_output, exclusive_output, init, reduction,
611 template<
class BinaryFunction = ::rocprim::plus<T>,
unsigned int FunctionWarpSize = WarpSize>
612 ROCPRIM_DEVICE ROCPRIM_INLINE
619 BinaryFunction scan_op = BinaryFunction())
620 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
636 template<
unsigned int FunctionWarpSize = WarpSize>
637 ROCPRIM_DEVICE ROCPRIM_INLINE
639 const unsigned int src_lane,
641 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), T>::type
643 return base_type::broadcast(input, src_lane, storage);
648 template<
unsigned int FunctionWarpSize = WarpSize>
649 ROCPRIM_DEVICE ROCPRIM_INLINE
653 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T>::type
659 #ifndef DOXYGEN_SHOULD_SKIP_THIS 662 template<
unsigned int FunctionWarpSize = WarpSize>
663 ROCPRIM_DEVICE ROCPRIM_INLINE
664 auto to_exclusive(T inclusive_input, T& exclusive_output,
storage_type& storage)
665 ->
typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
667 return base_type::to_exclusive(inclusive_input, exclusive_output, storage);
670 template<
unsigned int FunctionWarpSize = WarpSize>
671 ROCPRIM_DEVICE ROCPRIM_INLINE
673 ->
typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE),
void>::type
681 END_ROCPRIM_NAMESPACE
686 #endif // ROCPRIM_WARP_WARP_SCAN_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T &inclusive_output, T &exclusive_output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp...
Definition: warp_scan.hpp:594
The warp_scan class is a warp level parallel primitive which provides methods for performing inclusiv...
Definition: warp_scan.hpp:111
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T &output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:418
typename base_type::storage_type storage_type
Struct used to allocate temporary memory that is required for thread communication during operation...
Definition: warp_scan.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:351
ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T, const unsigned int, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T >::type
Broadcasts value from one thread to all threads in logical warp.
Definition: warp_scan.hpp:650
Definition: warp_scan.hpp:46
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T &inclusive_output, T &exclusive_output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations across threads in a logical warp.
Definition: warp_scan.hpp:506
hipError_t exclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel exclusive scan primitive for device level.
Definition: device_scan.hpp:651
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T, T &, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:271
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:433
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T, T &, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
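Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp. Invalid warp size: this overload is selected when FunctionWarpSize exceeds the hardware wavefront size.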
Definition: warp_scan.hpp:613
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T, T &, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
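Performs inclusive and exclusive scan operations across threads in a logical warp. Invalid warp size: this overload is selected when FunctionWarpSize exceeds the hardware wavefront size.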
Definition: warp_scan.hpp:521
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T &output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:337
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T &output, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:257
Definition: warp_scan_shared_mem.hpp:41
hipError_t inclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel inclusive scan primitive for device level.
Definition: device_scan.hpp:539
#define ROCPRIM_PRINT_ERROR_ONCE(message)
Prints the supplied error message only once (using only one of the active threads).
Definition: functional.hpp:42
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:195
ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T input, const unsigned int src_lane, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), T >::type
Broadcasts value from one thread to all threads in logical warp.
Definition: warp_scan.hpp:638
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T &output, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:182
Definition: various.hpp:108