21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_    22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_    24 #include <type_traits>    26 #include "../../config.hpp"    27 #include "../../detail/various.hpp"    29 #include "../../functional.hpp"    30 #include "../../intrinsics.hpp"    32 #include "../../warp/warp_reduce.hpp"    34 BEGIN_ROCPRIM_NAMESPACE
    42 template<
class T, 
int n, 
typename = 
void>
    46     ROCPRIM_HOST_DEVICE T 
get(
int index) 
const    51     ROCPRIM_HOST_DEVICE 
void set(
int index, T value)
    63 #ifndef DOXYGEN_SHOULD_SKIP_THIS    64 template<
class T, 
int n>
    65 class fast_array<T, n, std::enable_if_t<(sizeof(T) > sizeof(int32_t))>>
    68     ROCPRIM_HOST_DEVICE T 
get(
int index) 
const    72         for(
int i = 0; i < words_no; i++)
    74             const size_t s = 
std::min(
sizeof(int32_t), 
sizeof(T) - i * 
sizeof(int32_t));
    76             std::memcpy(reinterpret_cast<char*>(&result) + i * 
sizeof(int32_t),
    80             __builtin_memcpy(reinterpret_cast<char*>(&result) + i * 
sizeof(int32_t),
    88     ROCPRIM_HOST_DEVICE 
void set(
int index, T value)
    91         for(
int i = 0; i < words_no; i++)
    93             const size_t s = 
std::min(
sizeof(int32_t), 
sizeof(T) - i * 
sizeof(int32_t));
    95             std::memcpy(data + index + i * n,
    96                         reinterpret_cast<const char*>(&value) + i * 
sizeof(int32_t),
    99             __builtin_memcpy(data + index + i * n,
   100                              reinterpret_cast<const char*>(&value) + i * 
sizeof(int32_t),
   107     static constexpr 
int words_no = rocprim::detail::ceiling_div(
sizeof(T), 
sizeof(int32_t));
   109     int32_t data[words_no * n];
   111 #endif // DOXYGEN_SHOULD_SKIP_THIS   114          unsigned int BlockSizeX,
   115          unsigned int BlockSizeY,
   116          unsigned int BlockSizeZ,
   117          bool         CommutativeOnly = 
false>
   120     static constexpr 
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
   124     static constexpr 
unsigned int warp_size_
   127     static constexpr 
unsigned int segment_len = ceiling_div(BlockSize, warp_size_);
   129     static constexpr 
bool block_multiple_warp_     = !(BlockSize % warp_size_);
   130     static constexpr 
bool block_smaller_than_warp_ = (BlockSize < warp_size_);
   131     using warp_reduce_prefix_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;
   146     template<
class BinaryFunction>
   147     ROCPRIM_DEVICE ROCPRIM_INLINE 
void   150         this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   161     template<
class BinaryFunction>
   162     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE 
void reduce(T input, T& output, BinaryFunction reduce_op)
   165         this->
reduce(input, output, storage, reduce_op);
   173     template<
unsigned int ItemsPerThread, 
class BinaryFunction>
   174     ROCPRIM_DEVICE ROCPRIM_INLINE 
void reduce(T (&input)[ItemsPerThread],
   177                                               BinaryFunction reduce_op)
   180         T thread_input = input[0];
   182         for(
unsigned int i = 1; i < ItemsPerThread; i++)
   184             thread_input = reduce_op(thread_input, input[i]);
   188         const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
   189         this->reduce_impl(flat_tid, thread_input, output, storage, reduce_op);
   196     template<
unsigned int ItemsPerThread, 
class BinaryFunction>
   197     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE 
void   198         reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op)
   201         this->
reduce(input, output, storage, reduce_op);
   210     template<
class BinaryFunction>
   211     ROCPRIM_DEVICE ROCPRIM_INLINE 
void reduce(T              input,
   213                                               unsigned int   valid_items,
   215                                               BinaryFunction reduce_op)
   217         this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   230     template<
class BinaryFunction>
   231     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE 
void   232         reduce(T input, T& output, 
unsigned int valid_items, BinaryFunction reduce_op)
   235         this->
reduce(input, output, valid_items, storage, reduce_op);
   239     template<
class BinaryFunction, 
bool FunctionCommutativeOnly = CommutativeOnly>
   240     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto reduce_impl(
const unsigned int flat_tid,
   244                                                    BinaryFunction     reduce_op) ->
   245         typename std::enable_if<(FunctionCommutativeOnly), void>::type
   247         storage_type_& storage_ = storage.get();
   248         if(flat_tid >= warp_size_)
   250             storage_.threads.set(flat_tid, input);
   254         if(flat_tid < warp_size_)
   256             unsigned int thread_index     = flat_tid;
   257             T            thread_reduction = input;
   259             for(
unsigned int i = 1; i < segment_len; i++)
   261                 thread_index += warp_size_;
   262                 if(block_multiple_warp_ || (thread_index < BlockSize))
   265                         = reduce_op(thread_reduction, storage_.threads.get(thread_index));
   275     template<
class BinaryFunction, 
bool FunctionCommutativeOnly = CommutativeOnly>
   276     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto reduce_impl(
const unsigned int flat_tid,
   280                                                    BinaryFunction     reduce_op) ->
   281         typename std::enable_if<(!FunctionCommutativeOnly), void>::type
   283         storage_type_& storage_ = storage.get();
   284         storage_.threads.set(flat_tid, input);
   287         constexpr 
unsigned int active_lanes = ceiling_div(BlockSize, segment_len);
   289         if(flat_tid < active_lanes)
   291             unsigned int thread_index     = segment_len * flat_tid;
   292             T            thread_reduction = storage_.threads.get(thread_index);
   294             for(
unsigned int i = 1; i < segment_len; i++)
   297                 if(block_multiple_warp_ || (thread_index < BlockSize))
   300                         = reduce_op(thread_reduction, storage_.threads.get(thread_index));
   310     template<
bool UseVal
id, 
class WarpReduce, 
class BinaryFunction>
   311     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto   312         warp_reduce(T input, T& output, 
const unsigned int valid_items, BinaryFunction reduce_op) ->
   313         typename std::enable_if<UseValid>::type
   315         WarpReduce().
reduce(input, output, valid_items, reduce_op);
   318     template<
bool UseVal
id, 
class WarpReduce, 
class BinaryFunction>
   319     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto   320         warp_reduce(T input, T& output, 
const unsigned int valid_items, BinaryFunction reduce_op) ->
   321         typename std::enable_if<!UseValid>::type
   324         WarpReduce().
reduce(input, output, reduce_op);
   327     template<
class BinaryFunction, 
bool FunctionCommutativeOnly = CommutativeOnly>
   328     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto reduce_impl(
const unsigned int flat_tid,
   331                                                    const unsigned int valid_items,
   333                                                    BinaryFunction     reduce_op) ->
   334         typename std::enable_if<(FunctionCommutativeOnly), void>::type
   336         storage_type_& storage_ = storage.get();
   337         if((flat_tid >= warp_size_) && (flat_tid < valid_items))
   339             storage_.threads.set(flat_tid, input);
   343         if(flat_tid < warp_size_)
   345             T thread_reduction = input;
   346             for(
unsigned int i = warp_size_ + flat_tid; i < valid_items; i += warp_size_)
   348                 thread_reduction = reduce_op(thread_reduction, storage_.threads.get(i));
   350             warp_reduce_prefix_type().reduce(thread_reduction, output, valid_items, reduce_op);
   354     template<
class BinaryFunction, 
bool FunctionCommutativeOnly = CommutativeOnly>
   355     ROCPRIM_DEVICE ROCPRIM_INLINE 
auto reduce_impl(
const unsigned int flat_tid,
   358                                                    const unsigned int valid_items,
   360                                                    BinaryFunction     reduce_op) ->
   361         typename std::enable_if<(!FunctionCommutativeOnly), void>::type
   363         storage_type_& storage_ = storage.get();
   364         if(flat_tid < valid_items)
   366             storage_.threads.set(flat_tid, input);
   370         unsigned int thread_index = segment_len * flat_tid;
   371         if(thread_index < valid_items)
   373             T thread_reduction = storage_.threads.get(thread_index);
   375             for(
unsigned int i = 1; i < segment_len; i++)
   378                 if(thread_index < valid_items)
   381                         = reduce_op(thread_reduction, storage_.threads.get(thread_index));
   385             warp_reduce_prefix_type().reduce(thread_reduction,
   387                                              (valid_items + segment_len - 1) / segment_len,
   394 END_ROCPRIM_NAMESPACE
   396 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_ Definition: block_reduce_raking_reduce.hpp:118
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, unsigned int valid_items, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:232
hipError_t reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduction primitive for device level. 
Definition: device_reduce.hpp:374
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments. 
Definition: functional.hpp:63
Deprecated: Configuration of device-level scan primitives. 
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T(&input)[ItemsPerThread], T &output, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:174
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronizes all threads in a block (tile).
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, unsigned int valid_items, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:211
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T(&input)[ItemsPerThread], T &output, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:198
The warp_reduce class is a warp-level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114
Definition: block_reduce_raking_reduce.hpp:43
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:162
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator. 
Definition: block_reduce_raking_reduce.hpp:148
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp. 
Definition: warp_reduce.hpp:181