21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_    22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_    24 #include <type_traits>    26 #include "../../config.hpp"    27 #include "../../detail/various.hpp"    29 #include "../../intrinsics.hpp"    30 #include "../../functional.hpp"    32 #include "../../warp/warp_reduce.hpp"    34 BEGIN_ROCPRIM_NAMESPACE
    41     unsigned int BlockSizeX,
    42     unsigned int BlockSizeY,
    43     unsigned int BlockSizeZ
    47     static constexpr 
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
    49     static constexpr 
unsigned int warp_size_ =
    52     static constexpr 
unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_;
    55     static constexpr 
bool block_size_is_warp_multiple_ = ((BlockSize % warp_size_) == 0);
    56     static constexpr 
bool warps_no_is_pow_of_two_ = detail::is_power_of_two(warps_no_);
    62     using warp_reduce_input_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;
    65     using warp_reduce_output_type = ::rocprim::detail::warp_reduce_crosslane<
    66         T, detail::next_power_of_two(warps_no_), 
false    71         T warp_partials[warps_no_];
    77     template<
class BinaryFunction>
    78     ROCPRIM_DEVICE ROCPRIM_INLINE
    82                 BinaryFunction reduce_op)
    85             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
    86             input, output, storage, reduce_op
    90     template<
class BinaryFunction>
    91     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
    94                 BinaryFunction reduce_op)
    97         this->
reduce(input, output, storage, reduce_op);
   100     template<
unsigned int ItemsPerThread, 
class BinaryFunction>
   101     ROCPRIM_DEVICE ROCPRIM_INLINE
   102     void reduce(T (&input)[ItemsPerThread],
   105                 BinaryFunction reduce_op)
   108         T thread_input = input[0];
   110         for(
unsigned int i = 1; i < ItemsPerThread; i++)
   112             thread_input = reduce_op(thread_input, input[i]);
   116         const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
   119             thread_input, output, 
   125     template<
unsigned int ItemsPerThread, 
class BinaryFunction>
   126     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   127     void reduce(T (&input)[ItemsPerThread],
   129                 BinaryFunction reduce_op)
   132         this->
reduce(input, output, storage, reduce_op);
   135     template<
class BinaryFunction>
   136     ROCPRIM_DEVICE ROCPRIM_INLINE
   139                 unsigned int valid_items,
   141                 BinaryFunction reduce_op)
   144             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   145             input, output, valid_items, storage, reduce_op
   149     template<
class BinaryFunction>
   150     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   153                 unsigned int valid_items,
   154                 BinaryFunction reduce_op)
   157         this->
reduce(input, output, valid_items, storage, reduce_op);
   161     template<
class BinaryFunction>
   162     ROCPRIM_DEVICE ROCPRIM_INLINE
   163     void reduce_impl(
const unsigned int flat_tid,
   167                      BinaryFunction reduce_op)
   171         const unsigned int warp_offset = 
warp_id * warp_size_;
   172         const unsigned int num_valid =
   173             (warp_offset < BlockSize) ? BlockSize - warp_offset : 0;
   174         storage_type_& storage_ = storage.get();
   178             input, output, num_valid, reduce_op
   184             storage_.warp_partials[
warp_id] = output;
   188         if(flat_tid < warps_no_)
   191             auto warp_partial = storage_.warp_partials[
lane_id];
   194                 warp_partial, output, warps_no_, reduce_op
   199     template<
bool UseVal
id, 
class WarpReduce, 
class BinaryFunction>
   200     ROCPRIM_DEVICE ROCPRIM_INLINE
   203                      const unsigned int valid_items,
   204                      BinaryFunction reduce_op)
   205         -> 
typename std::enable_if<UseValid>::type
   208             input, output, valid_items, reduce_op
   212     template<
bool UseVal
id, 
class WarpReduce, 
class BinaryFunction>
   213     ROCPRIM_DEVICE ROCPRIM_INLINE
   216                      const unsigned int valid_items,
   217                      BinaryFunction reduce_op)
   218         -> 
typename std::enable_if<!UseValid>::type
   222             input, output, reduce_op
   226     template<
class BinaryFunction>
   227     ROCPRIM_DEVICE ROCPRIM_INLINE
   228     void reduce_impl(
const unsigned int flat_tid,
   231                      const unsigned int valid_items,
   233                      BinaryFunction reduce_op)
   237         const unsigned int warp_offset = 
warp_id * warp_size_;
   238         const unsigned int num_valid =
   239             (warp_offset < valid_items) ? valid_items - warp_offset : 0;
   240         storage_type_& storage_ = storage.get();
   243         warp_reduce_input_type().reduce(
   244             input, output, num_valid, reduce_op
   250             storage_.warp_partials[
warp_id] = output;
   254         if(flat_tid < warps_no_)
   257             auto warp_partial = storage_.warp_partials[
lane_id];
   259             unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_;
   260             warp_reduce_output_type().reduce(
   261                 warp_partial, output, valid_warps_no, reduce_op
   269 END_ROCPRIM_NAMESPACE
   271 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_ Definition: block_reduce_warp_reduce.hpp:45
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target. 
Definition: thread.hpp:70
Definition: benchmark_block_reduce.cpp:63
Deprecated: Configuration of device-level scan primitives. 
Definition: block_histogram.hpp:62
const unsigned int warp_id
Returns the warp id within a block (tile). 
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronizes all threads in a block (tile). 
Definition: thread.hpp:216
The warp_reduce class is a warp level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns the thread identifier within a warp. 
Definition: thread.hpp:93
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp. 
Definition: warp_reduce.hpp:181