21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_ 22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_ 24 #include <type_traits> 26 #include "../../config.hpp" 27 #include "../../detail/various.hpp" 29 #include "../../intrinsics.hpp" 30 #include "../../functional.hpp" 32 #include "../../warp/warp_reduce.hpp" 34 BEGIN_ROCPRIM_NAMESPACE
41 unsigned int BlockSizeX,
42 unsigned int BlockSizeY,
43 unsigned int BlockSizeZ
47 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
49 static constexpr
unsigned int warp_size_ =
52 static constexpr
unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_;
55 static constexpr
bool block_size_is_warp_multiple_ = ((BlockSize % warp_size_) == 0);
56 static constexpr
bool warps_no_is_pow_of_two_ = detail::is_power_of_two(warps_no_);
62 using warp_reduce_input_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;
65 using warp_reduce_output_type = ::rocprim::detail::warp_reduce_crosslane<
66 T, detail::next_power_of_two(warps_no_),
false 71 T warp_partials[warps_no_];
77 template<
class BinaryFunction>
78 ROCPRIM_DEVICE ROCPRIM_INLINE
82 BinaryFunction reduce_op)
85 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
86 input, output, storage, reduce_op
90 template<
class BinaryFunction>
91 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
94 BinaryFunction reduce_op)
97 this->
reduce(input, output, storage, reduce_op);
100 template<
unsigned int ItemsPerThread,
class BinaryFunction>
101 ROCPRIM_DEVICE ROCPRIM_INLINE
102 void reduce(T (&input)[ItemsPerThread],
105 BinaryFunction reduce_op)
108 T thread_input = input[0];
110 for(
unsigned int i = 1; i < ItemsPerThread; i++)
112 thread_input = reduce_op(thread_input, input[i]);
116 const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
119 thread_input, output,
125 template<
unsigned int ItemsPerThread,
class BinaryFunction>
126 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
127 void reduce(T (&input)[ItemsPerThread],
129 BinaryFunction reduce_op)
132 this->
reduce(input, output, storage, reduce_op);
135 template<
class BinaryFunction>
136 ROCPRIM_DEVICE ROCPRIM_INLINE
139 unsigned int valid_items,
141 BinaryFunction reduce_op)
144 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
145 input, output, valid_items, storage, reduce_op
149 template<
class BinaryFunction>
150 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
153 unsigned int valid_items,
154 BinaryFunction reduce_op)
157 this->
reduce(input, output, valid_items, storage, reduce_op);
161 template<
class BinaryFunction>
162 ROCPRIM_DEVICE ROCPRIM_INLINE
163 void reduce_impl(
const unsigned int flat_tid,
167 BinaryFunction reduce_op)
171 const unsigned int warp_offset =
warp_id * warp_size_;
172 const unsigned int num_valid =
173 (warp_offset < BlockSize) ? BlockSize - warp_offset : 0;
174 storage_type_& storage_ = storage.get();
178 input, output, num_valid, reduce_op
184 storage_.warp_partials[
warp_id] = output;
188 if(flat_tid < warps_no_)
191 auto warp_partial = storage_.warp_partials[
lane_id];
194 warp_partial, output, warps_no_, reduce_op
199 template<
bool UseVal
id,
class WarpReduce,
class BinaryFunction>
200 ROCPRIM_DEVICE ROCPRIM_INLINE
203 const unsigned int valid_items,
204 BinaryFunction reduce_op)
205 ->
typename std::enable_if<UseValid>::type
208 input, output, valid_items, reduce_op
212 template<
bool UseVal
id,
class WarpReduce,
class BinaryFunction>
213 ROCPRIM_DEVICE ROCPRIM_INLINE
216 const unsigned int valid_items,
217 BinaryFunction reduce_op)
218 ->
typename std::enable_if<!UseValid>::type
222 input, output, reduce_op
226 template<
class BinaryFunction>
227 ROCPRIM_DEVICE ROCPRIM_INLINE
228 void reduce_impl(
const unsigned int flat_tid,
231 const unsigned int valid_items,
233 BinaryFunction reduce_op)
237 const unsigned int warp_offset =
warp_id * warp_size_;
238 const unsigned int num_valid =
239 (warp_offset < valid_items) ? valid_items - warp_offset : 0;
240 storage_type_& storage_ = storage.get();
243 warp_reduce_input_type().reduce(
244 input, output, num_valid, reduce_op
250 storage_.warp_partials[
warp_id] = output;
254 if(flat_tid < warps_no_)
257 auto warp_partial = storage_.warp_partials[
lane_id];
259 unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_;
260 warp_reduce_output_type().reduce(
261 warp_partial, output, valid_warps_no, reduce_op
269 END_ROCPRIM_NAMESPACE
271 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_ Definition: block_reduce_warp_reduce.hpp:45
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: benchmark_block_reduce.cpp:63
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
const unsigned int warp_id
Returns warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronizes all threads in a block (tile).
Definition: thread.hpp:216
The warp_reduce class is a warp level parallel primitive which provides methods for performing reduction operations on items partitioned across threads in a warp.
Definition: warp_reduce.hpp:114
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:181