21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_ 22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_ 24 #include <type_traits> 26 #include "../../config.hpp" 27 #include "../../detail/various.hpp" 29 #include "../../functional.hpp" 30 #include "../../intrinsics.hpp" 32 #include "../../warp/warp_reduce.hpp" 34 BEGIN_ROCPRIM_NAMESPACE
42 template<
class T,
int n,
typename =
void>
46 ROCPRIM_HOST_DEVICE T
get(
int index)
const 51 ROCPRIM_HOST_DEVICE
void set(
int index, T value)
63 #ifndef DOXYGEN_SHOULD_SKIP_THIS 64 template<
class T,
int n>
65 class fast_array<T, n, std::enable_if_t<(sizeof(T) > sizeof(int32_t))>>
68 ROCPRIM_HOST_DEVICE T
get(
int index)
const 72 for(
int i = 0; i < words_no; i++)
74 const size_t s =
std::min(
sizeof(int32_t),
sizeof(T) - i *
sizeof(int32_t));
76 std::memcpy(reinterpret_cast<char*>(&result) + i *
sizeof(int32_t),
80 __builtin_memcpy(reinterpret_cast<char*>(&result) + i *
sizeof(int32_t),
88 ROCPRIM_HOST_DEVICE
void set(
int index, T value)
91 for(
int i = 0; i < words_no; i++)
93 const size_t s =
std::min(
sizeof(int32_t),
sizeof(T) - i *
sizeof(int32_t));
95 std::memcpy(data + index + i * n,
96 reinterpret_cast<const char*>(&value) + i *
sizeof(int32_t),
99 __builtin_memcpy(data + index + i * n,
100 reinterpret_cast<const char*>(&value) + i *
sizeof(int32_t),
107 static constexpr
int words_no = rocprim::detail::ceiling_div(
sizeof(T),
sizeof(int32_t));
109 int32_t data[words_no * n];
111 #endif // DOXYGEN_SHOULD_SKIP_THIS 114 unsigned int BlockSizeX,
115 unsigned int BlockSizeY,
116 unsigned int BlockSizeZ,
117 bool CommutativeOnly =
false>
120 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
124 static constexpr
unsigned int warp_size_
127 static constexpr
unsigned int segment_len = ceiling_div(BlockSize, warp_size_);
129 static constexpr
bool block_multiple_warp_ = !(BlockSize % warp_size_);
130 static constexpr
bool block_smaller_than_warp_ = (BlockSize < warp_size_);
131 using warp_reduce_prefix_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;
146 template<
class BinaryFunction>
147 ROCPRIM_DEVICE ROCPRIM_INLINE
void 150 this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
161 template<
class BinaryFunction>
162 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void reduce(T input, T& output, BinaryFunction reduce_op)
165 this->
reduce(input, output, storage, reduce_op);
173 template<
unsigned int ItemsPerThread,
class BinaryFunction>
174 ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T (&input)[ItemsPerThread],
177 BinaryFunction reduce_op)
180 T thread_input = input[0];
182 for(
unsigned int i = 1; i < ItemsPerThread; i++)
184 thread_input = reduce_op(thread_input, input[i]);
188 const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
189 this->reduce_impl(flat_tid, thread_input, output, storage, reduce_op);
196 template<
unsigned int ItemsPerThread,
class BinaryFunction>
197 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void 198 reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op)
201 this->
reduce(input, output, storage, reduce_op);
210 template<
class BinaryFunction>
211 ROCPRIM_DEVICE ROCPRIM_INLINE
void reduce(T input,
213 unsigned int valid_items,
215 BinaryFunction reduce_op)
217 this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
230 template<
class BinaryFunction>
231 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void 232 reduce(T input, T& output,
unsigned int valid_items, BinaryFunction reduce_op)
235 this->
reduce(input, output, valid_items, storage, reduce_op);
239 template<
class BinaryFunction,
bool FunctionCommutativeOnly = CommutativeOnly>
240 ROCPRIM_DEVICE ROCPRIM_INLINE
auto reduce_impl(
const unsigned int flat_tid,
244 BinaryFunction reduce_op) ->
245 typename std::enable_if<(FunctionCommutativeOnly), void>::type
247 storage_type_& storage_ = storage.get();
248 if(flat_tid >= warp_size_)
250 storage_.threads.set(flat_tid, input);
254 if(flat_tid < warp_size_)
256 unsigned int thread_index = flat_tid;
257 T thread_reduction = input;
259 for(
unsigned int i = 1; i < segment_len; i++)
261 thread_index += warp_size_;
262 if(block_multiple_warp_ || (thread_index < BlockSize))
265 = reduce_op(thread_reduction, storage_.threads.get(thread_index));
275 template<
class BinaryFunction,
bool FunctionCommutativeOnly = CommutativeOnly>
276 ROCPRIM_DEVICE ROCPRIM_INLINE
auto reduce_impl(
const unsigned int flat_tid,
280 BinaryFunction reduce_op) ->
281 typename std::enable_if<(!FunctionCommutativeOnly), void>::type
283 storage_type_& storage_ = storage.get();
284 storage_.threads.set(flat_tid, input);
287 constexpr
unsigned int active_lanes = ceiling_div(BlockSize, segment_len);
289 if(flat_tid < active_lanes)
291 unsigned int thread_index = segment_len * flat_tid;
292 T thread_reduction = storage_.threads.get(thread_index);
294 for(
unsigned int i = 1; i < segment_len; i++)
297 if(block_multiple_warp_ || (thread_index < BlockSize))
300 = reduce_op(thread_reduction, storage_.threads.get(thread_index));
310 template<
bool UseValid,
class WarpReduce,
class BinaryFunction>
311 ROCPRIM_DEVICE ROCPRIM_INLINE
auto 312 warp_reduce(T input, T& output,
const unsigned int valid_items, BinaryFunction reduce_op) ->
313 typename std::enable_if<UseValid>::type
315 WarpReduce().
reduce(input, output, valid_items, reduce_op);
318 template<
bool UseValid,
class WarpReduce,
class BinaryFunction>
319 ROCPRIM_DEVICE ROCPRIM_INLINE
auto 320 warp_reduce(T input, T& output,
const unsigned int valid_items, BinaryFunction reduce_op) ->
321 typename std::enable_if<!UseValid>::type
324 WarpReduce().
reduce(input, output, reduce_op);
327 template<
class BinaryFunction,
bool FunctionCommutativeOnly = CommutativeOnly>
328 ROCPRIM_DEVICE ROCPRIM_INLINE
auto reduce_impl(
const unsigned int flat_tid,
331 const unsigned int valid_items,
333 BinaryFunction reduce_op) ->
334 typename std::enable_if<(FunctionCommutativeOnly), void>::type
336 storage_type_& storage_ = storage.get();
337 if((flat_tid >= warp_size_) && (flat_tid < valid_items))
339 storage_.threads.set(flat_tid, input);
343 if(flat_tid < warp_size_)
345 T thread_reduction = input;
346 for(
unsigned int i = warp_size_ + flat_tid; i < valid_items; i += warp_size_)
348 thread_reduction = reduce_op(thread_reduction, storage_.threads.get(i));
350 warp_reduce_prefix_type().reduce(thread_reduction, output, valid_items, reduce_op);
354 template<
class BinaryFunction,
bool FunctionCommutativeOnly = CommutativeOnly>
355 ROCPRIM_DEVICE ROCPRIM_INLINE
auto reduce_impl(
const unsigned int flat_tid,
358 const unsigned int valid_items,
360 BinaryFunction reduce_op) ->
361 typename std::enable_if<(!FunctionCommutativeOnly), void>::type
363 storage_type_& storage_ = storage.get();
364 if(flat_tid < valid_items)
366 storage_.threads.set(flat_tid, input);
370 unsigned int thread_index = segment_len * flat_tid;
371 if(thread_index < valid_items)
373 T thread_reduction = storage_.threads.get(thread_index);
375 for(
unsigned int i = 1; i < segment_len; i++)
378 if(thread_index < valid_items)
381 = reduce_op(thread_reduction, storage_.threads.get(thread_index));
385 warp_reduce_prefix_type().reduce(thread_reduction,
387 (valid_items + segment_len - 1) / segment_len,
394 END_ROCPRIM_NAMESPACE
396 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_
Definition: block_reduce_raking_reduce.hpp:118
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, unsigned int valid_items, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:232
hipError_t reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduction primitive for device level.
Definition: device_reduce.hpp:374
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T(&input)[ItemsPerThread], T &output, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:174
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronizes all threads in a block (tile).
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, unsigned int valid_items, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:211
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T(&input)[ItemsPerThread], T &output, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:198
The warp_reduce class is a warp level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114
Definition: block_reduce_raking_reduce.hpp:43
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:162
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op)
Computes a thread block-wide reduction using specified reduction operator.
Definition: block_reduce_raking_reduce.hpp:148
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:181