rocPRIM: block_reduce_raking_reduce.hpp
// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_
#define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_

#include <type_traits>

#include "../../config.hpp"
#include "../../detail/various.hpp"

#include "../../functional.hpp"
#include "../../intrinsics.hpp"

#include "../../warp/warp_reduce.hpp"

BEGIN_ROCPRIM_NAMESPACE

namespace detail
{

// Class for fast storage and loading of arrays of large objects in local (shared)
// memory, for sequential access from consecutive threads.
// For small types it behaves like a plain array.
template<class T, int n, typename = void>
class fast_array
{
public:
    ROCPRIM_HOST_DEVICE T get(int index) const
    {
        return data[index];
    }

    ROCPRIM_HOST_DEVICE void set(int index, T value)
    {
        data[index] = value;
    }

private:
    T data[n];
};

// For large types, bank conflicts are reduced to a minimum by slicing values into
// int32_t words and storing each slice contiguously.
// A get/set interface is used instead of operator[], which would otherwise require proxy objects.
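// Layout sketch (illustrative, with assumed sizes): for sizeof(T) == 8 and n == 64,
// words_no == 2 and the backing int32_t array holds the low word of all 64 values in
// data[0..63] followed by the high word of all 64 values in data[64..127]. Consecutive
// threads reading consecutive indices therefore touch consecutive 32-bit banks for each
// word instead of striding by sizeof(T).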
#ifndef DOXYGEN_SHOULD_SKIP_THIS
template<class T, int n>
class fast_array<T, n, std::enable_if_t<(sizeof(T) > sizeof(int32_t))>>
{
public:
    ROCPRIM_HOST_DEVICE T get(int index) const
    {
        T result;
        ROCPRIM_UNROLL
        for(int i = 0; i < words_no; i++)
        {
            const size_t s = std::min(sizeof(int32_t), sizeof(T) - i * sizeof(int32_t));
#ifdef __HIP_CPU_RT__
            std::memcpy(reinterpret_cast<char*>(&result) + i * sizeof(int32_t),
                        data + index + i * n,
                        s);
#else
            __builtin_memcpy(reinterpret_cast<char*>(&result) + i * sizeof(int32_t),
                             data + index + i * n,
                             s);
#endif
        }
        return result;
    }

    ROCPRIM_HOST_DEVICE void set(int index, T value)
    {
        ROCPRIM_UNROLL
        for(int i = 0; i < words_no; i++)
        {
            const size_t s = std::min(sizeof(int32_t), sizeof(T) - i * sizeof(int32_t));
#ifdef __HIP_CPU_RT__
            std::memcpy(data + index + i * n,
                        reinterpret_cast<const char*>(&value) + i * sizeof(int32_t),
                        s);
#else
            __builtin_memcpy(data + index + i * n,
                             reinterpret_cast<const char*>(&value) + i * sizeof(int32_t),
                             s);
#endif
        }
    }

private:
    static constexpr int words_no = rocprim::detail::ceiling_div(sizeof(T), sizeof(int32_t));

    int32_t data[words_no * n];
};
#endif // DOXYGEN_SHOULD_SKIP_THIS

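// Illustrative usage (a sketch, not part of the original header): this detail class backs
// the public rocprim::block_reduce API when block_reduce_algorithm::raking_reduce is
// selected, e.g. inside a kernel:
//
//     using block_reduce_t
//         = rocprim::block_reduce<float, 256, rocprim::block_reduce_algorithm::raking_reduce>;
//     ROCPRIM_SHARED_MEMORY block_reduce_t::storage_type storage;
//     block_reduce_t().reduce(thread_value, block_result, storage, rocprim::plus<float>());
//
// The names thread_value and block_result above are placeholders for kernel-local values.
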
template<class T,
         unsigned int BlockSizeX,
         unsigned int BlockSizeY,
         unsigned int BlockSizeZ,
         bool CommutativeOnly = false>
class block_reduce_raking_reduce
{
    static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;

    // Warp reduce: warp_reduce_crosslane does not require shared memory (storage), but
    // the logical warp size must be a power of two.
    static constexpr unsigned int warp_size_
        = detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());

    static constexpr unsigned int segment_len = ceiling_div(BlockSize, warp_size_);
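    // segment_len is the number of block-wide partials that each raking lane accumulates
    // sequentially before the final single-warp reduction.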

    static constexpr bool block_multiple_warp_ = !(BlockSize % warp_size_);
    static constexpr bool block_smaller_than_warp_ = (BlockSize < warp_size_);
    using warp_reduce_prefix_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;

    struct storage_type_
    {
        fast_array<T, BlockSize> threads;
    };

public:
    using storage_type = detail::raw_storage<storage_type_>;

    // Computes a thread block-wide reduction using the specified reduction operator.
    template<class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_INLINE void
        reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op)
    {
        this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
                          input,
                          output,
                          storage,
                          reduce_op);
    }

    // Same as above, but with the shared memory storage allocated internally.
    template<class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T& output, BinaryFunction reduce_op)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->reduce(input, output, storage, reduce_op);
    }

    // Computes a thread block-wide reduction over multiple items per thread.
    template<unsigned int ItemsPerThread, class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T (&input)[ItemsPerThread],
                                              T& output,
                                              storage_type& storage,
                                              BinaryFunction reduce_op)
    {
        // Reduce thread items
        T thread_input = input[0];
        ROCPRIM_UNROLL
        for(unsigned int i = 1; i < ItemsPerThread; i++)
        {
            thread_input = reduce_op(thread_input, input[i]);
        }

        // Reduction of reduced values to get partials
        const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
        this->reduce_impl(flat_tid, thread_input, output, storage, reduce_op);
    }

    // Same as above, but with the shared memory storage allocated internally.
    template<unsigned int ItemsPerThread, class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
        reduce(T (&input)[ItemsPerThread], T& output, BinaryFunction reduce_op)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->reduce(input, output, storage, reduce_op);
    }

    // Computes a thread block-wide reduction over the first valid_items inputs.
    template<class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input,
                                              T& output,
                                              unsigned int valid_items,
                                              storage_type& storage,
                                              BinaryFunction reduce_op)
    {
        this->reduce_impl(::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
                          input,
                          output,
                          valid_items,
                          storage,
                          reduce_op);
    }

    // Same as above, but with the shared memory storage allocated internally.
    template<class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void
        reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op)
    {
        ROCPRIM_SHARED_MEMORY storage_type storage;
        this->reduce(input, output, valid_items, storage, reduce_op);
    }

private:
    template<class BinaryFunction, bool FunctionCommutativeOnly = CommutativeOnly>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce_impl(const unsigned int flat_tid,
                                                   T input,
                                                   T& output,
                                                   storage_type& storage,
                                                   BinaryFunction reduce_op) ->
        typename std::enable_if<(FunctionCommutativeOnly), void>::type
    {
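        // Raking strategy for commutative operators: every thread outside the first logical
        // warp stores its partial, the first warp_size_ threads each accumulate the values
        // strided warp_size_ apart, and a single warp reduction produces the block result.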
        storage_type_& storage_ = storage.get();
        if(flat_tid >= warp_size_)
        {
            storage_.threads.set(flat_tid, input);
        }
        ::rocprim::syncthreads();

        if(flat_tid < warp_size_)
        {
            unsigned int thread_index = flat_tid;
            T thread_reduction = input;
            ROCPRIM_UNROLL
            for(unsigned int i = 1; i < segment_len; i++)
            {
                thread_index += warp_size_;
                if(block_multiple_warp_ || (thread_index < BlockSize))
                {
                    thread_reduction
                        = reduce_op(thread_reduction, storage_.threads.get(thread_index));
                }
            }
            warp_reduce<block_smaller_than_warp_, warp_reduce_prefix_type>(thread_reduction,
                                                                           output,
                                                                           BlockSize,
                                                                           reduce_op);
        }
    }

    template<class BinaryFunction, bool FunctionCommutativeOnly = CommutativeOnly>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce_impl(const unsigned int flat_tid,
                                                   T input,
                                                   T& output,
                                                   storage_type& storage,
                                                   BinaryFunction reduce_op) ->
        typename std::enable_if<(!FunctionCommutativeOnly), void>::type
    {
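        // Raking strategy for non-commutative operators: every thread stores its partial and
        // each raking lane accumulates a contiguous segment of segment_len values, which keeps
        // the operands in thread order before the final warp reduction over the lane partials.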
        storage_type_& storage_ = storage.get();
        storage_.threads.set(flat_tid, input);
        ::rocprim::syncthreads();

        constexpr unsigned int active_lanes = ceiling_div(BlockSize, segment_len);

        if(flat_tid < active_lanes)
        {
            unsigned int thread_index = segment_len * flat_tid;
            T thread_reduction = storage_.threads.get(thread_index);
            ROCPRIM_UNROLL
            for(unsigned int i = 1; i < segment_len; i++)
            {
                ++thread_index;
                if(block_multiple_warp_ || (thread_index < BlockSize))
                {
                    thread_reduction
                        = reduce_op(thread_reduction, storage_.threads.get(thread_index));
                }
            }
            warp_reduce<(active_lanes < warp_size_), warp_reduce_prefix_type>(thread_reduction,
                                                                              output,
                                                                              active_lanes,
                                                                              reduce_op);
        }
    }

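    // Compile-time dispatch between the two warp_reduce helpers below: when UseValid is true
    // the valid item count is forwarded to the warp reduction, otherwise the full logical
    // warp overload is used and valid_items is ignored.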
    template<bool UseValid, class WarpReduce, class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto
        warp_reduce(T input, T& output, const unsigned int valid_items, BinaryFunction reduce_op) ->
        typename std::enable_if<UseValid>::type
    {
        WarpReduce().reduce(input, output, valid_items, reduce_op);
    }

    template<bool UseValid, class WarpReduce, class BinaryFunction>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto
        warp_reduce(T input, T& output, const unsigned int valid_items, BinaryFunction reduce_op) ->
        typename std::enable_if<!UseValid>::type
    {
        (void)valid_items;
        WarpReduce().reduce(input, output, reduce_op);
    }

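    // The overloads below implement the same raking pattern for a runtime valid_items count:
    // stores and raking loads are guarded by valid_items, and the final warp reduction is
    // given the runtime number of participating lanes.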
    template<class BinaryFunction, bool FunctionCommutativeOnly = CommutativeOnly>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce_impl(const unsigned int flat_tid,
                                                   T input,
                                                   T& output,
                                                   const unsigned int valid_items,
                                                   storage_type& storage,
                                                   BinaryFunction reduce_op) ->
        typename std::enable_if<(FunctionCommutativeOnly), void>::type
    {
        storage_type_& storage_ = storage.get();
        if((flat_tid >= warp_size_) && (flat_tid < valid_items))
        {
            storage_.threads.set(flat_tid, input);
        }
        ::rocprim::syncthreads();

        if(flat_tid < warp_size_)
        {
            T thread_reduction = input;
            for(unsigned int i = warp_size_ + flat_tid; i < valid_items; i += warp_size_)
            {
                thread_reduction = reduce_op(thread_reduction, storage_.threads.get(i));
            }
            warp_reduce_prefix_type().reduce(thread_reduction, output, valid_items, reduce_op);
        }
    }

    template<class BinaryFunction, bool FunctionCommutativeOnly = CommutativeOnly>
    ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce_impl(const unsigned int flat_tid,
                                                   T input,
                                                   T& output,
                                                   const unsigned int valid_items,
                                                   storage_type& storage,
                                                   BinaryFunction reduce_op) ->
        typename std::enable_if<(!FunctionCommutativeOnly), void>::type
    {
        storage_type_& storage_ = storage.get();
        if(flat_tid < valid_items)
        {
            storage_.threads.set(flat_tid, input);
        }
        ::rocprim::syncthreads();

        unsigned int thread_index = segment_len * flat_tid;
        if(thread_index < valid_items)
        {
            T thread_reduction = storage_.threads.get(thread_index);
            ROCPRIM_UNROLL
            for(unsigned int i = 1; i < segment_len; i++)
            {
                ++thread_index;
                if(thread_index < valid_items)
                {
                    thread_reduction
                        = reduce_op(thread_reduction, storage_.threads.get(thread_index));
                }
            }
            // ceiling_div is not used here because valid_items is not constexpr and this
            // form compiles to faster code.
            warp_reduce_prefix_type().reduce(thread_reduction,
                                             output,
                                             (valid_items + segment_len - 1) / segment_len,
                                             reduce_op);
        }
    }
};
} // end namespace detail

END_ROCPRIM_NAMESPACE

#endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_RAKING_REDUCE_HPP_