rocPRIM
block_reduce_warp_reduce.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_
22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../intrinsics.hpp"
30 #include "../../functional.hpp"
31 
32 #include "../../warp/warp_reduce.hpp"
33 
34 BEGIN_ROCPRIM_NAMESPACE
35 
36 namespace detail
37 {
38 
39 template<
40  class T,
41  unsigned int BlockSizeX,
42  unsigned int BlockSizeY,
43  unsigned int BlockSizeZ
44 >
46 {
47  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
48  // Select warp size
49  static constexpr unsigned int warp_size_ =
50  detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
51  // Number of warps in block
52  static constexpr unsigned int warps_no_ = (BlockSize + warp_size_ - 1) / warp_size_;
53 
54  // Check if we have to pass number of valid items into warp reduction primitive
55  static constexpr bool block_size_is_warp_multiple_ = ((BlockSize % warp_size_) == 0);
56  static constexpr bool warps_no_is_pow_of_two_ = detail::is_power_of_two(warps_no_);
57 
58  // typedef of warp_reduce primitive that will be used to perform warp-level
59  // reduce operation on input values.
60  // warp_reduce_crosslane is an implementation of warp_reduce that does not need storage,
61  // but requires logical warp size to be a power of two.
62  using warp_reduce_input_type = ::rocprim::detail::warp_reduce_crosslane<T, warp_size_, false>;
63  // typedef of warp_reduce primitive that will be used to perform reduction
64  // of results of warp-level reduction.
65  using warp_reduce_output_type = ::rocprim::detail::warp_reduce_crosslane<
66  T, detail::next_power_of_two(warps_no_), false
67  >;
68 
69  struct storage_type_
70  {
71  T warp_partials[warps_no_];
72  };
73 
74 public:
76 
77  template<class BinaryFunction>
78  ROCPRIM_DEVICE ROCPRIM_INLINE
79  void reduce(T input,
80  T& output,
81  storage_type& storage,
82  BinaryFunction reduce_op)
83  {
84  this->reduce_impl(
85  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
86  input, output, storage, reduce_op
87  );
88  }
89 
90  template<class BinaryFunction>
91  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
92  void reduce(T input,
93  T& output,
94  BinaryFunction reduce_op)
95  {
96  ROCPRIM_SHARED_MEMORY storage_type storage;
97  this->reduce(input, output, storage, reduce_op);
98  }
99 
100  template<unsigned int ItemsPerThread, class BinaryFunction>
101  ROCPRIM_DEVICE ROCPRIM_INLINE
102  void reduce(T (&input)[ItemsPerThread],
103  T& output,
104  storage_type& storage,
105  BinaryFunction reduce_op)
106  {
107  // Reduce thread items
108  T thread_input = input[0];
109  ROCPRIM_UNROLL
110  for(unsigned int i = 1; i < ItemsPerThread; i++)
111  {
112  thread_input = reduce_op(thread_input, input[i]);
113  }
114 
115  // Reduction of reduced values to get partials
116  const auto flat_tid = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
117  this->reduce_impl(
118  flat_tid,
119  thread_input, output, // input, output
120  storage,
121  reduce_op
122  );
123  }
124 
125  template<unsigned int ItemsPerThread, class BinaryFunction>
126  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
127  void reduce(T (&input)[ItemsPerThread],
128  T& output,
129  BinaryFunction reduce_op)
130  {
131  ROCPRIM_SHARED_MEMORY storage_type storage;
132  this->reduce(input, output, storage, reduce_op);
133  }
134 
135  template<class BinaryFunction>
136  ROCPRIM_DEVICE ROCPRIM_INLINE
137  void reduce(T input,
138  T& output,
139  unsigned int valid_items,
140  storage_type& storage,
141  BinaryFunction reduce_op)
142  {
143  this->reduce_impl(
144  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
145  input, output, valid_items, storage, reduce_op
146  );
147  }
148 
149  template<class BinaryFunction>
150  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
151  void reduce(T input,
152  T& output,
153  unsigned int valid_items,
154  BinaryFunction reduce_op)
155  {
156  ROCPRIM_SHARED_MEMORY storage_type storage;
157  this->reduce(input, output, valid_items, storage, reduce_op);
158  }
159 
160 private:
161  template<class BinaryFunction>
162  ROCPRIM_DEVICE ROCPRIM_INLINE
163  void reduce_impl(const unsigned int flat_tid,
164  T input,
165  T& output,
166  storage_type& storage,
167  BinaryFunction reduce_op)
168  {
169  const auto warp_id = ::rocprim::warp_id(flat_tid);
170  const auto lane_id = ::rocprim::lane_id();
171  const unsigned int warp_offset = warp_id * warp_size_;
172  const unsigned int num_valid =
173  (warp_offset < BlockSize) ? BlockSize - warp_offset : 0;
174  storage_type_& storage_ = storage.get();
175 
176  // Perform warp reduce
178  input, output, num_valid, reduce_op
179  );
180 
181  // i-th warp will have its partial stored in storage_.warp_partials[i-1]
182  if(lane_id == 0)
183  {
184  storage_.warp_partials[warp_id] = output;
185  }
187 
188  if(flat_tid < warps_no_)
189  {
190  // Use warp partial to calculate the final reduce results for every thread
191  auto warp_partial = storage_.warp_partials[lane_id];
192 
194  warp_partial, output, warps_no_, reduce_op
195  );
196  }
197  }
198 
199  template<bool UseValid, class WarpReduce, class BinaryFunction>
200  ROCPRIM_DEVICE ROCPRIM_INLINE
201  auto warp_reduce(T input,
202  T& output,
203  const unsigned int valid_items,
204  BinaryFunction reduce_op)
205  -> typename std::enable_if<UseValid>::type
206  {
207  WarpReduce().reduce(
208  input, output, valid_items, reduce_op
209  );
210  }
211 
212  template<bool UseValid, class WarpReduce, class BinaryFunction>
213  ROCPRIM_DEVICE ROCPRIM_INLINE
214  auto warp_reduce(T input,
215  T& output,
216  const unsigned int valid_items,
217  BinaryFunction reduce_op)
218  -> typename std::enable_if<!UseValid>::type
219  {
220  (void) valid_items;
221  WarpReduce().reduce(
222  input, output, reduce_op
223  );
224  }
225 
226  template<class BinaryFunction>
227  ROCPRIM_DEVICE ROCPRIM_INLINE
228  void reduce_impl(const unsigned int flat_tid,
229  T input,
230  T& output,
231  const unsigned int valid_items,
232  storage_type& storage,
233  BinaryFunction reduce_op)
234  {
235  const auto warp_id = ::rocprim::warp_id(flat_tid);
236  const auto lane_id = ::rocprim::lane_id();
237  const unsigned int warp_offset = warp_id * warp_size_;
238  const unsigned int num_valid =
239  (warp_offset < valid_items) ? valid_items - warp_offset : 0;
240  storage_type_& storage_ = storage.get();
241 
242  // Perform warp reduce
243  warp_reduce_input_type().reduce(
244  input, output, num_valid, reduce_op
245  );
246 
247  // i-th warp will have its partial stored in storage_.warp_partials[i-1]
248  if(lane_id == 0)
249  {
250  storage_.warp_partials[warp_id] = output;
251  }
253 
254  if(flat_tid < warps_no_)
255  {
256  // Use warp partial to calculate the final reduce results for every thread
257  auto warp_partial = storage_.warp_partials[lane_id];
258 
259  unsigned int valid_warps_no = (valid_items + warp_size_ - 1) / warp_size_;
260  warp_reduce_output_type().reduce(
261  warp_partial, output, valid_warps_no, reduce_op
262  );
263  }
264  }
265 };
266 
267 } // end namespace detail
268 
269 END_ROCPRIM_NAMESPACE
270 
271 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_REDUCE_WARP_REDUCE_HPP_
Definition: block_reduce_warp_reduce.hpp:45
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: benchmark_block_reduce.cpp:63
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
const unsigned int warp_id
Returns warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
The warp_reduce class is a warp level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:181