rocPRIM
lookback_scan_state.hpp
1 // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_
22 #define ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../functional.hpp"
27 #include "../../intrinsics.hpp"
28 #include "../../type_traits.hpp"
29 #include "../../types.hpp"
30 
31 #include "../../warp/detail/warp_reduce_crosslane.hpp"
32 #include "../../warp/detail/warp_scan_crosslane.hpp"
33 
34 #include "../../detail/binary_op_wrappers.hpp"
35 #include "../../detail/temp_storage.hpp"
36 #include "../../detail/various.hpp"
37 
38 extern "C"
39 {
40  void __builtin_amdgcn_s_sleep(int);
41 }
42 BEGIN_ROCPRIM_NAMESPACE
43 
44 // Single pass prefix scan was implemented based on:
45 // Merrill, D. and Garland, M. Single-pass Parallel Prefix Scan with Decoupled Look-back.
46 // Technical Report NVR2016-001, NVIDIA Research. Mar. 2016.
47 
48 namespace detail
49 {
50 
51 enum prefix_flag
52 {
53  // flag for padding, values should be discarded
54  PREFIX_INVALID = -1,
55  // initialized, not result in value
56  PREFIX_EMPTY = 0,
57  // partial prefix value (from single block)
58  PREFIX_PARTIAL = 1,
59  // final prefix value
60  PREFIX_COMPLETE = 2
61 };
62 
63 // lookback_scan_state object keeps track of prefixes status for
64 // a look-back prefix scan. Initially every prefix can be either
65 // invalid (padding values) or empty. One thread in a block should
66 // later set it to partial, and later to complete.
67 template<class T, bool UseSleep = false, bool IsSmall = (sizeof(T) <= 4)>
68 struct lookback_scan_state;
69 
70 // Packed flag and prefix value are loaded/stored in one atomic operation.
71 template<class T, bool UseSleep>
73 {
74 private:
75  using flag_type_ = char;
76 
77  // Type which is used in store/load operations of block prefix (flag and value).
78  // It is 32-bit or 64-bit int and can be loaded/stored using single atomic instruction.
79  using prefix_underlying_type =
80  typename std::conditional<
81  (sizeof(T) > 2),
82  unsigned long long,
83  unsigned int
84  >::type;
85 
86  // Helper struct
87  struct alignas(sizeof(prefix_underlying_type)) prefix_type
88  {
89  flag_type_ flag;
90  T value;
91  };
92 
93  static_assert(sizeof(prefix_underlying_type) == sizeof(prefix_type), "");
94 
95 public:
96  // Type used for flag/flag of block prefix
97  using flag_type = flag_type_;
98  using value_type = T;
99 
100  // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes
101  ROCPRIM_HOST static inline
102  lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks)
103  {
104  (void) number_of_blocks;
105  lookback_scan_state state;
106  state.prefixes = reinterpret_cast<prefix_underlying_type*>(temp_storage);
107  return state;
108  }
109 
110  ROCPRIM_HOST static inline
111  size_t get_storage_size(const unsigned int number_of_blocks)
112  {
113  return sizeof(prefix_underlying_type) * (::rocprim::host_warp_size() + number_of_blocks);
114  }
115 
116  ROCPRIM_HOST static inline detail::temp_storage::layout
117  get_temp_storage_layout(const unsigned int number_of_blocks)
118  {
119  return detail::temp_storage::layout{get_storage_size(number_of_blocks),
120  alignof(prefix_underlying_type)};
121  }
122 
123  ROCPRIM_DEVICE ROCPRIM_INLINE
124  void initialize_prefix(const unsigned int block_id,
125  const unsigned int number_of_blocks)
126  {
127  constexpr unsigned int padding = ::rocprim::device_warp_size();
128 
129  if(block_id < number_of_blocks)
130  {
131  prefix_type prefix;
132  prefix.flag = PREFIX_EMPTY;
133  prefix_underlying_type p;
134 #ifndef __HIP_CPU_RT__
135  __builtin_memcpy(&p, &prefix, sizeof(prefix_type));
136 #else
137  std::memcpy(&p, &prefix, sizeof(prefix_type));
138 #endif
139  prefixes[padding + block_id] = p;
140  }
141  if(block_id < padding)
142  {
143  prefix_type prefix;
144  prefix.flag = PREFIX_INVALID;
145  prefix_underlying_type p;
146 #ifndef __HIP_CPU_RT__
147  __builtin_memcpy(&p, &prefix, sizeof(prefix_type));
148 #else
149  std::memcpy(&p, &prefix, sizeof(prefix_type));
150 #endif
151  prefixes[block_id] = p;
152  }
153  }
154 
155  ROCPRIM_DEVICE ROCPRIM_INLINE
156  void set_partial(const unsigned int block_id, const T value)
157  {
158  this->set(block_id, PREFIX_PARTIAL, value);
159  }
160 
161  ROCPRIM_DEVICE ROCPRIM_INLINE
162  void set_complete(const unsigned int block_id, const T value)
163  {
164  this->set(block_id, PREFIX_COMPLETE, value);
165  }
166 
167  // block_id must be > 0
168  ROCPRIM_DEVICE ROCPRIM_INLINE
169  void get(const unsigned int block_id, flag_type& flag, T& value)
170  {
171  constexpr unsigned int padding = ::rocprim::device_warp_size();
172 
173  prefix_type prefix;
174 
175  const unsigned int SLEEP_MAX = 32;
176  unsigned int times_through = 1;
177 
178  prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
179 #ifndef __HIP_CPU_RT__
180  __builtin_memcpy(&prefix, &p, sizeof(prefix_type));
181 #else
182  std::memcpy(&prefix, &p, sizeof(prefix_type));
183 #endif
184  while(prefix.flag == PREFIX_EMPTY)
185  {
186  if (UseSleep)
187  {
188  for (unsigned int j = 0; j < times_through; j++)
189 #ifndef __HIP_CPU_RT__
190  __builtin_amdgcn_s_sleep(1);
191 #else
192  std::this_thread::sleep_for(std::chrono::microseconds{1});
193 #endif
194  if (times_through < SLEEP_MAX)
195  times_through++;
196  }
197  // atomic_add(..., 0) is used to load values atomically
198  prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
199 #ifndef __HIP_CPU_RT__
200  __builtin_memcpy(&prefix, &p, sizeof(prefix_type));
201 #else
202  std::memcpy(&prefix, &p, sizeof(prefix_type));
203 #endif
204  }
205 
206  // return
207  flag = prefix.flag;
208  value = prefix.value;
209  }
210 
211 private:
212  ROCPRIM_DEVICE ROCPRIM_INLINE
213  void set(const unsigned int block_id, const flag_type flag, const T value)
214  {
215  constexpr unsigned int padding = ::rocprim::device_warp_size();
216 
217  prefix_type prefix = { flag, value };
218  prefix_underlying_type p;
219 #ifndef __HIP_CPU_RT__
220  __builtin_memcpy(&p, &prefix, sizeof(prefix_type));
221 #else
222  std::memcpy(&p, &prefix, sizeof(prefix_type));
223 #endif
224  ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p);
225  }
226 
227  prefix_underlying_type * prefixes;
228 };
229 
230 // Flag, partial and final prefixes are stored in separate arrays.
231 // Consistency ensured by memory fences between flag and prefixes load/store operations.
232 template<class T, bool UseSleep>
233 struct lookback_scan_state<T, UseSleep, false>
234 {
235 
236 public:
237  using flag_type = unsigned int;
238  using value_type = T;
239 
240  // temp_storage must point to allocation of get_storage_size(number_of_blocks) bytes
241  ROCPRIM_HOST static inline
242  lookback_scan_state create(void* temp_storage, const unsigned int number_of_blocks)
243  {
244  const auto n = ::rocprim::host_warp_size() + number_of_blocks;
245  lookback_scan_state state;
246 
247  auto ptr = static_cast<char*>(temp_storage);
248 
249  state.prefixes_flags = reinterpret_cast<flag_type*>(ptr);
250  ptr += ::rocprim::detail::align_size(n * sizeof(flag_type));
251 
252  state.prefixes_partial_values = reinterpret_cast<T*>(ptr);
253  ptr += ::rocprim::detail::align_size(n * sizeof(T));
254 
255  state.prefixes_complete_values = reinterpret_cast<T*>(ptr);
256  return state;
257  }
258 
259  ROCPRIM_HOST static inline
260  size_t get_storage_size(const unsigned int number_of_blocks)
261  {
262  const auto n = ::rocprim::host_warp_size() + number_of_blocks;
263  size_t size = ::rocprim::detail::align_size(n * sizeof(flag_type));
264  size += 2 * ::rocprim::detail::align_size(n * sizeof(T));
265  return size;
266  }
267 
268  ROCPRIM_HOST static inline detail::temp_storage::layout
269  get_temp_storage_layout(const unsigned int number_of_blocks)
270  {
271  size_t alignment = std::max(alignof(flag_type), alignof(T));
272  return detail::temp_storage::layout{get_storage_size(number_of_blocks), alignment};
273  }
274 
275  ROCPRIM_DEVICE ROCPRIM_INLINE
276  void initialize_prefix(const unsigned int block_id,
277  const unsigned int number_of_blocks)
278  {
279  constexpr unsigned int padding = ::rocprim::device_warp_size();
280  if(block_id < number_of_blocks)
281  {
282  prefixes_flags[padding + block_id] = PREFIX_EMPTY;
283  }
284  if(block_id < padding)
285  {
286  prefixes_flags[block_id] = PREFIX_INVALID;
287  }
288  }
289 
290  ROCPRIM_DEVICE ROCPRIM_INLINE
291  void set_partial(const unsigned int block_id, const T value)
292  {
293  constexpr unsigned int padding = ::rocprim::device_warp_size();
294 
295  prefixes_partial_values[padding + block_id] = value;
296  ::rocprim::detail::memory_fence_device();
297  ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_PARTIAL);
298  }
299 
300  ROCPRIM_DEVICE ROCPRIM_INLINE
301  void set_complete(const unsigned int block_id, const T value)
302  {
303  constexpr unsigned int padding = ::rocprim::device_warp_size();
304 
305  prefixes_complete_values[padding + block_id] = value;
306  ::rocprim::detail::memory_fence_device();
307  ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_COMPLETE);
308  }
309 
310  // block_id must be > 0
311  ROCPRIM_DEVICE ROCPRIM_INLINE
312  void get(const unsigned int block_id, flag_type& flag, T& value)
313  {
314  constexpr unsigned int padding = ::rocprim::device_warp_size();
315 
316  const unsigned int SLEEP_MAX = 32;
317  unsigned int times_through = 1;
318 
319  // atomic_add(..., 0) is used to load values atomically
320  flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
321  ::rocprim::detail::memory_fence_device();
322  while(flag == PREFIX_EMPTY)
323  {
324  if (UseSleep)
325  {
326  for (unsigned int j = 0; j < times_through; j++)
327 #ifndef __HIP_CPU_RT__
328  __builtin_amdgcn_s_sleep(1);
329 #else
330  std::this_thread::sleep_for(std::chrono::microseconds{1});
331 #endif
332  if (times_through < SLEEP_MAX)
333  times_through++;
334  }
335 
336  flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
337  ::rocprim::detail::memory_fence_device();
338  }
339 
340  if(flag == PREFIX_PARTIAL)
341  value = prefixes_partial_values[padding + block_id];
342  else
343  value = prefixes_complete_values[padding + block_id];
344  }
345 
346 private:
347  flag_type * prefixes_flags;
348  // We need to separate arrays for partial and final prefixes, because
349  // value can be overwritten before flag is changed (flag and value are
350  // not stored in single instruction).
351  T * prefixes_partial_values;
352  T * prefixes_complete_values;
353 };
354 
355 template<class T, class BinaryFunction, class LookbackScanState>
357 {
358  using flag_type = typename LookbackScanState::flag_type;
359  static_assert(
360  std::is_same<T, typename LookbackScanState::value_type>::value,
361  "T must be LookbackScanState::value_type"
362  );
363 
364 public:
365  ROCPRIM_DEVICE ROCPRIM_INLINE
366  lookback_scan_prefix_op(unsigned int block_id,
367  BinaryFunction scan_op,
368  LookbackScanState &scan_state)
369  : block_id_(block_id),
370  scan_op_(scan_op),
371  scan_state_(scan_state)
372  {
373  }
374 
375  ROCPRIM_DEVICE ROCPRIM_INLINE
376  ~lookback_scan_prefix_op() = default;
377 
378  ROCPRIM_DEVICE ROCPRIM_INLINE
379  void reduce_partial_prefixes(unsigned int block_id,
380  flag_type& flag,
381  T& partial_prefix)
382  {
383  // Order of reduction must be reversed, because 0th thread has
384  // prefix from the (block_id_ - 1) block, 1st thread has prefix
385  // from (block_id_ - 2) block etc.
386  using headflag_scan_op_type = reverse_binary_op_wrapper<
387  BinaryFunction, T, T
388  >;
389  using warp_reduce_prefix_type = warp_reduce_crosslane<
390  T, ::rocprim::device_warp_size(), false
391  >;
392 
393  T block_prefix;
394  scan_state_.get(block_id, flag, block_prefix);
395 
396  auto headflag_scan_op = headflag_scan_op_type(scan_op_);
397  warp_reduce_prefix_type()
398  .tail_segmented_reduce(
399  block_prefix,
400  partial_prefix,
401  (flag == PREFIX_COMPLETE),
402  headflag_scan_op
403  );
404  }
405 
406  ROCPRIM_DEVICE ROCPRIM_INLINE
407  T get_prefix()
408  {
409  flag_type flag;
410  T partial_prefix;
411  unsigned int previous_block_id = block_id_ - ::rocprim::lane_id() - 1;
412 
413  // reduce last warp_size() number of prefixes to
414  // get the complete prefix for this block.
415  reduce_partial_prefixes(previous_block_id, flag, partial_prefix);
416  T prefix = partial_prefix;
417 
418  // while we don't load a complete prefix, reduce partial prefixes
419  while(::rocprim::detail::warp_all(flag != PREFIX_COMPLETE))
420  {
421  previous_block_id -= ::rocprim::device_warp_size();
422  reduce_partial_prefixes(previous_block_id, flag, partial_prefix);
423  prefix = scan_op_(partial_prefix, prefix);
424  }
425  return prefix;
426  }
427 
428  ROCPRIM_DEVICE ROCPRIM_INLINE
429  T operator()(T reduction)
430  {
431  // Set partial prefix for next block
432  if(::rocprim::lane_id() == 0)
433  {
434  scan_state_.set_partial(block_id_, reduction);
435  }
436 
437  // Get prefix
438  auto prefix = get_prefix();
439 
440  // Set complete prefix for next block
441  if(::rocprim::lane_id() == 0)
442  {
443  scan_state_.set_complete(block_id_, scan_op_(prefix, reduction));
444  }
445  return prefix;
446  }
447 
448 protected:
449  unsigned int block_id_;
450  BinaryFunction scan_op_;
451  LookbackScanState& scan_state_;
452 };
453 
454 inline hipError_t is_sleep_scan_state_used(bool& use_sleep)
455 {
456  hipDeviceProp_t prop;
457  int deviceId;
458  if(const hipError_t error = hipGetDevice(&deviceId))
459  {
460  return error;
461  }
462  else if(const hipError_t error = hipGetDeviceProperties(&prop, deviceId))
463  {
464  return error;
465  }
466 #if HIP_VERSION >= 307
467  const int asicRevision = prop.asicRevision;
468 #else
469  const int asicRevision = 0;
470 #endif
471  use_sleep = std::string(prop.gcnArchName).find("908") != std::string::npos && asicRevision < 2;
472  return hipSuccess;
473 }
474 
475 template<typename T>
477 {
478 private:
479  struct storage_type_
480  {
481  T block_reduction;
482  T prefix;
483  };
484 
485 public:
487 
488  template<typename PrefixOp>
489  static ROCPRIM_DEVICE auto create(PrefixOp& prefix_op, storage_type& storage)
490  {
491  return [&](T reduction) mutable
492  {
493  auto prefix = prefix_op(reduction);
494  if(::rocprim::lane_id() == 0)
495  {
496  storage.get().block_reduction = std::move(reduction);
497  storage.get().prefix = prefix;
498  }
499  return prefix;
500  };
501  }
502 
503  static ROCPRIM_DEVICE T get_reduction(const storage_type& storage)
504  {
505  return storage.get().block_reduction;
506  }
507 
508  static ROCPRIM_DEVICE T get_prefix(const storage_type& storage)
509  {
510  return storage.get().prefix;
511  }
512 };
513 
514 template<class T, class LookbackScanState, class BinaryOp = ::rocprim::plus<T>>
516  : public lookback_scan_prefix_op<T, BinaryOp, LookbackScanState>
517 {
518 private:
521 
522  ROCPRIM_DEVICE ROCPRIM_INLINE base_type& base()
523  {
524  return *this;
525  }
526 
527 public:
528  using storage_type = typename factory::storage_type;
529 
530  ROCPRIM_DEVICE ROCPRIM_INLINE offset_lookback_scan_prefix_op(unsigned int block_id,
531  LookbackScanState& state,
532  storage_type& storage,
533  BinaryOp binary_op = BinaryOp())
534  : base_type(block_id, BinaryOp(std::move(binary_op)), state), storage(storage)
535  {}
536 
537  ROCPRIM_DEVICE ROCPRIM_INLINE T operator()(T reduction)
538  {
539  return factory::create(base(), storage)(reduction);
540  }
541 
542  ROCPRIM_DEVICE ROCPRIM_INLINE T get_reduction() const
543  {
544  return factory::get_reduction(storage);
545  }
546 
547  ROCPRIM_DEVICE ROCPRIM_INLINE T get_prefix() const
548  {
549  return factory::get_prefix(storage);
550  }
551 
552  // rocThrust uses this implementation detail of rocPRIM, required for backwards compatibility
553  ROCPRIM_DEVICE ROCPRIM_INLINE T get_exclusive_prefix() const
554  {
555  return get_prefix();
556  }
557 
558 private:
559  storage_type& storage;
560 };
561 
562 } // end of detail namespace
563 
564 END_ROCPRIM_NAMESPACE
565 
566 #endif // ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
Definition: lookback_scan_state.hpp:68
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: binary_op_wrappers.hpp:43
This value-structure describes the required layout of some piece of temporary memory, which includes the required size and the required alignment.
Definition: temp_storage.hpp:47
Definition: lookback_scan_state.hpp:356
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: lookback_scan_state.hpp:476
ROCPRIM_HOST unsigned int host_warp_size()
Returns a number of threads in a hardware warp for the actual device.
Definition: thread.hpp:52
Definition: lookback_scan_state.hpp:72
Definition: lookback_scan_state.hpp:515
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93