21 #ifndef ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_ 22 #define ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_ 24 #include <type_traits> 26 #include "../../functional.hpp" 27 #include "../../intrinsics.hpp" 28 #include "../../type_traits.hpp" 29 #include "../../types.hpp" 31 #include "../../warp/detail/warp_reduce_crosslane.hpp" 32 #include "../../warp/detail/warp_scan_crosslane.hpp" 34 #include "../../detail/binary_op_wrappers.hpp" 35 #include "../../detail/temp_storage.hpp" 36 #include "../../detail/various.hpp" 40 void __builtin_amdgcn_s_sleep(
int);
42 BEGIN_ROCPRIM_NAMESPACE
67 template<
class T,
bool UseSleep = false,
bool IsSmall = (sizeof(T) <= 4)>
68 struct lookback_scan_state;
71 template<
class T,
bool UseSleep>
75 using flag_type_ = char;
79 using prefix_underlying_type =
80 typename std::conditional<
87 struct alignas(sizeof(prefix_underlying_type)) prefix_type
93 static_assert(
sizeof(prefix_underlying_type) ==
sizeof(prefix_type),
"");
97 using flag_type = flag_type_;
101 ROCPRIM_HOST
static inline 104 (void) number_of_blocks;
106 state.prefixes =
reinterpret_cast<prefix_underlying_type*
>(temp_storage);
110 ROCPRIM_HOST
static inline 111 size_t get_storage_size(
const unsigned int number_of_blocks)
117 get_temp_storage_layout(
const unsigned int number_of_blocks)
120 alignof(prefix_underlying_type)};
// Initializes the look-back slot(s) associated with `block_id`.
// Two independent conditions are checked: ids below `number_of_blocks` get a
// prefix flagged PREFIX_EMPTY, ids below `padding` get a prefix flagged
// PREFIX_INVALID. In both cases the prefix_type object is bit-copied into a
// prefix_underlying_type with memcpy (the __builtin_ form when __HIP_CPU_RT__
// is not defined; std::memcpy appears to be the alternative branch).
// NOTE(review): the statement that stores `p` into the prefixes array is not
// visible in this listing - confirm against the full source.
123 ROCPRIM_DEVICE ROCPRIM_INLINE
124 void initialize_prefix(
const unsigned int block_id,
125 const unsigned int number_of_blocks)
// Regular slot: mark as not yet written.
129 if(block_id < number_of_blocks)
132 prefix.flag = PREFIX_EMPTY;
133 prefix_underlying_type p;
134 #ifndef __HIP_CPU_RT__ 135 __builtin_memcpy(&p, &prefix,
sizeof(prefix_type));
137 std::memcpy(&p, &prefix,
sizeof(prefix_type));
// Padding slot: mark as invalid.
141 if(block_id < padding)
144 prefix.flag = PREFIX_INVALID;
145 prefix_underlying_type p;
146 #ifndef __HIP_CPU_RT__ 147 __builtin_memcpy(&p, &prefix,
sizeof(prefix_type));
149 std::memcpy(&p, &prefix,
sizeof(prefix_type));
// Stores `value` for block `block_id` tagged with PREFIX_PARTIAL.
// Thin wrapper that forwards to set().
155 ROCPRIM_DEVICE ROCPRIM_INLINE
156 void set_partial(
const unsigned int block_id,
const T value)
158 this->
set(
block_id, PREFIX_PARTIAL, value);
// Stores `value` for block `block_id` tagged with PREFIX_COMPLETE.
// Thin wrapper that forwards to set().
161 ROCPRIM_DEVICE ROCPRIM_INLINE
162 void set_complete(
const unsigned int block_id,
const T value)
164 this->
set(
block_id, PREFIX_COMPLETE, value);
// Reads the prefix stored for block `block_id`, waiting until it is no longer
// flagged PREFIX_EMPTY.
//
// The slot prefixes[padding + block_id] is read with atomic_add(..., 0) and
// bit-copied into a prefix_type with memcpy. While the flag is PREFIX_EMPTY
// the slot is read again; before each re-read a loop of `times_through`
// iterations sleeps (__builtin_amdgcn_s_sleep(1) when __HIP_CPU_RT__ is not
// defined, a 1 microsecond std::this_thread::sleep_for otherwise).
//
// Outputs: `value` receives prefix.value.
// NOTE(review): the assignment to `flag`, the statement executed when
// `times_through < SLEEP_MAX`, and any UseSleep guard around the sleep loop
// are not visible in this listing - confirm against the full source.
168 ROCPRIM_DEVICE ROCPRIM_INLINE
169 void get(
const unsigned int block_id, flag_type& flag, T& value)
// Upper bound that `times_through` is compared against below.
175 const unsigned int SLEEP_MAX = 32;
176 unsigned int times_through = 1;
// Adding 0 atomically returns the current slot content without changing it.
178 prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
179 #ifndef __HIP_CPU_RT__ 180 __builtin_memcpy(&prefix, &p,
sizeof(prefix_type));
182 std::memcpy(&prefix, &p,
sizeof(prefix_type));
184 while(prefix.flag == PREFIX_EMPTY)
188 for (
unsigned int j = 0; j < times_through; j++)
189 #ifndef __HIP_CPU_RT__
190 __builtin_amdgcn_s_sleep(1);
192 std::this_thread::sleep_for(std::chrono::microseconds{1});
194 if (times_through < SLEEP_MAX)
// Re-read the slot after waiting.
198 prefix_underlying_type p = ::rocprim::detail::atomic_add(&prefixes[padding + block_id], 0);
199 #ifndef __HIP_CPU_RT__ 200 __builtin_memcpy(&prefix, &p,
sizeof(prefix_type));
202 std::memcpy(&prefix, &p,
sizeof(prefix_type));
208 value = prefix.value;
// Stores a (flag, value) pair for block `block_id`.
// The pair is built as a prefix_type, bit-copied into a
// prefix_underlying_type with memcpy, and written to
// prefixes[padding + block_id] with one atomic_exch, so the flag and the
// value are written by the same exchange.
212 ROCPRIM_DEVICE ROCPRIM_INLINE
213 void set(
const unsigned int block_id,
const flag_type flag,
const T value)
217 prefix_type prefix = { flag, value };
218 prefix_underlying_type p;
219 #ifndef __HIP_CPU_RT__ 220 __builtin_memcpy(&p, &prefix,
sizeof(prefix_type));
222 std::memcpy(&p, &prefix,
sizeof(prefix_type));
// The previous slot content returned by the exchange is discarded.
224 ::rocprim::detail::atomic_exch(&prefixes[padding + block_id], p);
227 prefix_underlying_type * prefixes;
232 template<
class T,
bool UseSleep>
237 using flag_type =
unsigned int;
238 using value_type = T;
241 ROCPRIM_HOST
static inline 247 auto ptr =
static_cast<char*
>(temp_storage);
249 state.prefixes_flags =
reinterpret_cast<flag_type*
>(ptr);
250 ptr += ::rocprim::detail::align_size(n *
sizeof(flag_type));
252 state.prefixes_partial_values =
reinterpret_cast<T*
>(ptr);
253 ptr += ::rocprim::detail::align_size(n *
sizeof(T));
255 state.prefixes_complete_values =
reinterpret_cast<T*
>(ptr);
// Computes the temporary-storage size for `number_of_blocks` blocks as the
// sum of three aligned arrays of `n` elements: one of flag_type and two of T.
// NOTE(review): the definition of `n` and the return statement are not
// visible in this listing - presumably n includes the padding slots and
// `size` is returned; confirm against the full source.
259 ROCPRIM_HOST
static inline 260 size_t get_storage_size(
const unsigned int number_of_blocks)
263 size_t size = ::rocprim::detail::align_size(n *
sizeof(flag_type));
264 size += 2 * ::rocprim::detail::align_size(n *
sizeof(T));
269 get_temp_storage_layout(
const unsigned int number_of_blocks)
271 size_t alignment =
std::max(
alignof(flag_type),
alignof(T));
// Initializes the flag slot(s) associated with `block_id`.
// Two independent conditions are checked: for ids below `number_of_blocks`
// the flag at prefixes_flags[padding + block_id] is set to PREFIX_EMPTY; for
// ids below `padding` the flag at prefixes_flags[block_id] is set to
// PREFIX_INVALID. The value arrays are not touched by the visible code.
275 ROCPRIM_DEVICE ROCPRIM_INLINE
276 void initialize_prefix(
const unsigned int block_id,
277 const unsigned int number_of_blocks)
280 if(block_id < number_of_blocks)
282 prefixes_flags[padding +
block_id] = PREFIX_EMPTY;
284 if(block_id < padding)
286 prefixes_flags[
block_id] = PREFIX_INVALID;
// Stores `value` as the partial value of block `block_id` and then flags the
// block as PREFIX_PARTIAL.
// Order of operations: plain store of the value, memory_fence_device(), then
// atomic_exch on the flag - presumably so that a reader observing the flag
// also observes the value (TODO confirm the fence semantics).
290 ROCPRIM_DEVICE ROCPRIM_INLINE
291 void set_partial(
const unsigned int block_id,
const T value)
295 prefixes_partial_values[padding +
block_id] = value;
296 ::rocprim::detail::memory_fence_device();
297 ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_PARTIAL);
// Stores `value` as the complete value of block `block_id` and then flags the
// block as PREFIX_COMPLETE.
// Order of operations: plain store of the value, memory_fence_device(), then
// atomic_exch on the flag - presumably so that a reader observing the flag
// also observes the value (TODO confirm the fence semantics).
300 ROCPRIM_DEVICE ROCPRIM_INLINE
301 void set_complete(
const unsigned int block_id,
const T value)
305 prefixes_complete_values[padding +
block_id] = value;
306 ::rocprim::detail::memory_fence_device();
307 ::rocprim::detail::atomic_exch(&prefixes_flags[padding + block_id], PREFIX_COMPLETE);
// Reads the flag and the matching value of block `block_id`, waiting until
// the flag is no longer PREFIX_EMPTY.
//
// The flag at prefixes_flags[padding + block_id] is read with
// atomic_add(..., 0), each read being followed by memory_fence_device().
// While the flag is PREFIX_EMPTY it is read again; before each re-read a
// loop of `times_through` iterations sleeps (__builtin_amdgcn_s_sleep(1) when
// __HIP_CPU_RT__ is not defined, a 1 microsecond sleep_for otherwise).
//
// Outputs: `flag` receives the last flag read; `value` is loaded from
// prefixes_partial_values when the flag is PREFIX_PARTIAL and from
// prefixes_complete_values otherwise (the `else` itself is not visible in
// this listing - presumed).
// NOTE(review): the statement executed when `times_through < SLEEP_MAX` and
// any UseSleep guard around the sleep loop are not visible here.
311 ROCPRIM_DEVICE ROCPRIM_INLINE
312 void get(
const unsigned int block_id, flag_type& flag, T& value)
// Upper bound that `times_through` is compared against below.
316 const unsigned int SLEEP_MAX = 32;
317 unsigned int times_through = 1;
// Adding 0 atomically returns the current flag without changing it.
320 flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
321 ::rocprim::detail::memory_fence_device();
322 while(flag == PREFIX_EMPTY)
326 for (
unsigned int j = 0; j < times_through; j++)
327 #ifndef __HIP_CPU_RT__
328 __builtin_amdgcn_s_sleep(1);
330 std::this_thread::sleep_for(std::chrono::microseconds{1});
332 if (times_through < SLEEP_MAX)
// Re-read the flag after waiting.
336 flag = ::rocprim::detail::atomic_add(&prefixes_flags[padding + block_id], 0);
337 ::rocprim::detail::memory_fence_device();
// Pick the value array that corresponds to the observed flag.
340 if(flag == PREFIX_PARTIAL)
341 value = prefixes_partial_values[padding +
block_id];
343 value = prefixes_complete_values[padding +
block_id];
347 flag_type * prefixes_flags;
351 T * prefixes_partial_values;
352 T * prefixes_complete_values;
355 template<
class T,
class BinaryFunction,
class LookbackScanState>
358 using flag_type =
typename LookbackScanState::flag_type;
360 std::is_same<T, typename LookbackScanState::value_type>::value,
361 "T must be LookbackScanState::value_type" 365 ROCPRIM_DEVICE ROCPRIM_INLINE
367 BinaryFunction scan_op,
368 LookbackScanState &scan_state)
369 : block_id_(block_id),
371 scan_state_(scan_state)
375 ROCPRIM_DEVICE ROCPRIM_INLINE
376 ~lookback_scan_prefix_op() =
default;
378 ROCPRIM_DEVICE ROCPRIM_INLINE
379 void reduce_partial_prefixes(
unsigned int block_id,
389 using warp_reduce_prefix_type = warp_reduce_crosslane<
394 scan_state_.get(block_id, flag, block_prefix);
396 auto headflag_scan_op = headflag_scan_op_type(scan_op_);
397 warp_reduce_prefix_type()
398 .tail_segmented_reduce(
401 (flag == PREFIX_COMPLETE),
// Look-back loop that accumulates the prefix contributed by preceding blocks.
// NOTE(review): the signature is not visible in this listing; this is
// presumably the get_prefix() member that operator() calls.
//
// The first reduce_partial_prefixes() result seeds `prefix`. While
// warp_all(flag != PREFIX_COMPLETE) holds, another reduce_partial_prefixes()
// call is made and its result is combined on the left of the running prefix:
// prefix = scan_op_(partial_prefix, prefix).
// How `previous_block_id` is initialized and advanced between iterations is
// not visible here.
406 ROCPRIM_DEVICE ROCPRIM_INLINE
415 reduce_partial_prefixes(previous_block_id, flag, partial_prefix);
416 T prefix = partial_prefix;
419 while(::rocprim::detail::warp_all(flag != PREFIX_COMPLETE))
422 reduce_partial_prefixes(previous_block_id, flag, partial_prefix);
423 prefix = scan_op_(partial_prefix, prefix);
// Call operator taking this block's `reduction`.
// Visible sequence: the reduction is published with set_partial(), the prefix
// is obtained from get_prefix(), and scan_op_(prefix, reduction) is published
// with set_complete().
// NOTE(review): the lines between these calls (e.g. any lane guards) and the
// return statement are not visible in this listing.
428 ROCPRIM_DEVICE ROCPRIM_INLINE
429 T operator()(T reduction)
434 scan_state_.set_partial(block_id_, reduction);
438 auto prefix = get_prefix();
443 scan_state_.set_complete(block_id_, scan_op_(prefix, reduction));
449 unsigned int block_id_;
450 BinaryFunction scan_op_;
451 LookbackScanState& scan_state_;
// Decides whether the sleeping variant of the scan state should be used on
// the current HIP device and writes the decision to `use_sleep`.
//
// The current device id is queried with hipGetDevice() and its properties
// with hipGetDeviceProperties(). `use_sleep` becomes true when
// prop.gcnArchName contains the substring "908" and asicRevision < 2.
// asicRevision is taken from prop.asicRevision when HIP_VERSION >= 307; the
// constant 0 appears to be the alternative branch.
// NOTE(review): the declaration of `deviceId` and the return statements
// (presumably the HIP error on failure) are not visible in this listing.
454 inline hipError_t is_sleep_scan_state_used(
bool& use_sleep)
456 hipDeviceProp_t prop;
458 if(
const hipError_t error = hipGetDevice(&deviceId))
462 else if(
const hipError_t error = hipGetDeviceProperties(&prop, deviceId))
466 #if HIP_VERSION >= 307 467 const int asicRevision = prop.asicRevision;
469 const int asicRevision = 0;
471 use_sleep = std::string(prop.gcnArchName).find(
"908") != std::string::npos && asicRevision < 2;
// Wraps `prefix_op` in a callable that also records its input and output.
// The returned lambda captures `prefix_op` and `storage` by reference, calls
// prefix_op(reduction), and stores the reduction in
// storage.get().block_reduction and the result in storage.get().prefix.
// Because of the by-reference capture, the lambda must not outlive either
// argument.
// NOTE(review): the lambda's return statement is not visible in this listing
// (presumably it returns `prefix`).
488 template<
typename PrefixOp>
489 static ROCPRIM_DEVICE
auto create(PrefixOp& prefix_op,
storage_type& storage)
491 return [&](T reduction)
mutable 493 auto prefix = prefix_op(reduction);
496 storage.get().block_reduction = std::move(reduction);
497 storage.get().prefix = prefix;
// Returns a copy of the block_reduction member held in `storage`.
503 static ROCPRIM_DEVICE T get_reduction(
const storage_type& storage)
505 return storage.get().block_reduction;
// Returns a copy of the prefix member held in `storage`.
508 static ROCPRIM_DEVICE T get_prefix(
const storage_type& storage)
510 return storage.get().prefix;
514 template<
class T,
class LookbackScanState,
class BinaryOp = ::rocprim::plus<T>>
522 ROCPRIM_DEVICE ROCPRIM_INLINE
base_type& base()
531 LookbackScanState& state,
532 storage_type& storage,
533 BinaryOp binary_op = BinaryOp())
534 :
base_type(block_id, BinaryOp(std::move(binary_op)), state), storage(storage)
// Runs the base prefix operation on `reduction` through the callable built
// by factory::create(base(), storage) and returns that callable's result.
537 ROCPRIM_DEVICE ROCPRIM_INLINE T operator()(T reduction)
539 return factory::create(base(), storage)(reduction);
542 ROCPRIM_DEVICE ROCPRIM_INLINE T get_reduction()
const 544 return factory::get_reduction(storage);
547 ROCPRIM_DEVICE ROCPRIM_INLINE T get_prefix()
const 549 return factory::get_prefix(storage);
553 ROCPRIM_DEVICE ROCPRIM_INLINE T get_exclusive_prefix()
const 559 storage_type& storage;
564 END_ROCPRIM_NAMESPACE
566 #endif // ROCPRIM_DEVICE_DETAIL_LOOKBACK_SCAN_STATE_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
Definition: lookback_scan_state.hpp:68
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: binary_op_wrappers.hpp:43
This value-structure describes the required layout of some piece of temporary memory, which includes the required size and the required alignment.
Definition: temp_storage.hpp:47
Definition: lookback_scan_state.hpp:356
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: lookback_scan_state.hpp:476
ROCPRIM_HOST unsigned int host_warp_size()
Returns the number of threads in a hardware warp for the actual device.
Definition: thread.hpp:52
Definition: lookback_scan_state.hpp:72
Definition: lookback_scan_state.hpp:515
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93