21 #ifndef ROCPRIM_WARP_WARP_EXCHANGE_HPP_ 22 #define ROCPRIM_WARP_WARP_EXCHANGE_HPP_ 24 #include "../config.hpp" 25 #include "../detail/various.hpp" 27 #include "../intrinsics.hpp" 28 #include "../intrinsics/warp_shuffle.hpp" 29 #include "../functional.hpp" 30 #include "../types.hpp" 35 BEGIN_ROCPRIM_NAMESPACE
// Excerpt from the warp_exchange class definition: one parameter of its parameter list,
// the compile-time checks on WarpSize, and the layout of the temporary storage.
// ItemsPerThread is used as the extent of every per-thread input/output array in this class.
77 unsigned int ItemsPerThread,
// Compile-time check: the logical warp size has to be a power of two.
82 static_assert(::rocprim::detail::is_power_of_two(WarpSize),
83 "Logical warp size must be a power of two.");
// Message of a second static_assert; its condition is not part of this excerpt.
85 "Logical warp size cannot be larger than physical warp size.");
// Staging buffer holding one element of T per item of the logical warp
// (WarpSize * ItemsPerThread elements in total).
90 T buffer[WarpSize * ItemsPerThread];
// blocked_to_striped (excerpt): moves items from a blocked arrangement to a striped
// arrangement by writing them to `storage` with one index pattern and reading them
// back with another. The full signature is not part of this excerpt.
103 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 144 ROCPRIM_DEVICE ROCPRIM_INLINE
146 U (&output)[ItemsPerThread],
// Logical lane id of the calling thread, as returned by detail::logical_lane_id<WarpSize>().
149 const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
150 storage_type_& storage_ = storage.get();
// Blocked write: this lane's items go to consecutive slots starting at flat_id * ItemsPerThread.
153 for(
unsigned int i = 0; i < ItemsPerThread; i++)
155 storage_.buffer[flat_id * ItemsPerThread + i] = input[i];
// NOTE(review): any synchronization between the write and the read phase is not visible
// in this excerpt — confirm against the full file.
// Striped read: item i of this lane is taken from slot i * WarpSize + flat_id.
160 for(
unsigned int i = 0; i < ItemsPerThread; i++)
162 output[i] = storage_.buffer[i * WarpSize + flat_id];
// blocked_to_striped_shuffle (excerpt): blocked-to-striped transpose that works on a
// per-thread work_array instead of a storage buffer. The statement that produces `value`
// is not part of this excerpt.
196 ROCPRIM_DEVICE ROCPRIM_INLINE
198 U (&output)[ItemsPerThread])
// This variant is only available when ItemsPerThread divides WarpSize evenly.
200 static_assert(WarpSize % ItemsPerThread == 0,
201 "ItemsPerThread must be a divisor of WarpSize to use blocked_to_striped_shuffle");
// Logical lane id of the calling thread, as returned by detail::logical_lane_id<WarpSize>().
202 const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
// Results are collected here first and copied to `output` at the end.
203 U work_array[ItemsPerThread];
// For every destination slot, all source slots are visited; only one of them is kept.
206 for(
unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
209 for(
unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
// NOTE(review): presumably the source-lane argument of a warp shuffle call whose
// remaining lines are missing from this excerpt — confirm against the full file.
213 flat_id / ItemsPerThread + dst_idx * (WarpSize / ItemsPerThread)
// Keep the value only for the source slot that matches this lane's position modulo ItemsPerThread.
215 if(src_idx == flat_id % ItemsPerThread)
217 work_array[dst_idx] = value;
// Copy the collected items to the caller's output array.
223 for(
unsigned int i = 0; i < ItemsPerThread; i++)
225 output[i] = work_array[i];
// striped_to_blocked (excerpt): moves items from a striped arrangement to a blocked
// arrangement by writing them to `storage` with the striped index pattern and reading
// them back with the blocked one. The full signature is not part of this excerpt.
264 ROCPRIM_DEVICE ROCPRIM_INLINE
266 U (&output)[ItemsPerThread],
// Logical lane id of the calling thread, as returned by detail::logical_lane_id<WarpSize>().
269 const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
270 storage_type_& storage_ = storage.get();
// Striped write: item i of this lane goes to slot i * WarpSize + flat_id.
273 for(
unsigned int i = 0; i < ItemsPerThread; i++)
275 storage_.buffer[i * WarpSize + flat_id] = input[i];
// NOTE(review): any synchronization between the write and the read phase is not visible
// in this excerpt — confirm against the full file.
// Blocked read: this lane takes the consecutive slots starting at flat_id * ItemsPerThread.
280 for(
unsigned int i = 0; i < ItemsPerThread; i++)
282 output[i] = storage_.buffer[flat_id * ItemsPerThread + i];
// striped_to_blocked_shuffle (excerpt): striped-to-blocked transpose that works on a
// per-thread work_array instead of a storage buffer. The statement that produces `value`
// is not part of this excerpt.
316 ROCPRIM_DEVICE ROCPRIM_INLINE
318 U (&output)[ItemsPerThread])
// This variant is only available when ItemsPerThread divides WarpSize evenly.
320 static_assert(WarpSize % ItemsPerThread == 0,
321 "ItemsPerThread must be a divisor of WarpSize to use striped_to_blocked_shuffle");
// Logical lane id of the calling thread, as returned by detail::logical_lane_id<WarpSize>().
322 const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
// Results are collected here first and copied to `output` at the end.
323 U work_array[ItemsPerThread];
// For every destination slot, all source slots are visited; only one of them is kept.
326 for(
unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
329 for(
unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
// NOTE(review): presumably the source-lane argument of a warp shuffle call whose
// remaining lines are missing from this excerpt — confirm against the full file.
333 (ItemsPerThread * flat_id + dst_idx) % WarpSize
// Keep the value only for the source slot selected by flat_id / (WarpSize / ItemsPerThread).
335 if(flat_id / (WarpSize / ItemsPerThread) == src_idx)
337 work_array[dst_idx] = value;
// Copy the collected items to the caller's output array.
343 for(
unsigned int i = 0; i < ItemsPerThread; i++)
345 output[i] = work_array[i];
// scatter_to_striped (excerpt): writes every input item to the storage slot named by its
// rank, then reads the items back in a striped pattern into `output`.
// U is the output element type and OffsetT the element type of `ranks`.
388 template<
class U,
class OffsetT>
389 ROCPRIM_DEVICE ROCPRIM_INLINE
391 const T (&input)[ItemsPerThread],
392 U (&output)[ItemsPerThread],
393 const OffsetT (&ranks)[ItemsPerThread],
// Logical lane id of the calling thread, as returned by detail::logical_lane_id<WarpSize>().
396 const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
397 storage_type_& storage_ = storage.get();
// Scatter: item i is stored at index ranks[i] of the buffer.
// NOTE(review): no range check on ranks[i] is visible here; presumably every rank must be
// a valid index into the WarpSize * ItemsPerThread element buffer — confirm with callers.
400 for (
unsigned int i = 0; i < ItemsPerThread; i++)
402 storage_.buffer[ranks[i]] = input[i];
// NOTE(review): any synchronization between the write and the read phase is not visible
// in this excerpt — confirm against the full file.
// Striped read: item i of this lane is taken from slot i * WarpSize + flat_id.
407 for (
unsigned int i = 0; i < ItemsPerThread; i++)
409 unsigned int item_offset = (i * WarpSize) + flat_id;
410 output[i] = storage_.buffer[item_offset];
415 END_ROCPRIM_NAMESPACE
420 #endif // ROCPRIM_WARP_WARP_EXCHANGE_HPP_ The warp_exchange class is a warp level parallel primitive which provides methods for rearranging ite...
Definition: warp_exchange.hpp:80
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const OffsetT(&ranks)[ItemsPerThread], storage_type &storage)
Orders input values according to ranks using temporary storage, then writes the values to output in a striped arrangement.
Definition: warp_exchange.hpp:390
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronizes all threads in the wavefront.
Definition: thread.hpp:235
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations.
Definition: warp_exchange.hpp:197
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage.
Definition: warp_exchange.hpp:145
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations.
Definition: warp_exchange.hpp:317
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage.
Definition: warp_exchange.hpp:265