21 #ifndef ROCPRIM_WARP_WARP_EXCHANGE_HPP_    22 #define ROCPRIM_WARP_WARP_EXCHANGE_HPP_    24 #include "../config.hpp"    25 #include "../detail/various.hpp"    27 #include "../intrinsics.hpp"    28 #include "../intrinsics/warp_shuffle.hpp"    29 #include "../functional.hpp"    30 #include "../types.hpp"    35 BEGIN_ROCPRIM_NAMESPACE
    // Fragment of the warp_exchange class template: one template parameter, the
    // compile-time checks on WarpSize, and the exchange buffer declaration.
    // ItemsPerThread sizes the per-thread input/output arrays of the member functions.
    77     unsigned int ItemsPerThread,
    // Compilation fails with the message below unless is_power_of_two(WarpSize) holds.
    82     static_assert(::rocprim::detail::is_power_of_two(WarpSize),
    83                   "Logical warp size must be a power of two.");
    // Message of a second static_assert; the line holding its condition is not
    // present in this listing.
    85                   "Logical warp size cannot be larger than physical warp size.");
    // Exchange buffer: WarpSize * ItemsPerThread elements of T, i.e. one slot per
    // item per lane of the logical warp.
    90         T buffer[WarpSize * ItemsPerThread];
   // blocked_to_striped (listing lines 144-162). The function-name line, the braces
   // and several other lines are absent from this listing.
   // Transposes a blocked arrangement of items to a striped arrangement across the
   // warp by staging the items in the temporary storage passed as `storage`.
   //   input   - the ItemsPerThread items held by the calling lane.
   //   output  - receives the ItemsPerThread items of the striped arrangement.
   //   storage - temporary storage; storage.get() yields the object whose `buffer`
   //             is used for the exchange.
   103     #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen   144     ROCPRIM_DEVICE ROCPRIM_INLINE
   146                             U (&output)[ItemsPerThread],
   // Id of the calling lane as returned by logical_lane_id<WarpSize>().
   149         const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
   150         storage_type_& storage_ = storage.get();
   // Store phase: lane flat_id writes its items to the contiguous slots
   // flat_id * ItemsPerThread + i, for i in [0, ItemsPerThread).
   153         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   155             storage_.buffer[flat_id * ItemsPerThread + i] = input[i];
   // NOTE(review): the listing skips the lines between the two loops. The loads
   // below read slots written by other lanes, so a wavefront-level synchronization
   // (e.g. wave_barrier) is presumably placed here - confirm against the full source.
   // Load phase: item i of this lane comes from slot i * WarpSize + flat_id, so
   // consecutive lanes read consecutive slots.
   160         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   162             output[i] = storage_.buffer[i * WarpSize + flat_id];
   // blocked_to_striped_shuffle (listing lines 196-225). The function-name line, the
   // braces and the statement that produces `value` are absent from this listing.
   // Transposes a blocked arrangement of items to a striped arrangement across the
   // warp using warp shuffle operations; no temporary storage parameter is taken.
   //   input   - the ItemsPerThread items held by the calling lane.
   //   output  - receives the ItemsPerThread items of the striped arrangement.
   196     ROCPRIM_DEVICE ROCPRIM_INLINE
   198                                     U (&output)[ItemsPerThread])
   // The lane arithmetic below divides WarpSize by ItemsPerThread, so the division
   // must be exact; otherwise compilation fails with this message.
   200         static_assert(WarpSize % ItemsPerThread == 0,
   201                       "ItemsPerThread must be a divisor of WarpSize to use blocked_to_striped_shuffle");
   // Id of the calling lane as returned by logical_lane_id<WarpSize>().
   202         const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
   // Local staging array; it is filled per dst_idx and copied to output at the end.
   203         U work_array[ItemsPerThread];
   // dst_idx selects the output slot being produced. The inner loop runs over every
   // src_idx in every lane, and only one iteration per lane is kept (see the `if`).
   206         for(
unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
   209             for(
unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
   // Lane expression: with WarpSize % ItemsPerThread == 0 this equals
   // (dst_idx * WarpSize + flat_id) / ItemsPerThread, i.e. the lane that holds
   // element dst_idx * WarpSize + flat_id in the blocked arrangement.
   // NOTE(review): the enclosing call (presumably warp_shuffle producing `value`)
   // is on lines omitted from this listing - confirm against the full source.
   213                     flat_id / ItemsPerThread + dst_idx * (WarpSize / ItemsPerThread)
   // Keep the value only for src_idx == flat_id % ItemsPerThread, which equals
   // (dst_idx * WarpSize + flat_id) % ItemsPerThread: the position of that element
   // within its owning lane in the blocked arrangement.
   215                 if(src_idx == flat_id % ItemsPerThread)
   217                     work_array[dst_idx] = value;
   // Copy the gathered items to the caller's output array.
   223         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   225             output[i] = work_array[i];
   // striped_to_blocked (listing lines 264-282). The function-name line, the braces
   // and several other lines are absent from this listing.
   // Transposes a striped arrangement of items to a blocked arrangement across the
   // warp by staging the items in the temporary storage passed as `storage`.
   //   input   - the ItemsPerThread items held by the calling lane.
   //   output  - receives the ItemsPerThread items of the blocked arrangement.
   //   storage - temporary storage; storage.get() yields the object whose `buffer`
   //             is used for the exchange.
   264     ROCPRIM_DEVICE ROCPRIM_INLINE
   266                             U (&output)[ItemsPerThread],
   // Id of the calling lane as returned by logical_lane_id<WarpSize>().
   269         const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
   270         storage_type_& storage_ = storage.get();
   // Store phase: item i of this lane goes to slot i * WarpSize + flat_id, so
   // consecutive lanes write consecutive slots.
   273         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   275             storage_.buffer[i * WarpSize + flat_id] = input[i];
   // NOTE(review): the listing skips the lines between the two loops. The loads
   // below read slots written by other lanes, so a wavefront-level synchronization
   // (e.g. wave_barrier) is presumably placed here - confirm against the full source.
   // Load phase: lane flat_id reads the contiguous slots
   // flat_id * ItemsPerThread + i, for i in [0, ItemsPerThread).
   280         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   282             output[i] = storage_.buffer[flat_id * ItemsPerThread + i];
   // striped_to_blocked_shuffle (listing lines 316-345). The function-name line, the
   // braces and the statement that produces `value` are absent from this listing.
   // Transposes a striped arrangement of items to a blocked arrangement across the
   // warp using warp shuffle operations; no temporary storage parameter is taken.
   //   input   - the ItemsPerThread items held by the calling lane.
   //   output  - receives the ItemsPerThread items of the blocked arrangement.
   316     ROCPRIM_DEVICE ROCPRIM_INLINE
   318                                     U (&output)[ItemsPerThread])
   // The lane arithmetic below divides WarpSize by ItemsPerThread, so the division
   // must be exact; otherwise compilation fails with this message.
   320         static_assert(WarpSize % ItemsPerThread == 0,
   321                       "ItemsPerThread must be a divisor of WarpSize to use striped_to_blocked_shuffle");
   // Id of the calling lane as returned by logical_lane_id<WarpSize>().
   322         const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
   // Local staging array; it is filled per dst_idx and copied to output at the end.
   323         U work_array[ItemsPerThread];
   // dst_idx selects the output slot being produced. The inner loop runs over every
   // src_idx in every lane, and only one iteration per lane is kept (see the `if`).
   326         for(
unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
   329             for(
unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
   // Lane expression: element ItemsPerThread * flat_id + dst_idx of the blocked
   // arrangement, reduced modulo WarpSize, i.e. the lane that holds that element
   // in the striped arrangement.
   // NOTE(review): the enclosing call (presumably warp_shuffle producing `value`)
   // is on lines omitted from this listing - confirm against the full source.
   333                     (ItemsPerThread * flat_id + dst_idx) % WarpSize
   // Keep the value only for src_idx == flat_id / (WarpSize / ItemsPerThread). With
   // WarpSize % ItemsPerThread == 0 this equals
   // (ItemsPerThread * flat_id + dst_idx) / WarpSize, the position of that element
   // within its owning lane in the striped arrangement.
   335                 if(flat_id / (WarpSize / ItemsPerThread) == src_idx)
   337                     work_array[dst_idx] = value;
   // Copy the gathered items to the caller's output array.
   343         for(
unsigned int i = 0; i < ItemsPerThread; i++)
   345             output[i] = work_array[i];
   // scatter_to_striped (listing lines 388-410). The function-name line, the line
   // declaring the storage parameter and the braces are absent from this listing.
   // Places each input item into the temporary buffer at the position given by its
   // rank, then reads the buffer back in a striped arrangement.
   //   U       - element type of the output array.
   //   OffsetT - element type of the ranks array; its values are used as buffer indices.
   //   input   - the ItemsPerThread items held by the calling lane.
   //   output  - receives the ItemsPerThread items of the striped arrangement.
   //   ranks   - ranks[i] is the buffer slot that input[i] is written to.
   388     template<
class U, 
class OffsetT>
   389     ROCPRIM_DEVICE ROCPRIM_INLINE
   391             const T (&input)[ItemsPerThread],
   392             U (&output)[ItemsPerThread],
   393             const OffsetT (&ranks)[ItemsPerThread],
   // Id of the calling lane as returned by logical_lane_id<WarpSize>().
   396         const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
   397         storage_type_& storage_ = storage.get();
   // Scatter phase: ranks[i] indexes the buffer directly with no range check in the
   // visible code. The buffer is declared with WarpSize * ItemsPerThread elements,
   // so callers must supply ranks inside that range.
   400         for (
unsigned int i = 0; i < ItemsPerThread; i++)
   402             storage_.buffer[ranks[i]] = input[i];
   // NOTE(review): the listing skips the lines between the two loops. The loads
   // below read slots written by other lanes, so a wavefront-level synchronization
   // (e.g. wave_barrier) is presumably placed here - confirm against the full source.
   // Gather phase: item i of this lane comes from slot i * WarpSize + flat_id.
   407         for (
unsigned int i = 0; i < ItemsPerThread; i++)
   409             unsigned int item_offset = (i * WarpSize) + flat_id;
   410             output[i] = storage_.buffer[item_offset];
   415 END_ROCPRIM_NAMESPACE
   420 #endif // ROCPRIM_WARP_WARP_EXCHANGE_HPP_
The warp_exchange class is a warp-level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.
Definition: warp_exchange.hpp:80
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const OffsetT(&ranks)[ItemsPerThread], storage_type &storage)
Orders input values according to ranks using temporary storage, then writes the values to output in a striped manner.
Definition: warp_exchange.hpp:390
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type. 
Definition: warp_shuffle.hpp:172
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target. 
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronize all threads in the wavefront. 
Definition: thread.hpp:235
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations. 
Definition: warp_exchange.hpp:197
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage. 
Definition: warp_exchange.hpp:145
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations. 
Definition: warp_exchange.hpp:317
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage. 
Definition: warp_exchange.hpp:265