30 #ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_    31 #define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_    33 #include <type_traits>    35 #include "../config.hpp"    36 #include "../detail/various.hpp"    38 #include "../intrinsics.hpp"    39 #include "../functional.hpp"    41 #include "detail/block_reduce_warp_reduce.hpp"    42 #include "detail/block_reduce_raking_reduce.hpp"    47 BEGIN_ROCPRIM_NAMESPACE
    92     unsigned int BlockSizeX,
    93     unsigned int BlockSizeY = 1,
    94     unsigned int BlockSizeZ = 1>
    97     static constexpr 
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
   116     #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen   147     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   153             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   154             input, output, distance
   167     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   174         offset(flat_id, input, output, distance, storage);
   187     ROCPRIM_DEVICE ROCPRIM_INLINE
   194         storage_type_& storage_ = storage.get();
   195         storage_.prev[flat_id] = input;
   199         const int offset_tid = 
static_cast<int>(flat_id) + distance;
   200         if ((offset_tid >= 0) && (offset_tid < (int)BlockSize))
   202             output = storage_.prev[
static_cast<size_t>(offset_tid)];
   231     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   234                 unsigned int distance = 1)
   237             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   238             input, output, distance
   251     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   255                 unsigned int distance)
   258         rotate(flat_id, input, output, distance, storage);
   271     ROCPRIM_DEVICE ROCPRIM_INLINE
   275                 unsigned int distance,
   278         storage_type_& storage_ = storage.get();
   279         storage_.prev[flat_id] = input;
   283         unsigned int offset = threadIdx.x + distance;
   284         if (offset >= BlockSize)
   287         output = storage_.prev[
offset];
   314     template <
unsigned int ItemsPerThread>
   315     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   316     void up(T (&input)[ItemsPerThread],
   317             T (&prev)[ItemsPerThread])
   320             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   332     template <
unsigned int ItemsPerThread>
   333     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   334     void up(
const size_t& flat_id,
   335             T (&input)[ItemsPerThread],
   336             T (&prev)[ItemsPerThread])
   339         this->
up(flat_id, input, prev, storage);
   350     template <
unsigned int ItemsPerThread>
   351     ROCPRIM_DEVICE ROCPRIM_INLINE
   352     void up(
const size_t& flat_id,
   353             T (&input)[ItemsPerThread],
   354             T (&prev)[ItemsPerThread],
   357         storage_type_& storage_ = storage.get();
   358         storage_.prev[flat_id] = input[ItemsPerThread -1];
   363         for (
unsigned int i = ItemsPerThread - 1; i > 0; --i)
   365             prev[i] = input[i - 1];
   370             prev[0] = storage_.prev[flat_id - 1];
   383     template <
unsigned int ItemsPerThread>
   384     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   385     void up(T (&input)[ItemsPerThread],
   386             T (&prev)[ItemsPerThread],
   390             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   391             input, prev, block_suffix
   404     template <
unsigned int ItemsPerThread>
   405     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   406     void up(
const size_t& flat_id,
   407             T (&input)[ItemsPerThread],
   408             T (&prev)[ItemsPerThread],
   412         this->
up(flat_id, input, prev, block_suffix, storage);
   425     template <
int ItemsPerThread>
   426     ROCPRIM_DEVICE ROCPRIM_INLINE
   427     void up(
const size_t& flat_id,
   428             T (&input)[ItemsPerThread],
   429             T (&prev)[ItemsPerThread],
   433         up(flat_id, input, prev, storage);
   436         block_suffix = storage->prev[BlockSize - 1];
   462     template <
unsigned int ItemsPerThread>
   463     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   464     void down(T (&input)[ItemsPerThread],
   465               T (&next)[ItemsPerThread])
   468             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   480     template <
unsigned int ItemsPerThread>
   481     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   482     void down(
const size_t& flat_id,
   483               T (&input)[ItemsPerThread],
   484               T (&next)[ItemsPerThread])
   487         this->
down(flat_id, input, next, storage);
   498     template <
unsigned int ItemsPerThread>
   499     ROCPRIM_DEVICE ROCPRIM_INLINE
   500     void down(
const size_t& flat_id,
   501               T (&input)[ItemsPerThread],
   502               T (&next)[ItemsPerThread],
   505         storage_type_& storage_ = storage.get();
   506         storage_.next[flat_id] = input[0];
   511         for (
unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
   513           next[i] = input[i + 1];
   516         if (flat_id <(BlockSize -1))
   518           next[ItemsPerThread -1] = storage_.next[flat_id + 1];
   529     template <
unsigned int ItemsPerThread>
   530     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   531     void down(T (&input)[ItemsPerThread],
   532               T (&next)[ItemsPerThread],
   536             ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
   537             input, next, block_prefix
   549     template <
unsigned int ItemsPerThread>
   550     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
   551     void down(
const size_t& flat_id,
   552               T (&input)[ItemsPerThread],
   553               T (&next)[ItemsPerThread],
   557         this->
down(flat_id, input, next, block_prefix, storage);
   569     template <
unsigned int ItemsPerThread>
   570     ROCPRIM_DEVICE ROCPRIM_INLINE
   571     void down(
const size_t& flat_id,
   572               T (&input)[ItemsPerThread],
   573               T (&next)[ItemsPerThread],
   577         this->
down(flat_id, input, next, storage);
   580         block_prefix = storage->next[0];
   585 END_ROCPRIM_NAMESPACE
   590 #endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage. 
Definition: block_shuffle.hpp:352
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:551
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(const size_t &flat_id, T input, T &output, int distance)
Shuffles data across threads in a block, offseted by the distance value. 
Definition: block_shuffle.hpp:168
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:482
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance)
Shuffles data across threads in a block, offseted by the distance value. 
Definition: block_shuffle.hpp:252
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item. 
Definition: block_shuffle.hpp:334
ROCPRIM_DEVICE ROCPRIM_INLINE void offset(const size_t &flat_id, T input, T &output, int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:188
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:531
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item. 
Definition: block_shuffle.hpp:385
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:571
ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage. 
Definition: block_shuffle.hpp:427
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile) 
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(T input, T &output, int distance=1)
Shuffles data across threads in a block, offseted by the distance value. 
Definition: block_shuffle.hpp:148
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item. 
Definition: block_shuffle.hpp:316
The block_shuffle class is a block level parallel primitive which provides methods for shuffling data...
Definition: block_shuffle.hpp:95
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:464
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item. 
Definition: block_shuffle.hpp:406
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:500
ROCPRIM_DEVICE ROCPRIM_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:272
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(T input, T &output, unsigned int distance=1)
Shuffles data across threads in a block, offseted by the distance value. 
Definition: block_shuffle.hpp:232