30 #ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ 31 #define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ 33 #include <type_traits> 35 #include "../config.hpp" 36 #include "../detail/various.hpp" 38 #include "../intrinsics.hpp" 39 #include "../functional.hpp" 41 #include "detail/block_reduce_warp_reduce.hpp" 42 #include "detail/block_reduce_raking_reduce.hpp" 47 BEGIN_ROCPRIM_NAMESPACE
92 unsigned int BlockSizeX,
93 unsigned int BlockSizeY = 1,
94 unsigned int BlockSizeZ = 1>
97 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
116 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 147 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
153 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
154 input, output, distance
167 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
174 offset(flat_id, input, output, distance, storage);
187 ROCPRIM_DEVICE ROCPRIM_INLINE
194 storage_type_& storage_ = storage.get();
195 storage_.prev[flat_id] = input;
199 const int offset_tid =
static_cast<int>(flat_id) + distance;
200 if ((offset_tid >= 0) && (offset_tid < (int)BlockSize))
202 output = storage_.prev[
static_cast<size_t>(offset_tid)];
231 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
234 unsigned int distance = 1)
237 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
238 input, output, distance
251 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
255 unsigned int distance)
258 rotate(flat_id, input, output, distance, storage);
271 ROCPRIM_DEVICE ROCPRIM_INLINE
275 unsigned int distance,
278 storage_type_& storage_ = storage.get();
279 storage_.prev[flat_id] = input;
283 unsigned int offset = threadIdx.x + distance;
284 if (offset >= BlockSize)
287 output = storage_.prev[
offset];
314 template <
unsigned int ItemsPerThread>
315 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
316 void up(T (&input)[ItemsPerThread],
317 T (&prev)[ItemsPerThread])
320 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
332 template <
unsigned int ItemsPerThread>
333 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
334 void up(
const size_t& flat_id,
335 T (&input)[ItemsPerThread],
336 T (&prev)[ItemsPerThread])
339 this->
up(flat_id, input, prev, storage);
350 template <
unsigned int ItemsPerThread>
351 ROCPRIM_DEVICE ROCPRIM_INLINE
352 void up(
const size_t& flat_id,
353 T (&input)[ItemsPerThread],
354 T (&prev)[ItemsPerThread],
357 storage_type_& storage_ = storage.get();
358 storage_.prev[flat_id] = input[ItemsPerThread -1];
363 for (
unsigned int i = ItemsPerThread - 1; i > 0; --i)
365 prev[i] = input[i - 1];
370 prev[0] = storage_.prev[flat_id - 1];
383 template <
unsigned int ItemsPerThread>
384 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
385 void up(T (&input)[ItemsPerThread],
386 T (&prev)[ItemsPerThread],
390 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
391 input, prev, block_suffix
404 template <
unsigned int ItemsPerThread>
405 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
406 void up(
const size_t& flat_id,
407 T (&input)[ItemsPerThread],
408 T (&prev)[ItemsPerThread],
412 this->
up(flat_id, input, prev, block_suffix, storage);
425 template <
int ItemsPerThread>
426 ROCPRIM_DEVICE ROCPRIM_INLINE
427 void up(
const size_t& flat_id,
428 T (&input)[ItemsPerThread],
429 T (&prev)[ItemsPerThread],
433 up(flat_id, input, prev, storage);
436 block_suffix = storage->prev[BlockSize - 1];
462 template <
unsigned int ItemsPerThread>
463 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
464 void down(T (&input)[ItemsPerThread],
465 T (&next)[ItemsPerThread])
468 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
480 template <
unsigned int ItemsPerThread>
481 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
482 void down(
const size_t& flat_id,
483 T (&input)[ItemsPerThread],
484 T (&next)[ItemsPerThread])
487 this->
down(flat_id, input, next, storage);
498 template <
unsigned int ItemsPerThread>
499 ROCPRIM_DEVICE ROCPRIM_INLINE
500 void down(
const size_t& flat_id,
501 T (&input)[ItemsPerThread],
502 T (&next)[ItemsPerThread],
505 storage_type_& storage_ = storage.get();
506 storage_.next[flat_id] = input[0];
511 for (
unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
513 next[i] = input[i + 1];
516 if (flat_id <(BlockSize -1))
518 next[ItemsPerThread -1] = storage_.next[flat_id + 1];
529 template <
unsigned int ItemsPerThread>
530 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
531 void down(T (&input)[ItemsPerThread],
532 T (&next)[ItemsPerThread],
536 ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
537 input, next, block_prefix
549 template <
unsigned int ItemsPerThread>
550 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
551 void down(
const size_t& flat_id,
552 T (&input)[ItemsPerThread],
553 T (&next)[ItemsPerThread],
557 this->
down(flat_id, input, next, block_prefix, storage);
569 template <
unsigned int ItemsPerThread>
570 ROCPRIM_DEVICE ROCPRIM_INLINE
571 void down(
const size_t& flat_id,
572 T (&input)[ItemsPerThread],
573 T (&next)[ItemsPerThread],
577 this->
down(flat_id, input, next, storage);
580 block_prefix = storage->next[0];
585 END_ROCPRIM_NAMESPACE
590 #endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage.
Definition: block_shuffle.hpp:352
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:551
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(const size_t &flat_id, T input, T &output, int distance)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:168
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:482
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:252
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:334
ROCPRIM_DEVICE ROCPRIM_INLINE void offset(const size_t &flat_id, T input, T &output, int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:188
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:531
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:385
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:571
ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage.
Definition: block_shuffle.hpp:427
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(T input, T &output, int distance=1)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:148
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:316
The block_shuffle class is a block level parallel primitive which provides methods for shuffling data...
Definition: block_shuffle.hpp:95
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:464
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:406
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:500
ROCPRIM_DEVICE ROCPRIM_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:272
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(T input, T &output, unsigned int distance=1)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:232