21 #ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_ 22 #define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_ 24 #include "../config.hpp" 25 #include "../detail/various.hpp" 27 #include "../intrinsics.hpp" 28 #include "../functional.hpp" 29 #include "../types.hpp" 34 BEGIN_ROCPRIM_NAMESPACE
76 unsigned int BlockSizeX,
77 unsigned int ItemsPerThread,
78 unsigned int BlockSizeY = 1,
79 unsigned int BlockSizeZ = 1
83 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
85 static constexpr
unsigned int warp_size =
88 static constexpr
unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
93 static constexpr
bool has_bank_conflicts =
94 ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
95 static constexpr
unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
96 static constexpr
unsigned int bank_conflicts_padding =
97 has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
102 T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
115 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 129 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
131 U (&output)[ItemsPerThread])
167 ROCPRIM_DEVICE ROCPRIM_INLINE
169 U (&output)[ItemsPerThread],
172 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
173 storage_type_& storage_ = storage.get();
175 for(
unsigned int i = 0; i < ItemsPerThread; i++)
177 storage_.buffer[index(flat_id * ItemsPerThread + i)] = input[i];
181 for(
unsigned int i = 0; i < ItemsPerThread; i++)
183 output[i] = storage_.buffer[index(i * BlockSize + flat_id)];
195 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
197 U (&output)[ItemsPerThread])
233 ROCPRIM_DEVICE ROCPRIM_INLINE
235 U (&output)[ItemsPerThread],
238 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
239 storage_type_& storage_ = storage.get();
241 for(
unsigned int i = 0; i < ItemsPerThread; i++)
243 storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
247 for(
unsigned int i = 0; i < ItemsPerThread; i++)
249 output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
261 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
263 U (&output)[ItemsPerThread])
299 ROCPRIM_DEVICE ROCPRIM_INLINE
301 U (&output)[ItemsPerThread],
304 constexpr
unsigned int items_per_warp = warp_size * ItemsPerThread;
306 const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
307 const unsigned int current_warp_size = get_current_warp_size();
308 const unsigned int offset = warp_id * items_per_warp;
309 storage_type_& storage_ = storage.get();
311 for(
unsigned int i = 0; i < ItemsPerThread; i++)
313 storage_.buffer[index(offset + lane_id * ItemsPerThread + i)] = input[i];
318 for(
unsigned int i = 0; i < ItemsPerThread; i++)
320 output[i] = storage_.buffer[index(offset + i * current_warp_size + lane_id)];
332 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
334 U (&output)[ItemsPerThread])
370 ROCPRIM_DEVICE ROCPRIM_INLINE
372 U (&output)[ItemsPerThread],
375 constexpr
unsigned int items_per_warp = warp_size * ItemsPerThread;
377 const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
378 const unsigned int current_warp_size = get_current_warp_size();
379 const unsigned int offset = warp_id * items_per_warp;
380 storage_type_& storage_ = storage.get();
382 for(
unsigned int i = 0; i < ItemsPerThread; i++)
384 storage_.buffer[index(offset + i * current_warp_size + lane_id)] = input[i];
389 for(
unsigned int i = 0; i < ItemsPerThread; i++)
391 output[i] = storage_.buffer[index(offset + lane_id * ItemsPerThread + i)];
404 template<
class U,
class Offset>
405 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
407 U (&output)[ItemsPerThread],
408 const Offset (&ranks)[ItemsPerThread])
423 template<
class U,
class Offset>
424 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
426 U (&output)[ItemsPerThread],
427 const Offset (&ranks)[ItemsPerThread])
465 template<
class U,
class Offset>
466 ROCPRIM_DEVICE ROCPRIM_INLINE
468 U (&output)[ItemsPerThread],
469 const Offset (&ranks)[ItemsPerThread],
472 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
473 storage_type_& storage_ = storage.get();
475 for(
unsigned int i = 0; i < ItemsPerThread; i++)
477 const Offset rank = ranks[i];
478 storage_.buffer[index(rank)] = input[i];
482 for(
unsigned int i = 0; i < ItemsPerThread; i++)
484 output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
498 template <
class U,
class Offset>
499 ROCPRIM_DEVICE ROCPRIM_INLINE
501 U (&output)[ItemsPerThread],
502 const Offset (&ranks)[ItemsPerThread],
505 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
506 storage_type_& storage_ = storage.get();
508 for(
unsigned int i = 0; i < ItemsPerThread; i++)
510 storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
514 for(
unsigned int i = 0; i < ItemsPerThread; i++)
516 const Offset rank = ranks[i];
517 output[i] = storage_.buffer[index(rank)];
530 template<
class U,
class Offset>
531 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
533 U (&output)[ItemsPerThread],
534 const Offset (&ranks)[ItemsPerThread])
572 template<
class U,
class Offset>
573 ROCPRIM_DEVICE ROCPRIM_INLINE
575 U (&output)[ItemsPerThread],
576 const Offset (&ranks)[ItemsPerThread],
579 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
580 storage_type_& storage_ = storage.get();
582 for(
unsigned int i = 0; i < ItemsPerThread; i++)
584 const Offset rank = ranks[i];
585 storage_.buffer[rank] = input[i];
589 for(
unsigned int i = 0; i < ItemsPerThread; i++)
591 output[i] = storage_.buffer[i * BlockSize + flat_id];
607 template<
class U,
class Offset>
608 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
610 U (&output)[ItemsPerThread],
611 const Offset (&ranks)[ItemsPerThread])
652 template<
class U,
class Offset>
653 ROCPRIM_DEVICE ROCPRIM_INLINE
655 U (&output)[ItemsPerThread],
656 const Offset (&ranks)[ItemsPerThread],
659 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
660 storage_type_& storage_ = storage.get();
662 for(
unsigned int i = 0; i < ItemsPerThread; i++)
664 const Offset rank = ranks[i];
667 storage_.buffer[rank] = input[i];
672 for(
unsigned int i = 0; i < ItemsPerThread; i++)
674 output[i] = storage_.buffer[i * BlockSize + flat_id];
689 template<
class U,
class Offset,
class ValidFlag>
690 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
692 U (&output)[ItemsPerThread],
693 const Offset (&ranks)[ItemsPerThread],
694 const ValidFlag (&is_valid)[ItemsPerThread])
736 template<
class U,
class Offset,
class ValidFlag>
737 ROCPRIM_DEVICE ROCPRIM_INLINE
739 U (&output)[ItemsPerThread],
740 const Offset (&ranks)[ItemsPerThread],
741 const ValidFlag (&is_valid)[ItemsPerThread],
744 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
745 storage_type_& storage_ = storage.get();
747 for(
unsigned int i = 0; i < ItemsPerThread; i++)
749 const Offset rank = ranks[i];
752 storage_.buffer[rank] = input[i];
757 for(
unsigned int i = 0; i < ItemsPerThread; i++)
759 output[i] = storage_.buffer[i * BlockSize + flat_id];
765 ROCPRIM_DEVICE ROCPRIM_INLINE
766 unsigned int get_current_warp_size()
const 768 const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
769 return (warp_id == warps_no - 1)
770 ? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size)
775 ROCPRIM_DEVICE ROCPRIM_INLINE
776 unsigned int index(
unsigned int n)
779 return has_bank_conflicts ? (n + n / banks_no) : n;
783 END_ROCPRIM_NAMESPACE
788 #endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:262
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_flagged(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage.
Definition: block_exchange.hpp:738
ROCPRIM_DEVICE ROCPRIM_INLINE void warp_striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:371
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_guarded(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank.
Definition: block_exchange.hpp:609
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:168
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns the number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a blocked arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:406
The block_exchange class is a block-level parallel primitive which provides methods for rearranging items...
Definition: block_exchange.hpp:81
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:196
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronizes all threads in the wavefront.
Definition: thread.hpp:235
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:234
const unsigned int warp_id
Returns the warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronizes all threads in a block (tile).
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:532
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:467
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void gather_from_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Gathers items from a striped arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:425
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_flagged(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity.
Definition: block_exchange.hpp:691
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_guarded(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage.
Definition: block_exchange.hpp:654
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:300
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE void gather_from_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:500
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void warp_striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:333
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:574