rocPRIM
block_exchange.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
22 #define ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
23 
24 #include "../config.hpp"
25 #include "../detail/various.hpp"
26 
27 #include "../intrinsics.hpp"
28 #include "../functional.hpp"
29 #include "../types.hpp"
30 
33 
34 BEGIN_ROCPRIM_NAMESPACE
35 
74 template<
75  class T,
76  unsigned int BlockSizeX,
77  unsigned int ItemsPerThread,
78  unsigned int BlockSizeY = 1,
79  unsigned int BlockSizeZ = 1
80 >
82 {
83  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
84  // Select warp size
85  static constexpr unsigned int warp_size =
86  detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
87  // Number of warps in block
88  static constexpr unsigned int warps_no = (BlockSize + warp_size - 1) / warp_size;
89 
90  // Minimize LDS bank conflicts for power-of-two strides, i.e. when items accessed
91  // using `thread_id * ItemsPerThread` pattern where ItemsPerThread is power of two
92  // (all exchanges from/to blocked).
93  static constexpr bool has_bank_conflicts =
94  ItemsPerThread >= 2 && ::rocprim::detail::is_power_of_two(ItemsPerThread);
95  static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
96  static constexpr unsigned int bank_conflicts_padding =
97  has_bank_conflicts ? (BlockSize * ItemsPerThread / banks_no) : 0;
98 
99  // Struct used for creating a raw_storage object for this primitive's temporary storage.
100  struct storage_type_
101  {
102  T buffer[BlockSize * ItemsPerThread + bank_conflicts_padding];
103  };
104 
105 public:
106 
115  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
117  #else
118  using storage_type = storage_type_; // only for Doxygen
119  #endif
120 
128  template<class U>
129  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
130  void blocked_to_striped(const T (&input)[ItemsPerThread],
131  U (&output)[ItemsPerThread])
132  {
133  ROCPRIM_SHARED_MEMORY storage_type storage;
134  blocked_to_striped(input, output, storage);
135  }
136 
166  template<class U>
167  ROCPRIM_DEVICE ROCPRIM_INLINE
168  void blocked_to_striped(const T (&input)[ItemsPerThread],
169  U (&output)[ItemsPerThread],
170  storage_type& storage)
171  {
172  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
173  storage_type_& storage_ = storage.get();
174 
175  for(unsigned int i = 0; i < ItemsPerThread; i++)
176  {
177  storage_.buffer[index(flat_id * ItemsPerThread + i)] = input[i];
178  }
180 
181  for(unsigned int i = 0; i < ItemsPerThread; i++)
182  {
183  output[i] = storage_.buffer[index(i * BlockSize + flat_id)];
184  }
185  }
186 
194  template<class U>
195  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
196  void striped_to_blocked(const T (&input)[ItemsPerThread],
197  U (&output)[ItemsPerThread])
198  {
199  ROCPRIM_SHARED_MEMORY storage_type storage;
200  striped_to_blocked(input, output, storage);
201  }
202 
232  template<class U>
233  ROCPRIM_DEVICE ROCPRIM_INLINE
234  void striped_to_blocked(const T (&input)[ItemsPerThread],
235  U (&output)[ItemsPerThread],
236  storage_type& storage)
237  {
238  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
239  storage_type_& storage_ = storage.get();
240 
241  for(unsigned int i = 0; i < ItemsPerThread; i++)
242  {
243  storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
244  }
246 
247  for(unsigned int i = 0; i < ItemsPerThread; i++)
248  {
249  output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
250  }
251  }
252 
260  template<class U>
261  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
262  void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
263  U (&output)[ItemsPerThread])
264  {
265  ROCPRIM_SHARED_MEMORY storage_type storage;
266  blocked_to_warp_striped(input, output, storage);
267  }
268 
298  template<class U>
299  ROCPRIM_DEVICE ROCPRIM_INLINE
300  void blocked_to_warp_striped(const T (&input)[ItemsPerThread],
301  U (&output)[ItemsPerThread],
302  storage_type& storage)
303  {
304  constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
305  const unsigned int lane_id = ::rocprim::lane_id();
306  const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
307  const unsigned int current_warp_size = get_current_warp_size();
308  const unsigned int offset = warp_id * items_per_warp;
309  storage_type_& storage_ = storage.get();
310 
311  for(unsigned int i = 0; i < ItemsPerThread; i++)
312  {
313  storage_.buffer[index(offset + lane_id * ItemsPerThread + i)] = input[i];
314  }
315 
317 
318  for(unsigned int i = 0; i < ItemsPerThread; i++)
319  {
320  output[i] = storage_.buffer[index(offset + i * current_warp_size + lane_id)];
321  }
322  }
323 
331  template<class U>
332  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
333  void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
334  U (&output)[ItemsPerThread])
335  {
336  ROCPRIM_SHARED_MEMORY storage_type storage;
337  warp_striped_to_blocked(input, output, storage);
338  }
339 
369  template<class U>
370  ROCPRIM_DEVICE ROCPRIM_INLINE
371  void warp_striped_to_blocked(const T (&input)[ItemsPerThread],
372  U (&output)[ItemsPerThread],
373  storage_type& storage)
374  {
375  constexpr unsigned int items_per_warp = warp_size * ItemsPerThread;
376  const unsigned int lane_id = ::rocprim::lane_id();
377  const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
378  const unsigned int current_warp_size = get_current_warp_size();
379  const unsigned int offset = warp_id * items_per_warp;
380  storage_type_& storage_ = storage.get();
381 
382  for(unsigned int i = 0; i < ItemsPerThread; i++)
383  {
384  storage_.buffer[index(offset + i * current_warp_size + lane_id)] = input[i];
385  }
386 
388 
389  for(unsigned int i = 0; i < ItemsPerThread; i++)
390  {
391  output[i] = storage_.buffer[index(offset + lane_id * ItemsPerThread + i)];
392  }
393  }
394 
404  template<class U, class Offset>
405  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
406  void scatter_to_blocked(const T (&input)[ItemsPerThread],
407  U (&output)[ItemsPerThread],
408  const Offset (&ranks)[ItemsPerThread])
409  {
410  ROCPRIM_SHARED_MEMORY storage_type storage;
411  scatter_to_blocked(input, output, ranks, storage);
412  }
413 
423  template<class U, class Offset>
424  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
425  void gather_from_striped(const T (&input)[ItemsPerThread],
426  U (&output)[ItemsPerThread],
427  const Offset (&ranks)[ItemsPerThread])
428  {
429  ROCPRIM_SHARED_MEMORY storage_type storage;
430  gather_from_striped(input, output, ranks, storage);
431  }
432 
465  template<class U, class Offset>
466  ROCPRIM_DEVICE ROCPRIM_INLINE
467  void scatter_to_blocked(const T (&input)[ItemsPerThread],
468  U (&output)[ItemsPerThread],
469  const Offset (&ranks)[ItemsPerThread],
470  storage_type& storage)
471  {
472  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
473  storage_type_& storage_ = storage.get();
474 
475  for(unsigned int i = 0; i < ItemsPerThread; i++)
476  {
477  const Offset rank = ranks[i];
478  storage_.buffer[index(rank)] = input[i];
479  }
481 
482  for(unsigned int i = 0; i < ItemsPerThread; i++)
483  {
484  output[i] = storage_.buffer[index(flat_id * ItemsPerThread + i)];
485  }
486  }
487 
498  template <class U, class Offset>
499  ROCPRIM_DEVICE ROCPRIM_INLINE
500  void gather_from_striped(const T (&input)[ItemsPerThread],
501  U (&output)[ItemsPerThread],
502  const Offset (&ranks)[ItemsPerThread],
503  storage_type& storage)
504  {
505  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
506  storage_type_& storage_ = storage.get();
507 
508  for(unsigned int i = 0; i < ItemsPerThread; i++)
509  {
510  storage_.buffer[index(i * BlockSize + flat_id)] = input[i];
511  }
513 
514  for(unsigned int i = 0; i < ItemsPerThread; i++)
515  {
516  const Offset rank = ranks[i];
517  output[i] = storage_.buffer[index(rank)];
518  }
519  }
520 
530  template<class U, class Offset>
531  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
532  void scatter_to_striped(const T (&input)[ItemsPerThread],
533  U (&output)[ItemsPerThread],
534  const Offset (&ranks)[ItemsPerThread])
535  {
536  ROCPRIM_SHARED_MEMORY storage_type storage;
537  scatter_to_striped(input, output, ranks, storage);
538  }
539 
572  template<class U, class Offset>
573  ROCPRIM_DEVICE ROCPRIM_INLINE
574  void scatter_to_striped(const T (&input)[ItemsPerThread],
575  U (&output)[ItemsPerThread],
576  const Offset (&ranks)[ItemsPerThread],
577  storage_type& storage)
578  {
579  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
580  storage_type_& storage_ = storage.get();
581 
582  for(unsigned int i = 0; i < ItemsPerThread; i++)
583  {
584  const Offset rank = ranks[i];
585  storage_.buffer[rank] = input[i];
586  }
588 
589  for(unsigned int i = 0; i < ItemsPerThread; i++)
590  {
591  output[i] = storage_.buffer[i * BlockSize + flat_id];
592  }
593  }
594 
607  template<class U, class Offset>
608  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
609  void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
610  U (&output)[ItemsPerThread],
611  const Offset (&ranks)[ItemsPerThread])
612  {
613  ROCPRIM_SHARED_MEMORY storage_type storage;
614  scatter_to_striped_guarded(input, output, ranks, storage);
615  }
616 
652  template<class U, class Offset>
653  ROCPRIM_DEVICE ROCPRIM_INLINE
654  void scatter_to_striped_guarded(const T (&input)[ItemsPerThread],
655  U (&output)[ItemsPerThread],
656  const Offset (&ranks)[ItemsPerThread],
657  storage_type& storage)
658  {
659  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
660  storage_type_& storage_ = storage.get();
661 
662  for(unsigned int i = 0; i < ItemsPerThread; i++)
663  {
664  const Offset rank = ranks[i];
665  if(rank >= 0)
666  {
667  storage_.buffer[rank] = input[i];
668  }
669  }
671 
672  for(unsigned int i = 0; i < ItemsPerThread; i++)
673  {
674  output[i] = storage_.buffer[i * BlockSize + flat_id];
675  }
676  }
677 
689  template<class U, class Offset, class ValidFlag>
690  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
691  void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
692  U (&output)[ItemsPerThread],
693  const Offset (&ranks)[ItemsPerThread],
694  const ValidFlag (&is_valid)[ItemsPerThread])
695  {
696  ROCPRIM_SHARED_MEMORY storage_type storage;
697  scatter_to_striped_flagged(input, output, ranks, is_valid, storage);
698  }
699 
736  template<class U, class Offset, class ValidFlag>
737  ROCPRIM_DEVICE ROCPRIM_INLINE
738  void scatter_to_striped_flagged(const T (&input)[ItemsPerThread],
739  U (&output)[ItemsPerThread],
740  const Offset (&ranks)[ItemsPerThread],
741  const ValidFlag (&is_valid)[ItemsPerThread],
742  storage_type& storage)
743  {
744  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
745  storage_type_& storage_ = storage.get();
746 
747  for(unsigned int i = 0; i < ItemsPerThread; i++)
748  {
749  const Offset rank = ranks[i];
750  if(is_valid[i])
751  {
752  storage_.buffer[rank] = input[i];
753  }
754  }
756 
757  for(unsigned int i = 0; i < ItemsPerThread; i++)
758  {
759  output[i] = storage_.buffer[i * BlockSize + flat_id];
760  }
761  }
762 
763 private:
764 
765  ROCPRIM_DEVICE ROCPRIM_INLINE
766  unsigned int get_current_warp_size() const
767  {
768  const unsigned int warp_id = ::rocprim::warp_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
769  return (warp_id == warps_no - 1)
770  ? (BlockSize % warp_size > 0 ? BlockSize % warp_size : warp_size)
771  : warp_size;
772  }
773 
774  // Change index to minimize LDS bank conflicts if necessary
775  ROCPRIM_DEVICE ROCPRIM_INLINE
776  unsigned int index(unsigned int n)
777  {
778  // Move every 32-bank wide "row" (32 banks * 4 bytes) by one item
779  return has_bank_conflicts ? (n + n / banks_no) : n;
780  }
781 };
782 
783 END_ROCPRIM_NAMESPACE
784 
786 // end of group blockmodule
787 
788 #endif // ROCPRIM_BLOCK_BLOCK_EXCHANGE_HPP_
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:262
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_flagged(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage.
Definition: block_exchange.hpp:738
ROCPRIM_DEVICE ROCPRIM_INLINE void warp_striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:371
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_guarded(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank.
Definition: block_exchange.hpp:609
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:168
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a blocked arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:406
The block_exchange class is a block level parallel primitive which provides methods for rearranging i...
Definition: block_exchange.hpp:81
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:196
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronize all threads in the wavefront.
Definition: thread.hpp:235
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:234
const unsigned int warp_id
Returns warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:532
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:467
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void gather_from_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
Gathers items from a striped arrangement based on their ranks across the thread block.
Definition: block_exchange.hpp:425
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_flagged(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread])
Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity.
Definition: block_exchange.hpp:691
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_guarded(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage.
Definition: block_exchange.hpp:654
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:300
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE void gather_from_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:500
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void warp_striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:333
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage.
Definition: block_exchange.hpp:574