rocPRIM
block_shuffle.hpp
1 /******************************************************************************
2  * Copyright (c) 2011, Duane Merrill. All rights reserved.
3  * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved.
4  * Modifications Copyright (c) 2021, Advanced Micro Devices, Inc. All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  * * Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * * Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * * Neither the name of the NVIDIA CORPORATION nor the
14  * names of its contributors may be used to endorse or promote products
15  * derived from this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20  * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
21  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  *
28  ******************************************************************************/
29 
30 #ifndef ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
31 #define ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
32 
33 #include <type_traits>
34 
35 #include "../config.hpp"
36 #include "../detail/various.hpp"
37 
38 #include "../intrinsics.hpp"
39 #include "../functional.hpp"
40 
41 #include "detail/block_reduce_warp_reduce.hpp"
42 #include "detail/block_reduce_raking_reduce.hpp"
43 
46 
47 BEGIN_ROCPRIM_NAMESPACE
48 
90 template<
91  class T,
92  unsigned int BlockSizeX,
93  unsigned int BlockSizeY = 1,
94  unsigned int BlockSizeZ = 1>
96 {
97  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
98 
99  // Struct used for creating a raw_storage object for this primitive's temporary storage.
100  struct storage_type_
101  {
102  T prev[BlockSize];
103  T next[BlockSize];
104  };
105 
106 public:
107 
116  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
118  #else
119  using storage_type = storage_type_; // only for Doxygen
120  #endif
121 
147  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
148  void offset(T input,
149  T& output,
150  int distance = 1)
151  {
152  offset(
153  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
154  input, output, distance
155  );
156  }
157 
167  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
168  void offset(const size_t& flat_id,
169  T input,
170  T& output,
171  int distance)
172  {
173  ROCPRIM_SHARED_MEMORY storage_type storage;
174  offset(flat_id, input, output, distance, storage);
175  }
176 
187  ROCPRIM_DEVICE ROCPRIM_INLINE
188  void offset(const size_t& flat_id,
189  T input,
190  T& output,
191  int distance,
192  storage_type& storage)
193  {
194  storage_type_& storage_ = storage.get();
195  storage_.prev[flat_id] = input;
196 
198 
199  const int offset_tid = static_cast<int>(flat_id) + distance;
200  if ((offset_tid >= 0) && (offset_tid < (int)BlockSize))
201  {
202  output = storage_.prev[static_cast<size_t>(offset_tid)];
203  }
204  }
205 
231  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
232  void rotate(T input,
233  T& output,
234  unsigned int distance = 1)
235  {
236  rotate(
237  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
238  input, output, distance
239  );
240  }
241 
251  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
252  void rotate(const size_t& flat_id,
253  T input,
254  T& output,
255  unsigned int distance)
256  {
257  ROCPRIM_SHARED_MEMORY storage_type storage;
258  rotate(flat_id, input, output, distance, storage);
259  }
260 
271  ROCPRIM_DEVICE ROCPRIM_INLINE
272  void rotate(const size_t& flat_id,
273  T input,
274  T& output,
275  unsigned int distance,
276  storage_type& storage)
277  {
278  storage_type_& storage_ = storage.get();
279  storage_.prev[flat_id] = input;
280 
282 
283  unsigned int offset = threadIdx.x + distance;
284  if (offset >= BlockSize)
285  offset -= BlockSize;
286 
287  output = storage_.prev[offset];
288  }
289 
290 
314  template <unsigned int ItemsPerThread>
315  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
316  void up(T (&input)[ItemsPerThread],
317  T (&prev)[ItemsPerThread])
318  {
319  this->up(
320  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
321  input, prev
322  );
323  }
324 
332  template <unsigned int ItemsPerThread>
333  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
334  void up(const size_t& flat_id,
335  T (&input)[ItemsPerThread],
336  T (&prev)[ItemsPerThread])
337  {
338  ROCPRIM_SHARED_MEMORY storage_type storage;
339  this->up(flat_id, input, prev, storage);
340  }
341 
350  template <unsigned int ItemsPerThread>
351  ROCPRIM_DEVICE ROCPRIM_INLINE
352  void up(const size_t& flat_id,
353  T (&input)[ItemsPerThread],
354  T (&prev)[ItemsPerThread],
355  storage_type& storage)
356  {
357  storage_type_& storage_ = storage.get();
358  storage_.prev[flat_id] = input[ItemsPerThread -1];
359 
361 
362  ROCPRIM_UNROLL
363  for (unsigned int i = ItemsPerThread - 1; i > 0; --i)
364  {
365  prev[i] = input[i - 1];
366  }
367 
368  if (flat_id > 0)
369  {
370  prev[0] = storage_.prev[flat_id - 1];
371  }
372  }
373 
374 
383  template <unsigned int ItemsPerThread>
384  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
385  void up(T (&input)[ItemsPerThread],
386  T (&prev)[ItemsPerThread],
387  T &block_suffix)
388  {
389  this->up(
390  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
391  input, prev, block_suffix
392  );
393  }
394 
404  template <unsigned int ItemsPerThread>
405  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
406  void up(const size_t& flat_id,
407  T (&input)[ItemsPerThread],
408  T (&prev)[ItemsPerThread],
409  T &block_suffix)
410  {
411  ROCPRIM_SHARED_MEMORY storage_type storage;
412  this->up(flat_id, input, prev, block_suffix, storage);
413  }
414 
425  template <int ItemsPerThread>
426  ROCPRIM_DEVICE ROCPRIM_INLINE
427  void up(const size_t& flat_id,
428  T (&input)[ItemsPerThread],
429  T (&prev)[ItemsPerThread],
430  T &block_suffix,
431  storage_type& storage)
432  {
433  up(flat_id, input, prev, storage);
434 
435  // Update block prefix
436  block_suffix = storage->prev[BlockSize - 1];
437  }
438 
462  template <unsigned int ItemsPerThread>
463  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
464  void down(T (&input)[ItemsPerThread],
465  T (&next)[ItemsPerThread])
466  {
467  this->down(
468  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
469  input, next
470  );
471  }
472 
480  template <unsigned int ItemsPerThread>
481  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
482  void down(const size_t& flat_id,
483  T (&input)[ItemsPerThread],
484  T (&next)[ItemsPerThread])
485  {
486  ROCPRIM_SHARED_MEMORY storage_type storage;
487  this->down(flat_id, input, next, storage);
488  }
489 
498  template <unsigned int ItemsPerThread>
499  ROCPRIM_DEVICE ROCPRIM_INLINE
500  void down(const size_t& flat_id,
501  T (&input)[ItemsPerThread],
502  T (&next)[ItemsPerThread],
503  storage_type& storage)
504  {
505  storage_type_& storage_ = storage.get();
506  storage_.next[flat_id] = input[0];
507 
509 
510  ROCPRIM_UNROLL
511  for (unsigned int i = 0; i < (ItemsPerThread - 1); ++i)
512  {
513  next[i] = input[i + 1];
514  }
515 
516  if (flat_id <(BlockSize -1))
517  {
518  next[ItemsPerThread -1] = storage_.next[flat_id + 1];
519  }
520  }
521 
529  template <unsigned int ItemsPerThread>
530  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
531  void down(T (&input)[ItemsPerThread],
532  T (&next)[ItemsPerThread],
533  T &block_prefix)
534  {
535  this->down(
536  ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>(),
537  input, next, block_prefix
538  );
539  }
540 
549  template <unsigned int ItemsPerThread>
550  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
551  void down(const size_t& flat_id,
552  T (&input)[ItemsPerThread],
553  T (&next)[ItemsPerThread],
554  T &block_prefix)
555  {
556  ROCPRIM_SHARED_MEMORY storage_type storage;
557  this->down(flat_id, input, next, block_prefix, storage);
558  }
559 
569  template <unsigned int ItemsPerThread>
570  ROCPRIM_DEVICE ROCPRIM_INLINE
571  void down(const size_t& flat_id,
572  T (&input)[ItemsPerThread],
573  T (&next)[ItemsPerThread],
574  T &block_prefix,
575  storage_type& storage)
576  {
577  this->down(flat_id, input, next, storage);
578 
579  // Update block prefixstorage_->
580  block_prefix = storage->next[0];
581  }
582 };
583 
584 
585 END_ROCPRIM_NAMESPACE
586 
588 // end of group blockmodule
589 
590 #endif // ROCPRIM_BLOCK_BLOCK_SHUFFLE_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage.
Definition: block_shuffle.hpp:352
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:551
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(const size_t &flat_id, T input, T &output, int distance)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:168
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:482
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:252
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:334
ROCPRIM_DEVICE ROCPRIM_INLINE void offset(const size_t &flat_id, T input, T &output, int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:188
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:531
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:385
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], T &block_prefix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:571
ROCPRIM_DEVICE ROCPRIM_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix, storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it up by one item, using temporary storage.
Definition: block_shuffle.hpp:427
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void offset(T input, T &output, int distance=1)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:148
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:316
The block_shuffle class is a block level parallel primitive which provides methods for shuffling data...
Definition: block_shuffle.hpp:95
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void down(T(&input)[ItemsPerThread], T(&next)[ItemsPerThread])
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:464
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void up(const size_t &flat_id, T(&input)[ItemsPerThread], T(&prev)[ItemsPerThread], T &block_suffix)
The thread block rotates a blocked arrange of input items, shifting it up by one item.
Definition: block_shuffle.hpp:406
ROCPRIM_DEVICE ROCPRIM_INLINE void down(const size_t &flat_id, T(&input)[ItemsPerThread], T(&next)[ItemsPerThread], storage_type &storage)
The thread block rotates a blocked arrange of input items, shifting it down by one item...
Definition: block_shuffle.hpp:500
ROCPRIM_DEVICE ROCPRIM_INLINE void rotate(const size_t &flat_id, T input, T &output, unsigned int distance, storage_type &storage)
Shuffles data across threads in a block, offseted by the distance value, using temporary storage...
Definition: block_shuffle.hpp:272
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void rotate(T input, T &output, unsigned int distance=1)
Shuffles data across threads in a block, offseted by the distance value.
Definition: block_shuffle.hpp:232