rocPRIM
block_store.hpp
1 // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_BLOCK_STORE_HPP_
22 #define ROCPRIM_BLOCK_BLOCK_STORE_HPP_
23 
24 #include "../config.hpp"
25 #include "../detail/various.hpp"
26 
27 #include "../intrinsics.hpp"
28 #include "../functional.hpp"
29 #include "../types.hpp"
30 
31 #include "block_store_func.hpp"
32 #include "block_exchange.hpp"
33 
36 
37 BEGIN_ROCPRIM_NAMESPACE
38 
42 {
49 
53 
68 
77 
88 
91 };
92 
126 template<
127  class T,
128  unsigned int BlockSizeX,
129  unsigned int ItemsPerThread,
131  unsigned int BlockSizeY = 1,
132  unsigned int BlockSizeZ = 1
133 >
135 {
136 private:
137  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
138 
139 public:
148  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
149  using storage_type = typename ::rocprim::detail::empty_storage_type;
150  #else
151  using storage_type = storage_type_; // only for Doxygen
152  #endif
153 
166  template<class OutputIterator>
167  ROCPRIM_DEVICE ROCPRIM_INLINE
168  void store(OutputIterator block_output,
169  T (&items)[ItemsPerThread])
170  {
171  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
172  block_store_direct_blocked(flat_id, block_output, items);
173  }
174 
188  template<class OutputIterator>
189  ROCPRIM_DEVICE ROCPRIM_INLINE
190  void store(OutputIterator block_output,
191  T (&items)[ItemsPerThread],
192  unsigned int valid)
193  {
194  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
195  block_store_direct_blocked(flat_id, block_output, items, valid);
196  }
197 
228  template<class OutputIterator>
229  ROCPRIM_DEVICE ROCPRIM_INLINE
230  void store(OutputIterator block_output,
231  T (&items)[ItemsPerThread],
232  storage_type& storage)
233  {
234  (void) storage;
235  store(block_output, items);
236  }
237 
270  template<class OutputIterator>
271  ROCPRIM_DEVICE ROCPRIM_INLINE
272  void store(OutputIterator block_output,
273  T (&items)[ItemsPerThread],
274  unsigned int valid,
275  storage_type& storage)
276  {
277  (void) storage;
278  store(block_output, items, valid);
279  }
280 };
281 
282 #ifndef DOXYGEN_SHOULD_SKIP_THIS
283 
284 template<
285  class T,
286  unsigned int BlockSizeX,
287  unsigned int ItemsPerThread,
288  unsigned int BlockSizeY,
289  unsigned int BlockSizeZ
290  >
291 class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_striped, BlockSizeY, BlockSizeZ>
292 {
293  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
294 private:
295  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
296 
297 public:
298  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
299  using storage_type = typename ::rocprim::detail::empty_storage_type;
300  #else
301  using storage_type = storage_type_; // only for Doxygen
302  #endif
303 
304  template<class OutputIterator>
305  ROCPRIM_DEVICE inline
306  void store(OutputIterator block_output,
307  T (&items)[ItemsPerThread])
308  {
309  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
310  block_store_direct_striped<BlockSize>(flat_id, block_output, items);
311  }
312 
313  template<class OutputIterator>
314  ROCPRIM_DEVICE inline
315  void store(OutputIterator block_output,
316  T (&items)[ItemsPerThread],
317  unsigned int valid)
318  {
319  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
320  block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
321  }
322 
323  template<class OutputIterator>
324  ROCPRIM_DEVICE inline
325  void store(OutputIterator block_output,
326  T (&items)[ItemsPerThread],
327  storage_type& storage)
328  {
329  (void) storage;
330  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
331  block_store_direct_striped<BlockSize>(flat_id, block_output, items);
332  }
333 
334  template<class OutputIterator>
335  ROCPRIM_DEVICE inline
336  void store(OutputIterator block_output,
337  T (&items)[ItemsPerThread],
338  unsigned int valid,
339  storage_type& storage)
340  {
341  (void) storage;
342  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
343  block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
344  }
345 };
346 
347 template<
348  class T,
349  unsigned int BlockSizeX,
350  unsigned int ItemsPerThread,
351  unsigned int BlockSizeY,
352  unsigned int BlockSizeZ
353 >
354 class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_vectorize, BlockSizeY, BlockSizeZ>
355 {
356 private:
357  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
358 
359 public:
360  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
361  using storage_type = typename ::rocprim::detail::empty_storage_type;
362  #else
363  using storage_type = storage_type_; // only for Doxygen
364  #endif
365 
366  ROCPRIM_DEVICE ROCPRIM_INLINE
367  void store(T* block_output,
368  T (&_items)[ItemsPerThread])
369  {
370  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
371  block_store_direct_blocked_vectorized(flat_id, block_output, _items);
372  }
373 
374  template<class OutputIterator, class U>
375  ROCPRIM_DEVICE ROCPRIM_INLINE
376  void store(OutputIterator block_output,
377  U (&items)[ItemsPerThread])
378  {
379  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
380  block_store_direct_blocked(flat_id, block_output, items);
381  }
382 
383  template<class OutputIterator>
384  ROCPRIM_DEVICE ROCPRIM_INLINE
385  void store(OutputIterator block_output,
386  T (&items)[ItemsPerThread],
387  unsigned int valid)
388  {
389  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
390  block_store_direct_blocked(flat_id, block_output, items, valid);
391  }
392 
393  ROCPRIM_DEVICE ROCPRIM_INLINE
394  void store(T* block_output,
395  T (&items)[ItemsPerThread],
396  storage_type& storage)
397  {
398  (void) storage;
399  store(block_output, items);
400  }
401 
402  template<class OutputIterator, class U>
403  ROCPRIM_DEVICE ROCPRIM_INLINE
404  void store(OutputIterator block_output,
405  U (&items)[ItemsPerThread],
406  storage_type& storage)
407  {
408  (void) storage;
409  store(block_output, items);
410  }
411 
412  template<class OutputIterator>
413  ROCPRIM_DEVICE ROCPRIM_INLINE
414  void store(OutputIterator block_output,
415  T (&items)[ItemsPerThread],
416  unsigned int valid,
417  storage_type& storage)
418  {
419  (void) storage;
420  store(block_output, items, valid);
421  }
422 };
423 
424 template<
425  class T,
426  unsigned int BlockSizeX,
427  unsigned int ItemsPerThread,
428  unsigned int BlockSizeY,
429  unsigned int BlockSizeZ
430 >
431 class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_transpose, BlockSizeY, BlockSizeZ>
432 {
433  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
434 private:
436 
437 public:
438  using storage_type = typename block_exchange_type::storage_type;
439 
440  template<class OutputIterator>
441  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
442  void store(OutputIterator block_output,
443  T (&items)[ItemsPerThread])
444  {
445  ROCPRIM_SHARED_MEMORY storage_type storage;
446  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
447  block_exchange_type().blocked_to_striped(items, items, storage);
448  block_store_direct_striped<BlockSize>(flat_id, block_output, items);
449  }
450 
451  template<class OutputIterator>
452  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
453  void store(OutputIterator block_output,
454  T (&items)[ItemsPerThread],
455  unsigned int valid)
456  {
457  ROCPRIM_SHARED_MEMORY storage_type storage;
458  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
459  block_exchange_type().blocked_to_striped(items, items, storage);
460  block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
461  }
462 
463  template<class OutputIterator>
464  ROCPRIM_DEVICE ROCPRIM_INLINE
465  void store(OutputIterator block_output,
466  T (&items)[ItemsPerThread],
467  storage_type& storage)
468  {
469  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
470  block_exchange_type().blocked_to_striped(items, items, storage);
471  block_store_direct_striped<BlockSize>(flat_id, block_output, items);
472  }
473 
474  template<class OutputIterator>
475  ROCPRIM_DEVICE ROCPRIM_INLINE
476  void store(OutputIterator block_output,
477  T (&items)[ItemsPerThread],
478  unsigned int valid,
479  storage_type& storage)
480  {
481  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
482  block_exchange_type().blocked_to_striped(items, items, storage);
483  block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
484  }
485 };
486 
487 template<
488  class T,
489  unsigned int BlockSizeX,
490  unsigned int ItemsPerThread,
491  unsigned int BlockSizeY,
492  unsigned int BlockSizeZ
493 >
494 class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store_warp_transpose, BlockSizeY, BlockSizeZ>
495 {
496  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
497 private:
499 
500 public:
501  static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
502  "BlockSize must be a multiple of hardware warpsize");
503 
504  using storage_type = typename block_exchange_type::storage_type;
505 
506  template<class OutputIterator>
507  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
508  void store(OutputIterator block_output,
509  T (&items)[ItemsPerThread])
510  {
511  ROCPRIM_SHARED_MEMORY storage_type storage;
512  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
513  block_exchange_type().blocked_to_warp_striped(items, items, storage);
514  block_store_direct_warp_striped(flat_id, block_output, items);
515  }
516 
517  template<class OutputIterator>
518  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
519  void store(OutputIterator block_output,
520  T (&items)[ItemsPerThread],
521  unsigned int valid)
522  {
523  ROCPRIM_SHARED_MEMORY storage_type storage;
524  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
525  block_exchange_type().blocked_to_warp_striped(items, items, storage);
526  block_store_direct_warp_striped(flat_id, block_output, items, valid);
527  }
528 
529  template<class OutputIterator>
530  ROCPRIM_DEVICE ROCPRIM_INLINE
531  void store(OutputIterator block_output,
532  T (&items)[ItemsPerThread],
533  storage_type& storage)
534  {
535  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
536  block_exchange_type().blocked_to_warp_striped(items, items, storage);
537  block_store_direct_warp_striped(flat_id, block_output, items);
538  }
539 
540  template<class OutputIterator>
541  ROCPRIM_DEVICE ROCPRIM_INLINE
542  void store(OutputIterator block_output,
543  T (&items)[ItemsPerThread],
544  unsigned int valid,
545  storage_type& storage)
546  {
547  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
548  block_exchange_type().blocked_to_warp_striped(items, items, storage);
549  block_store_direct_warp_striped(flat_id, block_output, items, valid);
550  }
551 };
552 
553 #endif // DOXYGEN_SHOULD_SKIP_THIS
554 
555 END_ROCPRIM_NAMESPACE
556 
558 // end of group blockmodule
559 
560 #endif // ROCPRIM_BLOCK_BLOCK_STORE_HPP_
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:262
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread])
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:168
A blocked arrangement of items is locally transposed and stored as a warp-striped arrangement of data...
block_store_method
block_store_method enumerates the methods available to store a striped arrangement of items into a bl...
Definition: block_store.hpp:41
A blocked arrangement of items is stored into a blocked arrangement on continuous memory using vector...
The block_store class is a block level parallel primitive which provides methods for storing an arran...
Definition: block_store.hpp:134
ROCPRIM_DEVICE ROCPRIM_INLINE auto block_store_direct_blocked_vectorized(unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) -> typename std::enable_if< detail::is_vectorizable< T, ItemsPerThread >::value >::type
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on cont...
Definition: block_store_func.hpp:151
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid, storage_type &storage)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:272
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
The block_exchange class is a block level parallel primitive which provides methods for rearranging i...
Definition: block_exchange.hpp:81
Defaults to block_store_direct.
A striped arrangement of items is stored into a blocked arrangement on continuous memory...
A blocked arrangement of items is stored into a blocked arrangement on continuous memory...
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:190
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], storage_type &storage)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:230
typename ::rocprim::detail::empty_storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: block_store.hpp:149
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread])
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on cont...
Definition: block_store_func.hpp:58
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread])
Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on...
Definition: block_store_func.hpp:306
A blocked arrangement of items is locally transposed and stored as a striped arrangement of data on c...