rocPRIM
block_load.hpp
1 // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
22 #define ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
23 
24 #include "../config.hpp"
25 #include "../detail/various.hpp"
26 
27 #include "../intrinsics.hpp"
28 #include "../functional.hpp"
29 #include "../types.hpp"
30 
31 #include "block_load_func.hpp"
32 #include "block_exchange.hpp"
33 
36 
37 BEGIN_ROCPRIM_NAMESPACE
38 
42 {
48 
51 
66 
75 
86 
89 };
90 
125 template<
126  class T,
127  unsigned int BlockSizeX,
128  unsigned int ItemsPerThread,
130  unsigned int BlockSizeY = 1,
131  unsigned int BlockSizeZ = 1
132 >
134 {
135 private:
136  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
137 
138 public:
147  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
148  using storage_type = typename ::rocprim::detail::empty_storage_type;
149  #else
150  using storage_type = storage_type_; // only for Doxygen
151  #endif
152 
165  template<class InputIterator>
166  ROCPRIM_DEVICE ROCPRIM_INLINE
167  void load(InputIterator block_input,
168  T (&items)[ItemsPerThread])
169  {
170  using value_type = typename std::iterator_traits<InputIterator>::value_type;
171  static_assert(std::is_convertible<value_type, T>::value,
172  "The type T must be such that an object of type InputIterator "
173  "can be dereferenced and then implicitly converted to T.");
174  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
175  block_load_direct_blocked(flat_id, block_input, items);
176  }
177 
191  template<class InputIterator>
192  ROCPRIM_DEVICE ROCPRIM_INLINE
193  void load(InputIterator block_input,
194  T (&items)[ItemsPerThread],
195  unsigned int valid)
196  {
197  using value_type = typename std::iterator_traits<InputIterator>::value_type;
198  static_assert(std::is_convertible<value_type, T>::value,
199  "The type T must be such that an object of type InputIterator "
200  "can be dereferenced and then implicitly converted to T.");
201  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
202  block_load_direct_blocked(flat_id, block_input, items, valid);
203  }
204 
221  template<
222  class InputIterator,
223  class Default
224  >
225  ROCPRIM_DEVICE ROCPRIM_INLINE
226  void load(InputIterator block_input,
227  T (&items)[ItemsPerThread],
228  unsigned int valid,
229  Default out_of_bounds)
230  {
231  using value_type = typename std::iterator_traits<InputIterator>::value_type;
232  static_assert(std::is_convertible<value_type, T>::value,
233  "The type T must be such that an object of type InputIterator "
234  "can be dereferenced and then implicitly converted to T.");
235  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
236  block_load_direct_blocked(flat_id, block_input, items, valid,
237  out_of_bounds);
238  }
239 
270  template<class InputIterator>
271  ROCPRIM_DEVICE ROCPRIM_INLINE
272  void load(InputIterator block_input,
273  T (&items)[ItemsPerThread],
274  storage_type& storage)
275  {
276  using value_type = typename std::iterator_traits<InputIterator>::value_type;
277  static_assert(std::is_convertible<value_type, T>::value,
278  "The type T must be such that an object of type InputIterator "
279  "can be dereferenced and then implicitly converted to T.");
280  (void) storage;
281  load(block_input, items);
282  }
283 
315  template<class InputIterator>
316  ROCPRIM_DEVICE ROCPRIM_INLINE
317  void load(InputIterator block_input,
318  T (&items)[ItemsPerThread],
319  unsigned int valid,
320  storage_type& storage)
321  {
322  using value_type = typename std::iterator_traits<InputIterator>::value_type;
323  static_assert(std::is_convertible<value_type, T>::value,
324  "The type T must be such that an object of type InputIterator "
325  "can be dereferenced and then implicitly converted to T.");
326  (void) storage;
327  load(block_input, items, valid);
328  }
329 
364  template<
365  class InputIterator,
366  class Default
367  >
368  ROCPRIM_DEVICE ROCPRIM_INLINE
369  void load(InputIterator block_input,
370  T (&items)[ItemsPerThread],
371  unsigned int valid,
372  Default out_of_bounds,
373  storage_type& storage)
374  {
375  using value_type = typename std::iterator_traits<InputIterator>::value_type;
376  static_assert(std::is_convertible<value_type, T>::value,
377  "The type T must be such that an object of type InputIterator "
378  "can be dereferenced and then implicitly converted to T.");
379  (void) storage;
380  load(block_input, items, valid, out_of_bounds);
381  }
382 };
383 
384 #ifndef DOXYGEN_SHOULD_SKIP_THIS
385 
386 template<
387  class T,
388  unsigned int BlockSizeX,
389  unsigned int ItemsPerThread,
390  unsigned int BlockSizeY,
391  unsigned int BlockSizeZ
392  >
393 class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_striped, BlockSizeY, BlockSizeZ>
394 {
395  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
396 
397 private:
398  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
399 
400 public:
401  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
402  using storage_type = typename ::rocprim::detail::empty_storage_type;
403  #else
404  using storage_type = storage_type_; // only for Doxygen
405  #endif
406 
407  template<class InputIterator>
408  ROCPRIM_DEVICE inline
409  void load(InputIterator block_input,
410  T (&items)[ItemsPerThread])
411  {
412  using value_type = typename std::iterator_traits<InputIterator>::value_type;
413  static_assert(std::is_convertible<value_type, T>::value,
414  "The type T must be such that an object of type InputIterator "
415  "can be dereferenced and then implicitly converted to T.");
416  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
417  block_load_direct_striped<BlockSize>(flat_id, block_input, items);
418  }
419 
420  template<class InputIterator>
421  ROCPRIM_DEVICE inline
422  void load(InputIterator block_input,
423  T (&items)[ItemsPerThread],
424  unsigned int valid)
425  {
426  using value_type = typename std::iterator_traits<InputIterator>::value_type;
427  static_assert(std::is_convertible<value_type, T>::value,
428  "The type T must be such that an object of type InputIterator "
429  "can be dereferenced and then implicitly converted to T.");
430  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
431  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
432  }
433 
434  template<
435  class InputIterator,
436  class Default
437  >
438  ROCPRIM_DEVICE inline
439  void load(InputIterator block_input,
440  T (&items)[ItemsPerThread],
441  unsigned int valid,
442  Default out_of_bounds)
443  {
444  using value_type = typename std::iterator_traits<InputIterator>::value_type;
445  static_assert(std::is_convertible<value_type, T>::value,
446  "The type T must be such that an object of type InputIterator "
447  "can be dereferenced and then implicitly converted to T.");
448  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
449  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
450  out_of_bounds);
451  }
452 
453  template<class InputIterator>
454  ROCPRIM_DEVICE inline
455  void load(InputIterator block_input,
456  T (&items)[ItemsPerThread],
457  storage_type& storage)
458  {
459  using value_type = typename std::iterator_traits<InputIterator>::value_type;
460  static_assert(std::is_convertible<value_type, T>::value,
461  "The type T must be such that an object of type InputIterator "
462  "can be dereferenced and then implicitly converted to T.");
463  (void) storage;
464  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
465  block_load_direct_striped<BlockSize>(flat_id, block_input, items);
466  }
467 
468  template<class InputIterator>
469  ROCPRIM_DEVICE inline
470  void load(InputIterator block_input,
471  T (&items)[ItemsPerThread],
472  unsigned int valid,
473  storage_type& storage)
474  {
475  using value_type = typename std::iterator_traits<InputIterator>::value_type;
476  static_assert(std::is_convertible<value_type, T>::value,
477  "The type T must be such that an object of type InputIterator "
478  "can be dereferenced and then implicitly converted to T.");
479  (void) storage;
480  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
481  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
482  }
483 
484  template<
485  class InputIterator,
486  class Default
487  >
488  ROCPRIM_DEVICE inline
489  void load(InputIterator block_input,
490  T (&items)[ItemsPerThread],
491  unsigned int valid,
492  Default out_of_bounds,
493  storage_type& storage)
494  {
495  using value_type = typename std::iterator_traits<InputIterator>::value_type;
496  static_assert(std::is_convertible<value_type, T>::value,
497  "The type T must be such that an object of type InputIterator "
498  "can be dereferenced and then implicitly converted to T.");
499  (void) storage;
500  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
501  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
502  out_of_bounds);
503  }
504 };
505 
506 template<
507  class T,
508  unsigned int BlockSizeX,
509  unsigned int ItemsPerThread,
510  unsigned int BlockSizeY,
511  unsigned int BlockSizeZ
512 >
513 class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_vectorize, BlockSizeY, BlockSizeZ>
514 {
515 private:
516  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
517 
518 public:
519  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
520  using storage_type = typename ::rocprim::detail::empty_storage_type;
521  #else
522  using storage_type = storage_type_; // only for Doxygen
523  #endif
524 
525  ROCPRIM_DEVICE ROCPRIM_INLINE
526  void load(T* block_input,
527  T (&_items)[ItemsPerThread])
528  {
529  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
530  block_load_direct_blocked_vectorized(flat_id, block_input, _items);
531  }
532 
533  template<class InputIterator, class U>
534  ROCPRIM_DEVICE ROCPRIM_INLINE
535  void load(InputIterator block_input,
536  U (&items)[ItemsPerThread])
537  {
538  using value_type = typename std::iterator_traits<InputIterator>::value_type;
539  static_assert(std::is_convertible<value_type, T>::value,
540  "The type T must be such that an object of type InputIterator "
541  "can be dereferenced and then implicitly converted to T.");
542  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
543  block_load_direct_blocked(flat_id, block_input, items);
544  }
545 
546  template<class InputIterator>
547  ROCPRIM_DEVICE ROCPRIM_INLINE
548  void load(InputIterator block_input,
549  T (&items)[ItemsPerThread],
550  unsigned int valid)
551  {
552  using value_type = typename std::iterator_traits<InputIterator>::value_type;
553  static_assert(std::is_convertible<value_type, T>::value,
554  "The type T must be such that an object of type InputIterator "
555  "can be dereferenced and then implicitly converted to T.");
556  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
557  block_load_direct_blocked(flat_id, block_input, items, valid);
558  }
559 
560  template<
561  class InputIterator,
562  class Default
563  >
564  ROCPRIM_DEVICE ROCPRIM_INLINE
565  void load(InputIterator block_input,
566  T (&items)[ItemsPerThread],
567  unsigned int valid,
568  Default out_of_bounds)
569  {
570  using value_type = typename std::iterator_traits<InputIterator>::value_type;
571  static_assert(std::is_convertible<value_type, T>::value,
572  "The type T must be such that an object of type InputIterator "
573  "can be dereferenced and then implicitly converted to T.");
574  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
575  block_load_direct_blocked(flat_id, block_input, items, valid,
576  out_of_bounds);
577  }
578 
579  ROCPRIM_DEVICE ROCPRIM_INLINE
580  void load(T* block_input,
581  T (&items)[ItemsPerThread],
582  storage_type& storage)
583  {
584  (void) storage;
585  load(block_input, items);
586  }
587 
588  template<class InputIterator, class U>
589  ROCPRIM_DEVICE ROCPRIM_INLINE
590  void load(InputIterator block_input,
591  U (&items)[ItemsPerThread],
592  storage_type& storage)
593  {
594  using value_type = typename std::iterator_traits<InputIterator>::value_type;
595  static_assert(std::is_convertible<value_type, T>::value,
596  "The type T must be such that an object of type InputIterator "
597  "can be dereferenced and then implicitly converted to T.");
598  (void) storage;
599  load(block_input, items);
600  }
601 
602  template<class InputIterator>
603  ROCPRIM_DEVICE ROCPRIM_INLINE
604  void load(InputIterator block_input,
605  T (&items)[ItemsPerThread],
606  unsigned int valid,
607  storage_type& storage)
608  {
609  using value_type = typename std::iterator_traits<InputIterator>::value_type;
610  static_assert(std::is_convertible<value_type, T>::value,
611  "The type T must be such that an object of type InputIterator "
612  "can be dereferenced and then implicitly converted to T.");
613  (void) storage;
614  load(block_input, items, valid);
615  }
616 
617  template<
618  class InputIterator,
619  class Default
620  >
621  ROCPRIM_DEVICE ROCPRIM_INLINE
622  void load(InputIterator block_input,
623  T (&items)[ItemsPerThread],
624  unsigned int valid,
625  Default out_of_bounds,
626  storage_type& storage)
627  {
628  using value_type = typename std::iterator_traits<InputIterator>::value_type;
629  static_assert(std::is_convertible<value_type, T>::value,
630  "The type T must be such that an object of type InputIterator "
631  "can be dereferenced and then implicitly converted to T.");
632  (void) storage;
633  load(block_input, items, valid, out_of_bounds);
634  }
635 };
636 
637 template<
638  class T,
639  unsigned int BlockSizeX,
640  unsigned int ItemsPerThread,
641  unsigned int BlockSizeY,
642  unsigned int BlockSizeZ
643 >
644 class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_transpose, BlockSizeY, BlockSizeZ>
645 {
646  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
647 
648 private:
650 
651 public:
652  using storage_type = typename block_exchange_type::storage_type;
653 
654  template<class InputIterator>
655  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
656  void load(InputIterator block_input,
657  T (&items)[ItemsPerThread])
658  {
659  using value_type = typename std::iterator_traits<InputIterator>::value_type;
660  static_assert(std::is_convertible<value_type, T>::value,
661  "The type T must be such that an object of type InputIterator "
662  "can be dereferenced and then implicitly converted to T.");
663  ROCPRIM_SHARED_MEMORY storage_type storage;
664  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
665  block_load_direct_striped<BlockSize>(flat_id, block_input, items);
666  block_exchange_type().striped_to_blocked(items, items, storage);
667  }
668 
669  template<class InputIterator>
670  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
671  void load(InputIterator block_input,
672  T (&items)[ItemsPerThread],
673  unsigned int valid)
674  {
675  using value_type = typename std::iterator_traits<InputIterator>::value_type;
676  static_assert(std::is_convertible<value_type, T>::value,
677  "The type T must be such that an object of type InputIterator "
678  "can be dereferenced and then implicitly converted to T.");
679  ROCPRIM_SHARED_MEMORY storage_type storage;
680  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
681  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
682  block_exchange_type().striped_to_blocked(items, items, storage);
683  }
684 
685  template<
686  class InputIterator,
687  class Default
688  >
689  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
690  void load(InputIterator block_input,
691  T (&items)[ItemsPerThread],
692  unsigned int valid,
693  Default out_of_bounds)
694  {
695  using value_type = typename std::iterator_traits<InputIterator>::value_type;
696  static_assert(std::is_convertible<value_type, T>::value,
697  "The type T must be such that an object of type InputIterator "
698  "can be dereferenced and then implicitly converted to T.");
699  ROCPRIM_SHARED_MEMORY storage_type storage;
700  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
701  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
702  out_of_bounds);
703  block_exchange_type().striped_to_blocked(items, items, storage);
704  }
705 
706  template<class InputIterator>
707  ROCPRIM_DEVICE ROCPRIM_INLINE
708  void load(InputIterator block_input,
709  T (&items)[ItemsPerThread],
710  storage_type& storage)
711  {
712  using value_type = typename std::iterator_traits<InputIterator>::value_type;
713  static_assert(std::is_convertible<value_type, T>::value,
714  "The type T must be such that an object of type InputIterator "
715  "can be dereferenced and then implicitly converted to T.");
716  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
717  block_load_direct_striped<BlockSize>(flat_id, block_input, items);
718  block_exchange_type().striped_to_blocked(items, items, storage);
719  }
720 
721  template<class InputIterator>
722  ROCPRIM_DEVICE ROCPRIM_INLINE
723  void load(InputIterator block_input,
724  T (&items)[ItemsPerThread],
725  unsigned int valid,
726  storage_type& storage)
727  {
728  using value_type = typename std::iterator_traits<InputIterator>::value_type;
729  static_assert(std::is_convertible<value_type, T>::value,
730  "The type T must be such that an object of type InputIterator "
731  "can be dereferenced and then implicitly converted to T.");
732  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
733  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid);
734  block_exchange_type().striped_to_blocked(items, items, storage);
735  }
736 
737  template<
738  class InputIterator,
739  class Default
740  >
741  ROCPRIM_DEVICE ROCPRIM_INLINE
742  void load(InputIterator block_input,
743  T (&items)[ItemsPerThread],
744  unsigned int valid,
745  Default out_of_bounds,
746  storage_type& storage)
747  {
748  using value_type = typename std::iterator_traits<InputIterator>::value_type;
749  static_assert(std::is_convertible<value_type, T>::value,
750  "The type T must be such that an object of type InputIterator "
751  "can be dereferenced and then implicitly converted to T.");
752  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
753  block_load_direct_striped<BlockSize>(flat_id, block_input, items, valid,
754  out_of_bounds);
755  block_exchange_type().striped_to_blocked(items, items, storage);
756  }
757 };
758 
759 template<
760  class T,
761  unsigned int BlockSizeX,
762  unsigned int ItemsPerThread,
763  unsigned int BlockSizeY,
764  unsigned int BlockSizeZ
765 >
766 class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_warp_transpose, BlockSizeY, BlockSizeZ>
767 {
768  static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
769 private:
771 
772 public:
773  static_assert(BlockSize % ::rocprim::device_warp_size() == 0,
774  "BlockSize must be a multiple of hardware warpsize");
775 
776  using storage_type = typename block_exchange_type::storage_type;
777 
778  template<class InputIterator>
779  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
780  void load(InputIterator block_input,
781  T (&items)[ItemsPerThread])
782  {
783  using value_type = typename std::iterator_traits<InputIterator>::value_type;
784  static_assert(std::is_convertible<value_type, T>::value,
785  "The type T must be such that an object of type InputIterator "
786  "can be dereferenced and then implicitly converted to T.");
787  ROCPRIM_SHARED_MEMORY storage_type storage;
788  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
789  block_load_direct_warp_striped(flat_id, block_input, items);
790  block_exchange_type().warp_striped_to_blocked(items, items, storage);
791  }
792 
793  template<class InputIterator>
794  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
795  void load(InputIterator block_input,
796  T (&items)[ItemsPerThread],
797  unsigned int valid)
798  {
799  using value_type = typename std::iterator_traits<InputIterator>::value_type;
800  static_assert(std::is_convertible<value_type, T>::value,
801  "The type T must be such that an object of type InputIterator "
802  "can be dereferenced and then implicitly converted to T.");
803  ROCPRIM_SHARED_MEMORY storage_type storage;
804  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
805  block_load_direct_warp_striped(flat_id, block_input, items, valid);
806  block_exchange_type().warp_striped_to_blocked(items, items, storage);
807 
808  }
809 
810  template<
811  class InputIterator,
812  class Default
813  >
814  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
815  void load(InputIterator block_input,
816  T (&items)[ItemsPerThread],
817  unsigned int valid,
818  Default out_of_bounds)
819  {
820  using value_type = typename std::iterator_traits<InputIterator>::value_type;
821  static_assert(std::is_convertible<value_type, T>::value,
822  "The type T must be such that an object of type InputIterator "
823  "can be dereferenced and then implicitly converted to T.");
824  ROCPRIM_SHARED_MEMORY storage_type storage;
825  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
826  block_load_direct_warp_striped(flat_id, block_input, items, valid,
827  out_of_bounds);
828  block_exchange_type().warp_striped_to_blocked(items, items, storage);
829  }
830 
831  template<class InputIterator>
832  ROCPRIM_DEVICE ROCPRIM_INLINE
833  void load(InputIterator block_input,
834  T (&items)[ItemsPerThread],
835  storage_type& storage)
836  {
837  using value_type = typename std::iterator_traits<InputIterator>::value_type;
838  static_assert(std::is_convertible<value_type, T>::value,
839  "The type T must be such that an object of type InputIterator "
840  "can be dereferenced and then implicitly converted to T.");
841  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
842  block_load_direct_warp_striped(flat_id, block_input, items);
843  block_exchange_type().warp_striped_to_blocked(items, items, storage);
844  }
845 
846  template<class InputIterator>
847  ROCPRIM_DEVICE ROCPRIM_INLINE
848  void load(InputIterator block_input,
849  T (&items)[ItemsPerThread],
850  unsigned int valid,
851  storage_type& storage)
852  {
853  using value_type = typename std::iterator_traits<InputIterator>::value_type;
854  static_assert(std::is_convertible<value_type, T>::value,
855  "The type T must be such that an object of type InputIterator "
856  "can be dereferenced and then implicitly converted to T.");
857  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
858  block_load_direct_warp_striped(flat_id, block_input, items, valid);
859  block_exchange_type().warp_striped_to_blocked(items, items, storage);
860  }
861 
862  template<
863  class InputIterator,
864  class Default
865  >
866  ROCPRIM_DEVICE ROCPRIM_INLINE
867  void load(InputIterator block_input,
868  T (&items)[ItemsPerThread],
869  unsigned int valid,
870  Default out_of_bounds,
871  storage_type& storage)
872  {
873  using value_type = typename std::iterator_traits<InputIterator>::value_type;
874  static_assert(std::is_convertible<value_type, T>::value,
875  "The type T must be such that an object of type InputIterator "
876  "can be dereferenced and then implicitly converted to T.");
877  const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
878  block_load_direct_warp_striped(flat_id, block_input, items, valid,
879  out_of_bounds);
880  block_exchange_type().warp_striped_to_blocked(items, items, storage);
881  }
882 };
883 
884 #endif // DOXYGEN_SHOULD_SKIP_THIS
885 
886 END_ROCPRIM_NAMESPACE
887 
889 // end of group blockmodule
890 
891 #endif // ROCPRIM_BLOCK_BLOCK_LOAD_HPP_
Data from continuous memory is loaded into a blocked arrangement of items using vectorization as an o...
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, storage_type &storage)
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid, using temporary storage.
Definition: block_load.hpp:317
The block_exchange class is a block level parallel primitive which provides methods for rearranging i...
Definition: block_exchange.hpp:81
Defaults to block_load_direct.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:196
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from continuous memory into an arrangement of items across the thread block.
Definition: block_load.hpp:167
ROCPRIM_DEVICE ROCPRIM_INLINE auto block_load_direct_blocked_vectorized(unsigned int flat_id, T *block_input, U(&items)[ItemsPerThread]) -> typename std::enable_if< detail::is_vectorizable< T, ItemsPerThread >::value >::type
Loads data from continuous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:186
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid)
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid.
Definition: block_load.hpp:193
A striped arrangement of data is read directly from memory.
block_load_method
block_load_method enumerates the methods available to load data from continuous memory into a blocked...
Definition: block_load.hpp:41
typename ::rocprim::detail::empty_storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: block_load.hpp:148
A striped arrangement of data from continuous memory is locally transposed into a blocked arrangement...
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_warp_striped(unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from continuous memory into a warp-striped arrangement of items across the thread block...
Definition: block_load_func.hpp:378
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread], storage_type &storage)
Loads data from continuous memory into an arrangement of items across the thread block, using temporary storage.
Definition: block_load.hpp:272
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds, storage_type &storage)
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid with a fall-back value for out-of-bound elements, using temporary storage.
Definition: block_load.hpp:369
Data from continuous memory is loaded into a blocked arrangement of items.
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from continuous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:58
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds)
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid with a fall-back value for out-of-bound elements.
Definition: block_load.hpp:226
A warp-striped arrangement of data from continuous memory is locally transposed into a blocked arrang...
The block_load class is a block level parallel primitive which provides methods for loading data from...
Definition: block_load.hpp:133
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void warp_striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block...
Definition: block_exchange.hpp:333