rocPRIM
block_scan.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
22 #define ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
23 
24 #include <type_traits>
25 
26 #include "../config.hpp"
27 #include "../detail/various.hpp"
28 
29 #include "../intrinsics.hpp"
30 #include "../functional.hpp"
31 
32 #include "detail/block_scan_warp_scan.hpp"
33 #include "detail/block_scan_reduce_then_scan.hpp"
34 
37 
38 BEGIN_ROCPRIM_NAMESPACE
39 
42 {
49 };
50 
51 namespace detail
52 {
53 
54 // Selector for block_scan algorithm which gives block scan implementation
55 // type based on passed block_scan_algorithm enum
56 template<block_scan_algorithm Algorithm>
58 
59 template<>
61 {
62  template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
64 };
65 
66 template<>
68 {
69  template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
70  // When the block size does not exceed the hardware warp size, block_scan_warp_scan performs
71  // better than block_scan_reduce_then_scan because it specializes for warps.
72  using type = typename std::conditional<
73  (BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::device_warp_size()),
76  >::type;
77 };
78 
79 } // end namespace detail
80 
127 template<
128  class T,
129  unsigned int BlockSizeX,
131  unsigned int BlockSizeY = 1,
132  unsigned int BlockSizeZ = 1
133 >
135 #ifndef DOXYGEN_SHOULD_SKIP_THIS
136  : private detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
137 #endif
138 {
139  using base_type = typename detail::select_block_scan_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
140 public:
149  using storage_type = typename base_type::storage_type;
150 
197  template<class BinaryFunction = ::rocprim::plus<T>>
198  ROCPRIM_DEVICE ROCPRIM_INLINE
199  void inclusive_scan(T input,
200  T& output,
201  storage_type& storage,
202  BinaryFunction scan_op = BinaryFunction())
203  {
204  base_type::inclusive_scan(input, output, storage, scan_op);
205  }
206 
222  template<class BinaryFunction = ::rocprim::plus<T>>
223  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
224  void inclusive_scan(T input,
225  T& output,
226  BinaryFunction scan_op = BinaryFunction())
227  {
228  base_type::inclusive_scan(input, output, scan_op);
229  }
230 
281  template<class BinaryFunction = ::rocprim::plus<T>>
282  ROCPRIM_DEVICE ROCPRIM_INLINE
283  void inclusive_scan(T input,
284  T& output,
285  T& reduction,
286  storage_type& storage,
287  BinaryFunction scan_op = BinaryFunction())
288  {
289  base_type::inclusive_scan(input, output, reduction, storage, scan_op);
290  }
291 
308  template<class BinaryFunction = ::rocprim::plus<T>>
309  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
310  void inclusive_scan(T input,
311  T& output,
312  T& reduction,
313  BinaryFunction scan_op = BinaryFunction())
314  {
315  base_type::inclusive_scan(input, output, reduction, scan_op);
316  }
317 
394  template<
395  class PrefixCallback,
396  class BinaryFunction = ::rocprim::plus<T>
397  >
398  ROCPRIM_DEVICE ROCPRIM_INLINE
399  void inclusive_scan(T input,
400  T& output,
401  storage_type& storage,
402  PrefixCallback& prefix_callback_op,
403  BinaryFunction scan_op)
404  {
405  base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
406  }
407 
455  template<
456  unsigned int ItemsPerThread,
457  class BinaryFunction = ::rocprim::plus<T>
458  >
459  ROCPRIM_DEVICE ROCPRIM_INLINE
460  void inclusive_scan(T (&input)[ItemsPerThread],
461  T (&output)[ItemsPerThread],
462  storage_type& storage,
463  BinaryFunction scan_op = BinaryFunction())
464  {
465  if(ItemsPerThread == 1)
466  {
467  base_type::inclusive_scan(input[0], output[0], storage, scan_op);
468  }
469  else
470  {
471  base_type::inclusive_scan(input, output, storage, scan_op);
472  }
473  }
474 
491  template<
492  unsigned int ItemsPerThread,
493  class BinaryFunction = ::rocprim::plus<T>
494  >
495  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
496  void inclusive_scan(T (&input)[ItemsPerThread],
497  T (&output)[ItemsPerThread],
498  BinaryFunction scan_op = BinaryFunction())
499  {
500  if(ItemsPerThread == 1)
501  {
502  base_type::inclusive_scan(input[0], output[0], scan_op);
503  }
504  else
505  {
506  base_type::inclusive_scan(input, output, scan_op);
507  }
508  }
509 
560  template<
561  unsigned int ItemsPerThread,
562  class BinaryFunction = ::rocprim::plus<T>
563  >
564  ROCPRIM_DEVICE ROCPRIM_INLINE
565  void inclusive_scan(T (&input)[ItemsPerThread],
566  T (&output)[ItemsPerThread],
567  T& reduction,
568  storage_type& storage,
569  BinaryFunction scan_op = BinaryFunction())
570  {
571  if(ItemsPerThread == 1)
572  {
573  base_type::inclusive_scan(input[0], output[0], reduction, storage, scan_op);
574  }
575  else
576  {
577  base_type::inclusive_scan(input, output, reduction, storage, scan_op);
578  }
579  }
580 
598  template<
599  unsigned int ItemsPerThread,
600  class BinaryFunction = ::rocprim::plus<T>
601  >
602  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
603  void inclusive_scan(T (&input)[ItemsPerThread],
604  T (&output)[ItemsPerThread],
605  T& reduction,
606  BinaryFunction scan_op = BinaryFunction())
607  {
608  if(ItemsPerThread == 1)
609  {
610  base_type::inclusive_scan(input[0], output[0], reduction, scan_op);
611  }
612  else
613  {
614  base_type::inclusive_scan(input, output, reduction, scan_op);
615  }
616  }
617 
695  template<
696  unsigned int ItemsPerThread,
697  class PrefixCallback,
698  class BinaryFunction
699  >
700  ROCPRIM_DEVICE ROCPRIM_INLINE
701  void inclusive_scan(T (&input)[ItemsPerThread],
702  T (&output)[ItemsPerThread],
703  storage_type& storage,
704  PrefixCallback& prefix_callback_op,
705  BinaryFunction scan_op)
706  {
707  if(ItemsPerThread == 1)
708  {
709  base_type::inclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
710  }
711  else
712  {
713  base_type::inclusive_scan(input, output, storage, prefix_callback_op, scan_op);
714  }
715  }
716 
767  template<class BinaryFunction = ::rocprim::plus<T>>
768  ROCPRIM_DEVICE ROCPRIM_INLINE
769  void exclusive_scan(T input,
770  T& output,
771  T init,
772  storage_type& storage,
773  BinaryFunction scan_op = BinaryFunction())
774  {
775  base_type::exclusive_scan(input, output, init, storage, scan_op);
776  }
777 
795  template<class BinaryFunction = ::rocprim::plus<T>>
796  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
797  void exclusive_scan(T input,
798  T& output,
799  T init,
800  BinaryFunction scan_op = BinaryFunction())
801  {
802  base_type::exclusive_scan(input, output, init, scan_op);
803  }
804 
859  template<class BinaryFunction = ::rocprim::plus<T>>
860  ROCPRIM_DEVICE ROCPRIM_INLINE
861  void exclusive_scan(T input,
862  T& output,
863  T init,
864  T& reduction,
865  storage_type& storage,
866  BinaryFunction scan_op = BinaryFunction())
867  {
868  base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
869  }
870 
889  template<class BinaryFunction = ::rocprim::plus<T>>
890  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
891  void exclusive_scan(T input,
892  T& output,
893  T init,
894  T& reduction,
895  BinaryFunction scan_op = BinaryFunction())
896  {
897  base_type::exclusive_scan(input, output, init, reduction, scan_op);
898  }
899 
976  template<
977  class PrefixCallback,
978  class BinaryFunction = ::rocprim::plus<T>
979  >
980  ROCPRIM_DEVICE ROCPRIM_INLINE
981  void exclusive_scan(T input,
982  T& output,
983  storage_type& storage,
984  PrefixCallback& prefix_callback_op,
985  BinaryFunction scan_op)
986  {
987  base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
988  }
989 
1041  template<
1042  unsigned int ItemsPerThread,
1043  class BinaryFunction = ::rocprim::plus<T>
1044  >
1045  ROCPRIM_DEVICE ROCPRIM_INLINE
1046  void exclusive_scan(T (&input)[ItemsPerThread],
1047  T (&output)[ItemsPerThread],
1048  T init,
1049  storage_type& storage,
1050  BinaryFunction scan_op = BinaryFunction())
1051  {
1052  if(ItemsPerThread == 1)
1053  {
1054  base_type::exclusive_scan(input[0], output[0], init, storage, scan_op);
1055  }
1056  else
1057  {
1058  base_type::exclusive_scan(input, output, init, storage, scan_op);
1059  }
1060  }
1061 
1080  template<
1081  unsigned int ItemsPerThread,
1082  class BinaryFunction = ::rocprim::plus<T>
1083  >
1084  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
1085  void exclusive_scan(T (&input)[ItemsPerThread],
1086  T (&output)[ItemsPerThread],
1087  T init,
1088  BinaryFunction scan_op = BinaryFunction())
1089  {
1090  if(ItemsPerThread == 1)
1091  {
1092  base_type::exclusive_scan(input[0], output[0], init, scan_op);
1093  }
1094  else
1095  {
1096  base_type::exclusive_scan(input, output, init, scan_op);
1097  }
1098  }
1099 
1155  template<
1156  unsigned int ItemsPerThread,
1157  class BinaryFunction = ::rocprim::plus<T>
1158  >
1159  ROCPRIM_DEVICE ROCPRIM_INLINE
1160  void exclusive_scan(T (&input)[ItemsPerThread],
1161  T (&output)[ItemsPerThread],
1162  T init,
1163  T& reduction,
1164  storage_type& storage,
1165  BinaryFunction scan_op = BinaryFunction())
1166  {
1167  if(ItemsPerThread == 1)
1168  {
1169  base_type::exclusive_scan(input[0], output[0], init, reduction, storage, scan_op);
1170  }
1171  else
1172  {
1173  base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
1174  }
1175  }
1176 
1196  template<
1197  unsigned int ItemsPerThread,
1198  class BinaryFunction = ::rocprim::plus<T>
1199  >
1200  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
1201  void exclusive_scan(T (&input)[ItemsPerThread],
1202  T (&output)[ItemsPerThread],
1203  T init,
1204  T& reduction,
1205  BinaryFunction scan_op = BinaryFunction())
1206  {
1207  if(ItemsPerThread == 1)
1208  {
1209  base_type::exclusive_scan(input[0], output[0], init, reduction, scan_op);
1210  }
1211  else
1212  {
1213  base_type::exclusive_scan(input, output, init, reduction, scan_op);
1214  }
1215  }
1216 
1294  template<
1295  unsigned int ItemsPerThread,
1296  class PrefixCallback,
1297  class BinaryFunction
1298  >
1299  ROCPRIM_DEVICE ROCPRIM_INLINE
1300  void exclusive_scan(T (&input)[ItemsPerThread],
1301  T (&output)[ItemsPerThread],
1302  storage_type& storage,
1303  PrefixCallback& prefix_callback_op,
1304  BinaryFunction scan_op)
1305  {
1306  if(ItemsPerThread == 1)
1307  {
1308  base_type::exclusive_scan(input[0], output[0], storage, prefix_callback_op, scan_op);
1309  }
1310  else
1311  {
1312  base_type::exclusive_scan(input, output, storage, prefix_callback_op, scan_op);
1313  }
1314  }
1315 };
1316 
1317 END_ROCPRIM_NAMESPACE
1318 
1320 // end of group blockmodule
1321 
1322 #endif // ROCPRIM_BLOCK_BLOCK_SCAN_HPP_
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void inclusive_scan(T input, T &output, T &reduction, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:310
An algorithm which limits calculations to a single hardware warp.
typename base_type::storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.
Definition: block_scan.hpp:149
Default block_scan algorithm.
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs inclusive scan and reduction across threads in a block.
Definition: block_scan.hpp:565
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void exclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T init, T &reduction, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:1201
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T input, T &output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs exclusive scan across threads in a block.
Definition: block_scan.hpp:769
Definition: block_scan_warp_scan.hpp:45
The block_scan class is a block level parallel primitive which provides methods for performing inclusive and exclusive scan operations of items partitioned across threads in a block.
Definition: block_scan.hpp:134
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void inclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T &reduction, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:603
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
Definition: block_scan_reduce_then_scan.hpp:45
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void exclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T init, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:1085
Definition: block_scan.hpp:57
A warp_scan based algorithm.
hipError_t exclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel exclusive scan primitive for device level.
Definition: device_scan.hpp:651
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T input, T &output, storage_type &storage, PrefixCallback &prefix_callback_op, BinaryFunction scan_op)
Performs exclusive scan across threads in a block, and uses prefix_callback_op to generate prefix value for the whole block.
Definition: block_scan.hpp:981
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void exclusive_scan(T input, T &output, T init, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:797
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs exclusive scan across threads in a block.
Definition: block_scan.hpp:1046
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void inclusive_scan(T input, T &output, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:224
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T input, T &output, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs inclusive scan across threads in a block.
Definition: block_scan.hpp:199
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T input, T &output, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs inclusive scan and reduction across threads in a block.
Definition: block_scan.hpp:283
hipError_t inclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel inclusive scan primitive for device level.
Definition: device_scan.hpp:539
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T input, T &output, storage_type &storage, PrefixCallback &prefix_callback_op, BinaryFunction scan_op)
Performs inclusive scan across threads in a block, and uses prefix_callback_op to generate prefix value for the whole block.
Definition: block_scan.hpp:399
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void exclusive_scan(T input, T &output, T init, T &reduction, BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:891
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], storage_type &storage, PrefixCallback &prefix_callback_op, BinaryFunction scan_op)
Performs exclusive scan across threads in a block, and uses prefix_callback_op to generate prefix value for the whole block.
Definition: block_scan.hpp:1300
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T input, T &output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs exclusive scan and reduction across threads in a block.
Definition: block_scan.hpp:861
ROCPRIM_DEVICE ROCPRIM_INLINE void exclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs exclusive scan and reduction across threads in a block.
Definition: block_scan.hpp:1160
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], storage_type &storage, BinaryFunction scan_op=BinaryFunction())
Performs inclusive scan across threads in a block.
Definition: block_scan.hpp:460
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void inclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], BinaryFunction scan_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_scan.hpp:496
Default block_histogram algorithm.
block_scan_algorithm
Available algorithms for block_scan primitive.
Definition: block_scan.hpp:41
ROCPRIM_DEVICE ROCPRIM_INLINE void inclusive_scan(T(&input)[ItemsPerThread], T(&output)[ItemsPerThread], storage_type &storage, PrefixCallback &prefix_callback_op, BinaryFunction scan_op)
Performs inclusive scan across threads in a block, and uses prefix_callback_op to generate prefix value for the whole block.
Definition: block_scan.hpp:701