rocPRIM
device_config_helper.hpp
1 // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_
22 #define ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../block/block_load.hpp"
30 #include "../../block/block_reduce.hpp"
31 #include "../../block/block_scan.hpp"
32 #include "../../block/block_store.hpp"
33 
34 #include "../config_types.hpp"
35 #include "rocprim/block/block_radix_rank.hpp"
36 #include "rocprim/block/block_sort.hpp"
37 
40 
41 BEGIN_ROCPRIM_NAMESPACE
42 
43 namespace detail
44 {
45 
48 {
49  kernel_config_params block_sort_config = {0, 0};
51 };
52 
53 // Necessary to construct a parameterized type of `merge_sort_block_sort_config_params`.
54 // Used in passing to host-side sub-algorithms and GPU kernels so non-default parameters can be available during compile-time.
55 template<unsigned int BlockSize, unsigned int ItemsPerThread, rocprim::block_sort_algorithm Algo>
56 struct merge_sort_block_sort_config : rocprim::detail::merge_sort_block_sort_config_params
57 {
60  : rocprim::detail::merge_sort_block_sort_config_params{sort_config(), Algo} {};
61 };
62 
// Picks how many items each thread handles for a given item size metric.
// Callers in this file pass the larger of sizeof(Key) and sizeof(Value) as `item_scale`.
//   item_scale <= 4   -> 8 items per thread
//   item_scale <= 64  -> 4 items per thread
//   otherwise         -> 2 items per thread
constexpr unsigned int merge_sort_items_per_thread(const unsigned int item_scale)
{
    if(item_scale <= 4)
    {
        return 8;
    }
    else if(item_scale <= 64)
    {
        return 4;
    }
    return 2;
}
// Picks the base block size for a given item size metric.
// Callers in this file pass the larger of sizeof(Key) and sizeof(Value) as `item_scale`
// and may multiply the result further (e.g. by 2 or 4).
//   item_scale <= 32   -> 128
//   item_scale <= 128  -> 64
//   otherwise          -> 32
constexpr unsigned int merge_sort_block_size(const unsigned int item_scale)
{
    if(item_scale <= 32)
    {
        return 128;
    }
    else if(item_scale <= 128)
    {
        return 64;
    }
    return 32;
}
87 
88 // Calculate kernel configurations, such that it will not exceed shared memory maximum
89 template<class Key, class Value>
91 {
92  static constexpr unsigned int item_scale = ::rocprim::max(sizeof(Key), sizeof(Value));
93  // multiply by 2 to ensure block_sort's items_per_block >= block_merge's items_per_block
94  static constexpr unsigned int block_size = merge_sort_block_size(item_scale) * 2;
95  static constexpr unsigned int items_per_thread = merge_sort_items_per_thread(item_scale);
97  items_per_thread,
99 };
100 
101 // Calculate kernel configurations, such that it will not exceed shared memory maximum
102 // No radix_sort_block_sort_params and radix_sort_block_sort_config exist since the only
103 // configuration member is a kernel_config.
104 template<class Key, class Value>
106 {
107  static constexpr unsigned int item_scale = ::rocprim::max(sizeof(Key), sizeof(Value));
108 
109  // multiply by 2 to ensure block_sort's items_per_block >= block_merge's items_per_block
110  static constexpr unsigned int block_size = merge_sort_block_size(item_scale) * 2;
111  static constexpr unsigned int items_per_thread
112  = rocprim::min(4u, merge_sort_items_per_thread(item_scale));
114 
115  // The items per block should be a power of two, as this is a requirement for the
116  // radix sort merge sort.
117  static_assert(is_power_of_two(block_size * items_per_thread),
118  "Sorted items per block should be a power of two.");
119 };
120 
123 {
124  kernel_config_params merge_oddeven_config = {0, 0, 0};
125  kernel_config_params merge_mergepath_partition_config = {0, 0};
126  kernel_config_params merge_mergepath_config = {0, 0};
127 };
128 
129 // Necessary to construct a parameterized type of `merge_sort_block_merge_config_params`.
130 // Used in passing to host-side sub-algorithms and GPU kernels so non-default parameters can be available during compile-time.
131 template<unsigned int OddEvenBlockSize = 256,
132  unsigned int OddEvenItemsPerThread = 1,
133  unsigned int OddEvenSizeLimit = (1 << 17) + 70000,
134  unsigned int PartitionBlockSize = 128,
135  unsigned int MergePathBlockSize = 128,
136  unsigned int MergePathItemsPerThread = 4>
137 struct merge_sort_block_merge_config : rocprim::detail::merge_sort_block_merge_config_params
138 {
140  : rocprim::detail::merge_sort_block_merge_config_params{
141  {OddEvenBlockSize, OddEvenItemsPerThread, OddEvenSizeLimit},
142  {PartitionBlockSize, 1},
143  {MergePathBlockSize, MergePathItemsPerThread}
144  } {};
145 };
146 
147 template<class Key, class Value>
149 {
150  static constexpr unsigned int item_scale = ::rocprim::max(sizeof(Key), sizeof(Value));
151 
152  static constexpr unsigned int block_size = merge_sort_block_size(item_scale);
153  static constexpr unsigned int items_per_thread = merge_sort_items_per_thread(item_scale);
155  1,
156  (1 << 17) + 70000,
157  128,
158  block_size,
159  items_per_thread>;
160 };
161 
164 {
166  kernel_config_params sort = {0, 0};
167 
169  unsigned int radix_bits_per_place = 1;
170 
173 };
174 
175 } // namespace detail
176 
183 template<class HistogramConfig = kernel_config<256, 12>,
184  class SortConfig = kernel_config<256, 12>,
185  unsigned int RadixBits = 4,
186  block_radix_rank_algorithm RadixRankAlgorithm
187  = block_radix_rank_algorithm::default_algorithm>
189 {
190 #ifndef DOXYGEN_SHOULD_SKIP_THIS
191  using histogram = HistogramConfig;
194  using sort = SortConfig;
195 
196  constexpr radix_sort_onesweep_config()
197  : radix_sort_onesweep_config_params{
198  {HistogramConfig::block_size, HistogramConfig::items_per_thread},
199  { SortConfig::block_size, SortConfig::items_per_thread},
200  RadixBits,
201  RadixRankAlgorithm,
202  } {};
203 #endif
204 };
205 
206 namespace detail
207 {
208 
209 // Calculate kernel configurations, such that it will not exceed shared memory maximum
210 template<class Key, class Value>
212 {
213  static constexpr unsigned int item_scale = ::rocprim::max(sizeof(Key), sizeof(Value));
214 
215  static constexpr unsigned int block_size = merge_sort_block_size(item_scale) * 4;
218  kernel_config<block_size, ::rocprim::max(1u, 65000u / block_size / item_scale)>,
219  4>;
220 };
221 
223 {
225  block_reduce_algorithm block_reduce_method;
226 };
227 
228 } // namespace detail
229 
236 template<unsigned int BlockSize = 256,
237  unsigned int ItemsPerThread = 8,
238  ::rocprim::block_reduce_algorithm BlockReduceMethod
239  = ::rocprim::block_reduce_algorithm::default_algorithm,
240  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
241 struct reduce_config : rocprim::detail::reduce_config_params
242 {
243  constexpr reduce_config()
244  : rocprim::detail::reduce_config_params{
245  {BlockSize, ItemsPerThread, SizeLimit},
246  BlockReduceMethod
247  } {};
248 };
249 
250 namespace detail
251 {
252 
253 template<class Value>
255 {
256  static constexpr unsigned int item_scale
257  = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
258 
260  ::rocprim::max(1u, 16u / item_scale),
261  ::rocprim::block_reduce_algorithm::using_warp_reduce>;
262 };
263 
264 template<class Value>
266 {};
267 
271 {
275  ::rocprim::block_scan_algorithm block_scan_method{};
276 };
277 
278 } // namespace detail
279 
288 template<unsigned int BlockSize,
289  unsigned int ItemsPerThread,
290  ::rocprim::block_load_method BlockLoadMethod,
291  ::rocprim::block_store_method BlockStoreMethod,
292  ::rocprim::block_scan_algorithm BlockScanMethod,
293  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
294 struct scan_config_v2 : ::rocprim::detail::scan_config_params
295 {
296 #ifndef DOXYGEN_SHOULD_SKIP_THIS
297  // Requirement dictated by init_lookback_scan_state_kernel.
298  static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
299  "Block size should at most be ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.");
300 
302  static constexpr unsigned int block_size = BlockSize;
304  static constexpr unsigned int items_per_thread = ItemsPerThread;
310  static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
312  static constexpr unsigned int size_limit = SizeLimit;
313 
314  constexpr scan_config_v2()
315  : ::rocprim::detail::scan_config_params{
316  {BlockSize, ItemsPerThread, SizeLimit},
317  BlockLoadMethod,
318  BlockStoreMethod,
319  BlockScanMethod
320  } {};
321 #endif
322 };
323 
333 template<unsigned int BlockSize,
334  unsigned int ItemsPerThread,
335  bool UseLookback,
336  ::rocprim::block_load_method BlockLoadMethod,
337  ::rocprim::block_store_method BlockStoreMethod,
338  ::rocprim::block_scan_algorithm BlockScanMethod,
339  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
340 struct
341 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition
342 [[deprecated("The UseLookback switch has been removed, as scan now only supports the "
343  "lookback-scan implementation. Use scan_config_v2 instead.")]]
344 #endif
345 scan_config : ::rocprim::detail::scan_config_params
346 {
348  static constexpr unsigned int block_size = BlockSize;
350  static constexpr unsigned int items_per_thread = ItemsPerThread;
352  static constexpr bool use_lookback = UseLookback;
358  static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
360  static constexpr unsigned int size_limit = SizeLimit;
361 
362  constexpr scan_config()
363  : ::rocprim::detail::scan_config_params{
364  {BlockSize, ItemsPerThread, SizeLimit},
365  BlockLoadMethod,
366  BlockStoreMethod,
367  BlockScanMethod
368  } {};
369 };
370 
371 namespace detail
372 {
373 
374 template<class Value>
376 {
377  static constexpr unsigned int item_scale
378  = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
379 
381  ::rocprim::max(1u, 16u / item_scale),
382  ::rocprim::block_load_method::block_load_transpose,
383  ::rocprim::block_store_method::block_store_transpose,
384  ::rocprim::block_scan_algorithm::using_warp_scan>;
385 };
386 
387 template<class Value>
389 {};
390 
394 {
398  ::rocprim::block_scan_algorithm block_scan_method;
399 };
400 
401 } // namespace detail
402 
411 template<unsigned int BlockSize,
412  unsigned int ItemsPerThread,
413  ::rocprim::block_load_method BlockLoadMethod,
414  ::rocprim::block_store_method BlockStoreMethod,
415  ::rocprim::block_scan_algorithm BlockScanMethod,
416  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
417 struct scan_by_key_config_v2 : ::rocprim::detail::scan_by_key_config_params
418 {
419 #ifndef DOXYGEN_SHOULD_SKIP_THIS
420  // Requirement dictated by init_lookback_scan_state_kernel.
421  static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
422  "Block size should at most be ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.");
423 
425  static constexpr unsigned int block_size = BlockSize;
427  static constexpr unsigned int items_per_thread = ItemsPerThread;
433  static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
435  static constexpr unsigned int size_limit = SizeLimit;
436 
437  constexpr scan_by_key_config_v2()
438  : ::rocprim::detail::scan_by_key_config_params{
439  {BlockSize, ItemsPerThread, SizeLimit},
440  BlockLoadMethod,
441  BlockStoreMethod,
442  BlockScanMethod
443  } {};
444 #endif
445 };
446 
456 template<unsigned int BlockSize,
457  unsigned int ItemsPerThread,
458  bool UseLookback,
459  ::rocprim::block_load_method BlockLoadMethod,
460  ::rocprim::block_store_method BlockStoreMethod,
461  ::rocprim::block_scan_algorithm BlockScanMethod,
462  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
463 struct
464 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition
465 [[deprecated(
466  "The UseLookback switch has been removed, as scan now only supports the lookback-scan "
467  "implementation. Use scan_by_key_config_v2 instead.")]]
468 #endif
469 scan_by_key_config : ::rocprim::detail::scan_by_key_config_params
470 {
472  static constexpr unsigned int block_size = BlockSize;
474  static constexpr unsigned int items_per_thread = ItemsPerThread;
476  static constexpr bool use_lookback = UseLookback;
482  static constexpr ::rocprim::block_scan_algorithm block_scan_method = BlockScanMethod;
484  static constexpr unsigned int size_limit = SizeLimit;
485 
486  constexpr scan_by_key_config()
487  : ::rocprim::detail::scan_by_key_config_params{
488  {BlockSize, ItemsPerThread, SizeLimit},
489  BlockLoadMethod,
490  BlockStoreMethod,
491  BlockScanMethod
492  } {};
493 };
494 
495 namespace detail
496 {
497 
498 template<class Key, class Value>
500 {
501  static constexpr unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
502  sizeof(Key) + sizeof(Value), 2 * sizeof(int));
503 
504  using type = scan_by_key_config_v2<
506  ::rocprim::max(1u, 16u / item_scale),
507  ::rocprim::block_load_method::block_load_transpose,
508  ::rocprim::block_store_method::block_store_transpose,
509  ::rocprim::block_scan_algorithm::using_warp_scan>;
510 };
511 
512 template<class Key, class Value>
514 {};
515 
517 {
519 };
520 
521 } // namespace detail
522 
527 template<unsigned int BlockSize,
528  unsigned int ItemsPerThread,
529  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
531 {
532 #ifndef DOXYGEN_SHOULD_SKIP_THIS
533 
535  static constexpr unsigned int block_size = BlockSize;
536 
538  static constexpr unsigned int items_per_thread = ItemsPerThread;
539 
541  static constexpr unsigned int size_limit = SizeLimit;
542 
543 #endif
544 };
545 
546 namespace detail
547 {
548 
549 template<class Value>
551 {
552  static constexpr unsigned int item_scale
553  = ::rocprim::detail::ceiling_div<unsigned int>(sizeof(Value), sizeof(int));
554 
555  using type = transform_config<256, ::rocprim::max(1u, 16u / item_scale)>;
556 };
557 
558 template<class Value>
560 {};
561 
562 } // namespace detail
563 
568 template<unsigned int BlockSize,
569  unsigned int ItemsPerThread,
570  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
571 struct binary_search_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
572 {};
573 
578 template<unsigned int BlockSize,
579  unsigned int ItemsPerThread,
580  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
581 struct upper_bound_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
582 {};
583 
588 template<unsigned int BlockSize,
589  unsigned int ItemsPerThread,
590  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
591 struct lower_bound_config : transform_config<BlockSize, ItemsPerThread, SizeLimit>
592 {};
593 
594 namespace detail
595 {
596 
597 template<class Value, class Output>
600  limit_block_size<256U, sizeof(Value) + sizeof(Output), ROCPRIM_WARP_SIZE_64>::value,
601  1>
602 {};
603 
608 {
610 
611  unsigned int max_grid_size = 0;
612  unsigned int shared_impl_max_bins = 0;
613  unsigned int shared_impl_histograms = 0;
614 };
615 
616 } // namespace detail
617 
627 template<class HistogramConfig,
628  unsigned int MaxGridSize = 1024,
629  unsigned int SharedImplMaxBins = 2048,
630  unsigned int SharedImplHistograms = 3>
632 {
633 #ifndef DOXYGEN_SHOULD_SKIP_THIS
634  using histogram = HistogramConfig;
635 
636  static constexpr unsigned int max_grid_size = MaxGridSize;
637  static constexpr unsigned int shared_impl_max_bins = SharedImplMaxBins;
638  static constexpr unsigned int shared_impl_histograms = SharedImplHistograms;
639 
640  constexpr histogram_config()
642  HistogramConfig{}, MaxGridSize, SharedImplMaxBins, SharedImplHistograms} {};
643 #endif
644 };
645 
646 namespace detail
647 {
648 
649 template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
651 {
652  static constexpr unsigned int item_scale
653  = ::rocprim::detail::ceiling_div(sizeof(Sample), sizeof(int));
654 
655  using type
656  = histogram_config<kernel_config<256, ::rocprim::max(8u / Channels / item_scale, 1u)>>;
657 };
658 
659 template<class Sample, unsigned int Channels, unsigned int ActiveChannels>
661  : default_histogram_config_base_helper<Sample, Channels, ActiveChannels>::type
662 {};
663 
664 } // namespace detail
665 
666 END_ROCPRIM_NAMESPACE
667 
669 // end of group primitivesmodule_deviceconfigs
670 
671 #endif //ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_
block_radix_rank_algorithm
Available algorithms for the block_radix_rank primitive.
Definition: block_radix_rank.hpp:40
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
Configuration of device-level scan-by-key operation.
Definition: device_config_helper.hpp:417
Definition: device_config_helper.hpp:550
Definition: device_config_helper.hpp:650
A merged sort based algorithm which sorts stably.
Definition: device_config_helper.hpp:148
Definition: config_types.hpp:65
block_store_method
block_store_method enumerates the methods available to store a striped arrangement of items into a bl...
Definition: block_store.hpp:41
Configuration of device-level histogram operation.
Definition: device_config_helper.hpp:631
Definition: config_types.hpp:109
Definition: device_config_helper.hpp:513
Configuration of device-level reduce primitives.
Definition: device_config_helper.hpp:241
Definition: device_config_helper.hpp:499
block_reduce_algorithm
Available algorithms for block_reduce primitive.
Definition: block_reduce.hpp:42
Definition: device_config_helper.hpp:222
Definition: test_utils_custom_float_type.hpp:110
The default radix ranking algorithm.
Configuration for the device-level lower bound operation.
Definition: device_config_helper.hpp:591
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
Definition: device_config_helper.hpp:56
Definition: device_config_helper.hpp:137
Configuration for the device-level transform operation.
Definition: device_config_helper.hpp:530
Definition: device_config_helper.hpp:265
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Configuration of subalgorithm Onesweep.
Definition: device_config_helper.hpp:188
block_load_method
block_load_method enumerates the methods available to load data from continuous memory into a blocked...
Definition: block_load.hpp:41
Definition: device_config_helper.hpp:105
Definition: device_config_helper.hpp:516
Configuration for the device-level binary search operation.
Definition: device_config_helper.hpp:571
Default values are provided by merge_sort_block_merge_config_base.
Definition: device_config_helper.hpp:122
Definition: device_config_helper.hpp:660
Default values are provided by merge_sort_block_sort_config_base.
Definition: device_config_helper.hpp:47
Definition: device_config_helper.hpp:559
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268
Provides the kernel parameters for exclusive_scan and inclusive_scan based on autotuned configuration...
Definition: device_config_helper.hpp:270
Definition: device_config_helper.hpp:598
Definition: device_config_helper.hpp:388
Definition: device_config_helper.hpp:90
Configuration of device-level scan primitives.
Definition: device_config_helper.hpp:294
Provides the kernel parameters for histogram_even, multi_histogram_even, histogram_range, and multi_histogram_range based on autotuned configurations or user-provided configurations.
Definition: device_config_helper.hpp:607
Definition: benchmark_block_histogram.cpp:64
Configuration for the device-level upper bound operation.
Definition: device_config_helper.hpp:581
Definition: device_config_helper.hpp:375
Definition: device_config_helper.hpp:254
Provides the kernel parameters for exclusive_scan_by_key and inclusive_scan_by_key based on autotuned...
Definition: device_config_helper.hpp:393
block_sort_algorithm
Available algorithms for block_sort primitive.
Definition: block_sort.hpp:41
block_scan_algorithm
Available algorithms for block_scan primitive.
Definition: block_scan.hpp:41
Configuration of particular kernels launched by device-level operation.
Definition: config_types.hpp:84
Definition: device_config_helper.hpp:211
Default values are provided by radix_sort_onesweep_config_base.
Definition: device_config_helper.hpp:163
struct [[deprecated("use radix_sort_config_v2")]] radix_sort_config
Legacy configuration of device-level radix sort operation.
Definition: device_radix_sort_config.hpp:95