21 #ifndef ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_ 22 #define ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_ 24 #include <type_traits> 26 #include "../../config.hpp" 27 #include "../../detail/various.hpp" 29 #include "../../block/block_load.hpp" 30 #include "../../block/block_reduce.hpp" 31 #include "../../block/block_scan.hpp" 32 #include "../../block/block_store.hpp" 34 #include "../config_types.hpp" 35 #include "rocprim/block/block_radix_rank.hpp" 36 #include "rocprim/block/block_sort.hpp" 41 BEGIN_ROCPRIM_NAMESPACE
55 template<
unsigned int BlockSize,
unsigned int ItemsPerThread, rocprim::block_sort_algorithm Algo>
60 : rocprim::detail::merge_sort_block_sort_config_params{
sort_config(), Algo} {};
63 constexpr
unsigned int merge_sort_items_per_thread(
const unsigned int item_scale)
69 else if(item_scale <= 64)
75 constexpr
unsigned int merge_sort_block_size(
const unsigned int item_scale)
81 else if(item_scale <= 128)
89 template<
class Key,
class Value>
92 static constexpr
unsigned int item_scale =
::rocprim::max(
sizeof(Key),
sizeof(Value));
94 static constexpr
unsigned int block_size = merge_sort_block_size(item_scale) * 2;
95 static constexpr
unsigned int items_per_thread = merge_sort_items_per_thread(item_scale);
104 template<
class Key,
class Value>
107 static constexpr
unsigned int item_scale =
::rocprim::max(
sizeof(Key),
sizeof(Value));
110 static constexpr
unsigned int block_size = merge_sort_block_size(item_scale) * 2;
111 static constexpr
unsigned int items_per_thread
112 =
rocprim::min(4u, merge_sort_items_per_thread(item_scale));
117 static_assert(is_power_of_two(block_size * items_per_thread),
118 "Sorted items per block should be a power of two.");
131 template<
unsigned int OddEvenBlockSize = 256,
132 unsigned int OddEvenItemsPerThread = 1,
133 unsigned int OddEvenSizeLimit = (1 << 17) + 70000,
134 unsigned int PartitionBlockSize = 128,
135 unsigned int MergePathBlockSize = 128,
136 unsigned int MergePathItemsPerThread = 4>
140 : rocprim::detail::merge_sort_block_merge_config_params{
141 {OddEvenBlockSize, OddEvenItemsPerThread, OddEvenSizeLimit},
142 {PartitionBlockSize, 1},
143 {MergePathBlockSize, MergePathItemsPerThread}
147 template<
class Key,
class Value>
150 static constexpr
unsigned int item_scale =
::rocprim::max(
sizeof(Key),
sizeof(Value));
152 static constexpr
unsigned int block_size = merge_sort_block_size(item_scale);
153 static constexpr
unsigned int items_per_thread = merge_sort_items_per_thread(item_scale);
169 unsigned int radix_bits_per_place = 1;
183 template<
class HistogramConfig = kernel_config<256, 12>,
184 class SortConfig = kernel_config<256, 12>,
185 unsigned int RadixBits = 4,
186 block_radix_rank_algorithm RadixRankAlgorithm
187 = block_radix_rank_algorithm::default_algorithm>
190 #ifndef DOXYGEN_SHOULD_SKIP_THIS 197 : radix_sort_onesweep_config_params{
198 {HistogramConfig::block_size, HistogramConfig::items_per_thread},
199 { SortConfig::block_size, SortConfig::items_per_thread},
210 template<
class Key,
class Value>
213 static constexpr
unsigned int item_scale =
::rocprim::max(
sizeof(Key),
sizeof(Value));
215 static constexpr
unsigned int block_size = merge_sort_block_size(item_scale) * 4;
236 template<
unsigned int BlockSize = 256,
237 unsigned int ItemsPerThread = 8,
239 = ::rocprim::block_reduce_algorithm::default_algorithm,
240 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
244 : rocprim::detail::reduce_config_params{
245 {BlockSize, ItemsPerThread, SizeLimit},
253 template<
class Value>
256 static constexpr
unsigned int item_scale
257 = ::rocprim::detail::ceiling_div<unsigned int>(
sizeof(Value),
sizeof(
int));
261 ::rocprim::block_reduce_algorithm::using_warp_reduce>;
264 template<
class Value>
288 template<
unsigned int BlockSize,
289 unsigned int ItemsPerThread,
293 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
296 #ifndef DOXYGEN_SHOULD_SKIP_THIS 298 static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
299 "Block size should at most be ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.");
304 static constexpr
unsigned int items_per_thread = ItemsPerThread;
312 static constexpr
unsigned int size_limit = SizeLimit;
316 {BlockSize, ItemsPerThread, SizeLimit},
333 template<
unsigned int BlockSize,
334 unsigned int ItemsPerThread,
339 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
341 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition 342 [[
deprecated(
"The UseLookback switch has been removed, as scan now only supports the " 343 "lookback-scan implementation. Use scan_config_v2 instead.")]]
345 scan_config : ::rocprim::detail::scan_config_params
348 static constexpr
unsigned int block_size = BlockSize;
350 static constexpr
unsigned int items_per_thread = ItemsPerThread;
352 static constexpr
bool use_lookback = UseLookback;
360 static constexpr
unsigned int size_limit = SizeLimit;
362 constexpr scan_config()
363 : ::rocprim::detail::scan_config_params{
364 {BlockSize, ItemsPerThread, SizeLimit},
374 template<
class Value>
377 static constexpr
unsigned int item_scale
378 = ::rocprim::detail::ceiling_div<unsigned int>(
sizeof(Value),
sizeof(
int));
382 ::rocprim::block_load_method::block_load_transpose,
383 ::rocprim::block_store_method::block_store_transpose,
384 ::rocprim::block_scan_algorithm::using_warp_scan>;
387 template<
class Value>
411 template<
unsigned int BlockSize,
412 unsigned int ItemsPerThread,
416 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
419 #ifndef DOXYGEN_SHOULD_SKIP_THIS 421 static_assert(BlockSize <= ROCPRIM_DEFAULT_MAX_BLOCK_SIZE,
422 "Block size should at most be ROCPRIM_DEFAULT_MAX_BLOCK_SIZE.");
427 static constexpr
unsigned int items_per_thread = ItemsPerThread;
435 static constexpr
unsigned int size_limit = SizeLimit;
439 {BlockSize, ItemsPerThread, SizeLimit},
456 template<
unsigned int BlockSize,
457 unsigned int ItemsPerThread,
462 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
464 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Doxygen seems to have trouble with the syntax used in this definition 466 "The UseLookback switch has been removed, as scan now only supports the lookback-scan " 467 "implementation. Use scan_by_key_config_v2 instead.")]]
469 scan_by_key_config : ::rocprim::detail::scan_by_key_config_params
472 static constexpr
unsigned int block_size = BlockSize;
474 static constexpr
unsigned int items_per_thread = ItemsPerThread;
476 static constexpr
bool use_lookback = UseLookback;
484 static constexpr
unsigned int size_limit = SizeLimit;
486 constexpr scan_by_key_config()
487 : ::rocprim::detail::scan_by_key_config_params{
488 {BlockSize, ItemsPerThread, SizeLimit},
498 template<
class Key,
class Value>
501 static constexpr
unsigned int item_scale = ::rocprim::detail::ceiling_div<unsigned int>(
502 sizeof(Key) +
sizeof(Value), 2 *
sizeof(int));
507 ::rocprim::block_load_method::block_load_transpose,
508 ::rocprim::block_store_method::block_store_transpose,
509 ::rocprim::block_scan_algorithm::using_warp_scan>;
512 template<
class Key,
class Value>
527 template<
unsigned int BlockSize,
528 unsigned int ItemsPerThread,
529 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
532 #ifndef DOXYGEN_SHOULD_SKIP_THIS 538 static constexpr
unsigned int items_per_thread = ItemsPerThread;
541 static constexpr
unsigned int size_limit = SizeLimit;
549 template<
class Value>
552 static constexpr
unsigned int item_scale
553 = ::rocprim::detail::ceiling_div<unsigned int>(
sizeof(Value),
sizeof(
int));
558 template<
class Value>
568 template<
unsigned int BlockSize,
569 unsigned int ItemsPerThread,
570 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
578 template<
unsigned int BlockSize,
579 unsigned int ItemsPerThread,
580 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
588 template<
unsigned int BlockSize,
589 unsigned int ItemsPerThread,
590 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
597 template<
class Value,
class Output>
600 limit_block_size<256U, sizeof(Value) + sizeof(Output), ROCPRIM_WARP_SIZE_64>::value,
611 unsigned int max_grid_size = 0;
612 unsigned int shared_impl_max_bins = 0;
613 unsigned int shared_impl_histograms = 0;
627 template<
class HistogramConfig,
628 unsigned int MaxGridSize = 1024,
629 unsigned int SharedImplMaxBins = 2048,
630 unsigned int SharedImplHistograms = 3>
633 #ifndef DOXYGEN_SHOULD_SKIP_THIS 636 static constexpr
unsigned int max_grid_size = MaxGridSize;
637 static constexpr
unsigned int shared_impl_max_bins = SharedImplMaxBins;
638 static constexpr
unsigned int shared_impl_histograms = SharedImplHistograms;
642 HistogramConfig{}, MaxGridSize, SharedImplMaxBins, SharedImplHistograms} {};
649 template<
class Sample,
unsigned int Channels,
unsigned int ActiveChannels>
652 static constexpr
unsigned int item_scale
653 = ::rocprim::detail::ceiling_div(
sizeof(Sample),
sizeof(
int));
659 template<
class Sample,
unsigned int Channels,
unsigned int ActiveChannels>
666 END_ROCPRIM_NAMESPACE
671 #endif //ROCPRIM_DEVICE_DETAIL_CONFIG_HELPER_HPP_
block_radix_rank_algorithm
Available algorithms for the block_radix_rank primitive.
Definition: block_radix_rank.hpp:40
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
Configuration of device-level scan-by-key operation.
Definition: device_config_helper.hpp:417
Definition: device_config_helper.hpp:650
A merged sort based algorithm which sorts stably.
Definition: device_config_helper.hpp:148
Definition: config_types.hpp:65
block_store_method
block_store_method enumerates the methods available to store a striped arrangement of items into a bl...
Definition: block_store.hpp:41
Configuration of device-level histogram operation.
Definition: device_config_helper.hpp:631
Definition: config_types.hpp:109
Definition: device_config_helper.hpp:513
Configuration of device-level reduce primitives.
Definition: device_config_helper.hpp:241
Definition: device_config_helper.hpp:499
block_reduce_algorithm
Available algorithms for block_reduce primitive.
Definition: block_reduce.hpp:42
Definition: device_config_helper.hpp:222
Definition: test_utils_custom_float_type.hpp:110
The default radix ranking algorithm.
Configuration for the device-level lower bound operation.
Definition: device_config_helper.hpp:591
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
Definition: device_config_helper.hpp:56
Definition: device_config_helper.hpp:137
Definition: device_config_helper.hpp:265
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Configuration of subalgorithm Onesweep.
Definition: device_config_helper.hpp:188
block_load_method
block_load_method enumerates the methods available to load data from continuous memory into a blocked...
Definition: block_load.hpp:41
Definition: device_config_helper.hpp:105
Configuration for the device-level binary search operation.
Definition: device_config_helper.hpp:571
Default values are provided by merge_sort_block_merge_config_base.
Definition: device_config_helper.hpp:122
Definition: device_config_helper.hpp:660
Default values are provided by merge_sort_block_sort_config_base.
Definition: device_config_helper.hpp:47
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268
Provides the kernel parameters for exclusive_scan and inclusive_scan based on autotuned configuration...
Definition: device_config_helper.hpp:270
Definition: device_config_helper.hpp:598
Definition: device_config_helper.hpp:388
Definition: device_config_helper.hpp:90
Configuration of device-level scan primitives.
Definition: device_config_helper.hpp:294
Provides the kernel parameters for histogram_even, multi_histogram_even, histogram_range, and multi_histogram_range based on autotuned configurations or user-provided configurations.
Definition: device_config_helper.hpp:607
Definition: benchmark_block_histogram.cpp:64
Configuration for the device-level upper bound operation.
Definition: device_config_helper.hpp:581
Definition: device_config_helper.hpp:375
Definition: device_config_helper.hpp:254
Provides the kernel parameters for exclusive_scan_by_key and inclusive_scan_by_key based on autotuned...
Definition: device_config_helper.hpp:393
block_sort_algorithm
Available algorithms for block_sort primitive.
Definition: block_sort.hpp:41
block_scan_algorithm
Available algorithms for block_scan primitive.
Definition: block_scan.hpp:41
Configuration of particular kernels launched by device-level operation.
Definition: config_types.hpp:84
Definition: device_config_helper.hpp:211
Default values are provided by radix_sort_onesweep_config_base.
Definition: device_config_helper.hpp:163
struct [[deprecated("use radix_sort_config_v2")]] radix_sort_config
Legacy configuration of device-level radix sort operation.
Definition: device_radix_sort_config.hpp:95