21 #ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_ 22 #define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_ 24 #include "config_types.hpp" 26 #include "../block/block_load.hpp" 27 #include "../block/block_scan.hpp" 28 #include "../block/block_store.hpp" 30 #include "../config.hpp" 37 BEGIN_ROCPRIM_NAMESPACE
50 template<
unsigned int BlockSize,
51 unsigned int ItemsPerThread,
55 unsigned int TilesPerBlock = 1,
56 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
93 template<
class ScanConfig,
class ReduceConfig>
98 using scan = ScanConfig;
100 using reduce = ReduceConfig;
109 template<
typename Key,
typename Value>
112 static constexpr
unsigned int size_memory_per_item =
std::max(
sizeof(Key),
sizeof(Value));
114 static constexpr
unsigned int item_scale
115 =
static_cast<unsigned int>(ceiling_div(size_memory_per_item, 2 *
sizeof(
int)));
121 items_per_thread * size_memory_per_item,
122 ROCPRIM_WARP_SIZE_64>::value,
130 template<
unsigned int TargetArch,
class Key,
class Value>
132 : std::conditional_t<std::max(sizeof(Key), sizeof(Value)) <= 16,
133 rocprim::reduce_by_key_config_v2<256,
135 block_load_method::block_load_transpose,
136 block_load_method::block_load_transpose,
137 block_scan_algorithm::using_warp_scan,
138 sizeof(Value) < 16 ? 1 : 2>,
139 typename reduce_by_key::fallback_config<Key, Value>::type>
146 END_ROCPRIM_NAMESPACE
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
static constexpr block_load_method load_values_method
A rocprim::block_load_method enum value indicating how the values should be loaded.
Definition: device_reduce_by_key_config.hpp:74
Definition: config_types.hpp:109
static constexpr unsigned int tiles_per_block
Number of tiles (BlockSize * ItemsPerThread items) to process per block.
Definition: device_reduce_by_key_config.hpp:63
static constexpr unsigned int block_size
Number of threads in a block.
Definition: device_reduce_by_key_config.hpp:60
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
A warp_scan based algorithm.
Definition: benchmark_block_reduce.cpp:63
Definition: device_reduce_by_key_config.hpp:110
static constexpr block_scan_algorithm scan_algorithm
A rocprim::block_scan_algorithm enum value indicating how the reduction should be done...
Definition: device_reduce_by_key_config.hpp:78
struct [[deprecated("use reduce_by_key_config_v2")]] reduce_by_key_config
Legacy configuration of device-level reduce-by-key operation.
Definition: device_reduce_by_key_config.hpp:94
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
block_load_method
block_load_method enumerates the methods available to load data from continuous memory into a blocked...
Definition: block_load.hpp:41
A striped arrangement of data from continuous memory is locally transposed into a blocked arrangement...
static constexpr unsigned int items_per_thread
Number of items processed by each thread per tile.
Definition: device_reduce_by_key_config.hpp:66
hipError_t reduce_by_key(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, const size_t size, UniqueOutputIterator unique_output, AggregatesOutputIterator aggregates_output, UniqueCountOutputIterator unique_count_output, BinaryFunction reduce_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduce-by-key primitive for device level.
Definition: device_reduce_by_key.hpp:478
Configuration of device-level reduce-by-key operation.
Definition: device_reduce_by_key_config.hpp:57
static constexpr block_load_method load_keys_method
A rocprim::block_load_method enum value indicating how the keys should be loaded. ...
Definition: device_reduce_by_key_config.hpp:70
static constexpr unsigned int size_limit
Maximum possible number of values. Defaults to ROCPRIM_GRID_SIZE_LIMIT.
Definition: device_reduce_by_key_config.hpp:81
block_scan_algorithm
Available algorithms for block_scan primitive.
Definition: block_scan.hpp:41