rocPRIM
device_reduce_by_key_config.hpp
1 // Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
22 #define ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
23 
24 #include "config_types.hpp"
25 
26 #include "../block/block_load.hpp"
27 #include "../block/block_scan.hpp"
28 #include "../block/block_store.hpp"
29 
30 #include "../config.hpp"
31 
32 #include <algorithm>
33 
36 
37 BEGIN_ROCPRIM_NAMESPACE
38 
// Configuration of device-level reduce-by-key operation.
//
// Every template argument is re-exported unchanged as a static constexpr
// member so that code holding only the config type can read the tuning values.
//
// NOTE(review): this listing skips original lines 52-54 and 57. The body below
// reads LoadKeysMethod, LoadValuesMethod and ScanAlgorithm, none of which are
// declared in the visible parameter list, and the `struct` header itself is
// absent. Presumably the missing lines declare those three parameters and
// `struct reduce_by_key_config_v2` -- confirm against the original header.
50 template<unsigned int BlockSize,
51  unsigned int ItemsPerThread,
55  unsigned int TilesPerBlock = 1,
56  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
58 {
    // Number of threads in a block.
60  static constexpr unsigned int block_size = BlockSize;
61 
    // Number of tiles (BlockSize * ItemsPerThread items) to process per block.
63  static constexpr unsigned int tiles_per_block = TilesPerBlock;
64 
    // Number of items processed by each thread per tile.
66  static constexpr unsigned int items_per_thread = ItemsPerThread;
67 
    // A rocprim::block_load_method enum value indicating how the keys should
    // be loaded.
70  static constexpr block_load_method load_keys_method = LoadKeysMethod;
71 
    // A rocprim::block_load_method enum value indicating how the values should
    // be loaded.
74  static constexpr block_load_method load_values_method = LoadValuesMethod;
75 
    // A rocprim::block_scan_algorithm enum value indicating how the reduction
    // should be done.
78  static constexpr block_scan_algorithm scan_algorithm = ScanAlgorithm;
79 
    // Maximum possible number of values. Defaults to ROCPRIM_GRID_SIZE_LIMIT.
81  static constexpr unsigned int size_limit = SizeLimit;
82 };
83 
// Legacy configuration of device-level reduce-by-key operation.
//
// Marked [[deprecated]] in favour of reduce_by_key_config_v2. It derives from
// reduce_by_key_config_v2 instantiated with only two explicit arguments taken
// from ReduceConfig; all remaining template arguments fall back to their
// defaults. ScanConfig does not influence the base class and is only
// re-exported through the `scan` alias.
//
// NOTE(review): the base-class arguments read `ReduceConfig::BlockSize` and
// `ReduceConfig::ItemsPerThread`, whereas the config struct defined in this
// file exposes lowercase `block_size` / `items_per_thread`. Confirm that the
// types passed as ReduceConfig really provide the capitalised member names.
93 template<class ScanConfig, class ReduceConfig>
94 struct [[deprecated("use reduce_by_key_config_v2")]] reduce_by_key_config
95  : reduce_by_key_config_v2<ReduceConfig::BlockSize, ReduceConfig::ItemsPerThread>
96 {
    // The scan configuration type, re-exported as given.
98  using scan = ScanConfig;
    // The reduce configuration type, re-exported as given.
100  using reduce = ReduceConfig;
101 };
102 
103 namespace detail
104 {
105 
106 namespace reduce_by_key
107 {
108 
// Derives a configuration from the sizes of Key and Value; default_config
// refers to it as `fallback_config<Key, Value>::type`.
//
// NOTE(review): this listing skips original lines 110, 120 and 123-126, so the
// `struct` header and most of the `type` alias's argument list are not
// visible. Only the arithmetic that is shown is documented here.
109 template<typename Key, typename Value>
111 {
    // Size in bytes of the larger of Key and Value.
112  static constexpr unsigned int size_memory_per_item = std::max(sizeof(Key), sizeof(Value));
113 
    // Item size expressed in units of 2 * sizeof(int) bytes.
    // Presumably ceiling_div rounds up; its definition is not in this file.
114  static constexpr unsigned int item_scale
115  = static_cast<unsigned int>(ceiling_div(size_memory_per_item, 2 * sizeof(int)));
116 
    // 15 items per thread divided by the scale factor, clamped so that it
    // never drops below 1.
117  static constexpr unsigned int items_per_thread = std::max(1u, 15u / item_scale);
118 
    // Resulting configuration type. The visible arguments pass
    // items_per_thread * size_memory_per_item and ROCPRIM_WARP_SIZE_64 to a
    // helper whose `::value` is used, and end with a literal 2; the rest of
    // the argument list is missing from this listing.
119  using type
121  items_per_thread * size_memory_per_item,
122  ROCPRIM_WARP_SIZE_64>::value,
127  2>;
128 };
129 
// Default reduce-by-key configuration, chosen at compile time from the sizes
// of Key and Value.
//
// When the larger of sizeof(Key) and sizeof(Value) is at most 16 bytes, the
// base class is a fixed reduce_by_key_config_v2 with block size 256, 15 items
// per thread, block_load_transpose for both load methods and using_warp_scan.
// Its last explicit argument is 1 when sizeof(Value) < 16 and 2 otherwise
// (presumably TilesPerBlock -- the full parameter list is not visible in this
// listing; confirm). For larger types the base class is
// fallback_config<Key, Value>::type.
//
// TargetArch is not referenced anywhere in the visible definition.
130 template<unsigned int TargetArch, class Key, class Value>
131 struct default_config
132  : std::conditional_t<std::max(sizeof(Key), sizeof(Value)) <= 16,
133  rocprim::reduce_by_key_config_v2<256,
134  15,
135  block_load_method::block_load_transpose,
136  block_load_method::block_load_transpose,
137  block_scan_algorithm::using_warp_scan,
138  sizeof(Value) < 16 ? 1 : 2>,
139  typename reduce_by_key::fallback_config<Key, Value>::type>
140 {};
141 
142 } // namespace reduce_by_key
143 
144 } // end namespace detail
145 
146 END_ROCPRIM_NAMESPACE
147 
149 // end of group primitivesmodule_deviceconfigs
150 
151 #endif // ROCPRIM_DEVICE_DEVICE_REDUCE_BY_KEY_CONFIG_HPP_
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
static constexpr block_load_method load_values_method
A rocprim::block_load_method enum value indicating how the values should be loaded.
Definition: device_reduce_by_key_config.hpp:74
Definition: config_types.hpp:109
static constexpr unsigned int tiles_per_block
Number of tiles (BlockSize * ItemsPerThread items) to process per block.
Definition: device_reduce_by_key_config.hpp:63
static constexpr unsigned int block_size
Number of threads in a block.
Definition: device_reduce_by_key_config.hpp:60
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
A warp_scan based algorithm.
Definition: benchmark_block_reduce.cpp:63
Definition: device_reduce_by_key_config.hpp:110
static constexpr block_scan_algorithm scan_algorithm
A rocprim::block_scan_algorithm enum value indicating how the reduction should be done...
Definition: device_reduce_by_key_config.hpp:78
struct [[deprecated("use reduce_by_key_config_v2")]] reduce_by_key_config
Legacy configuration of device-level reduce-by-key operation.
Definition: device_reduce_by_key_config.hpp:94
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
block_load_method
block_load_method enumerates the methods available to load data from continuous memory into a blocked...
Definition: block_load.hpp:41
A striped arrangement of data from continuous memory is locally transposed into a blocked arrangement...
static constexpr unsigned int items_per_thread
Number of items processed by each thread per tile.
Definition: device_reduce_by_key_config.hpp:66
hipError_t reduce_by_key(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, const size_t size, UniqueOutputIterator unique_output, AggregatesOutputIterator aggregates_output, UniqueCountOutputIterator unique_count_output, BinaryFunction reduce_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduce-by-key primitive for device level.
Definition: device_reduce_by_key.hpp:478
Configuration of device-level reduce-by-key operation.
Definition: device_reduce_by_key_config.hpp:57
static constexpr block_load_method load_keys_method
A rocprim::block_load_method enum value indicating how the keys should be loaded. ...
Definition: device_reduce_by_key_config.hpp:70
static constexpr unsigned int size_limit
Maximum possible number of values. Defaults to ROCPRIM_GRID_SIZE_LIMIT.
Definition: device_reduce_by_key_config.hpp:81
block_scan_algorithm
Available algorithms for block_scan primitive.
Definition: block_scan.hpp:41