rocPRIM
config_types.hpp
1 // Copyright (c) 2018-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_CONFIG_TYPES_HPP_
22 #define ROCPRIM_DEVICE_CONFIG_TYPES_HPP_
23 
24 #include <algorithm>
25 #include <atomic>
26 #include <limits>
27 #include <type_traits>
28 
29 #include <cassert>
30 
31 #include "../config.hpp"
32 #include "../intrinsics/thread.hpp"
33 #include "../detail/various.hpp"
34 
37 
38 BEGIN_ROCPRIM_NAMESPACE
39 
/// \brief Special type used to show that the given device-level operation
/// will be executed with its optimal configuration.
struct default_config
{
#ifndef DOXYGEN_SHOULD_SKIP_THIS
    // default_config should be able to act as if any other config, members from those configs are provided here
    // NOTE(review): no members are visible under the headings below in this copy of the
    // file; confirm against the upstream header whether member aliases were lost.
    // merge_sort_config
    // radix_sort_config_v2
    // merge_sort_block_sort_config
#endif
};
60 
61 namespace detail
62 {
63 
64 // Non-templated kernel_config for dynamic dispatch
66 {
68  unsigned int block_size = 64;
70  unsigned int items_per_thread = 1;
72  unsigned int size_limit = ROCPRIM_GRID_SIZE_LIMIT;
73 };
74 
75 } // namespace detail
76 
81 template<unsigned int BlockSize,
82  unsigned int ItemsPerThread,
83  unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
85 {
86  constexpr kernel_config() : detail::kernel_config_params{BlockSize, ItemsPerThread, SizeLimit}
87  {}
89  static constexpr unsigned int block_size = BlockSize;
91  static constexpr unsigned int items_per_thread = ItemsPerThread;
93  static constexpr unsigned int size_limit = SizeLimit;
94 };
95 
96 namespace detail
97 {
98 
99 template<
100  unsigned int MaxBlockSize,
101  unsigned int SharedMemoryPerThread,
102  // Most kernels require block sizes not smaller than warp
103  unsigned int MinBlockSize,
104  // Can fit in shared memory?
105  // Although GPUs have 64KiB, 32KiB is used here as a "soft" limit,
106  // because some additional memory may be required in kernels
107  bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15))
108 >
110 {
111  // No, then try to decrease block size
112  static constexpr unsigned int value =
114  detail::next_power_of_two(MaxBlockSize) / 2,
115  SharedMemoryPerThread,
116  MinBlockSize
117  >::value;
118 };
119 
120 template<
121  unsigned int MaxBlockSize,
122  unsigned int SharedMemoryPerThread,
123  unsigned int MinBlockSize
124 >
125 struct limit_block_size<MaxBlockSize, SharedMemoryPerThread, MinBlockSize, true>
126 {
127  static_assert(MaxBlockSize >= MinBlockSize, "Data is too large, it cannot fit in shared memory");
128 
129  static constexpr unsigned int value = MaxBlockSize;
130 };
131 
// Pairs an architecture id (`arch`) with a type (`type`).
// NOTE(review): the declaration line of this struct is absent from this copy of
// the file; the name `select_arch_case` should be confirmed against upstream.
template<unsigned int Arch, class T>
struct select_arch_case
{
    static constexpr unsigned int arch = Arch;
    using type = T;
};
138 
139 template<unsigned int TargetArch, class Case, class... OtherCases>
141  : std::conditional<
142  Case::arch == TargetArch,
143  extract_type<typename Case::type>,
144  select_arch<TargetArch, OtherCases...>
145  >::type { };
146 
147 template<unsigned int TargetArch, class Universal>
148 struct select_arch<TargetArch, Universal> : extract_type<Universal> { };
149 
150 template<class Config, class Default>
151 using default_or_custom_config =
152  typename std::conditional<
153  std::is_same<Config, default_config>::value,
154  Default,
155  Config
156  >::type;
157 
#ifndef DOXYGEN_SHOULD_SKIP_THIS
// Architectures that can be identified, together with two sentinel values:
// `invalid` and `unknown`.
enum class target_arch : unsigned int
{
    // This must be zero, to initialize the device -> architecture cache
    invalid = 0,
    gfx803 = 803,
    gfx900 = 900,
    gfx906 = 906,
    gfx908 = 908,
    gfx90a = 910,
    gfx1030 = 1030,
    gfx1102 = 1102,
    // Returned when a name matches none of the architectures above. The rest of
    // this file refers to this enumerator, so it has to be declared here.
    unknown = std::numeric_limits<unsigned int>::max(),
};
#endif // DOXYGEN_SHOULD_SKIP_THIS
173 
/// \brief Checks if the first \p n characters of \p rhs are equal to \p lhs.
///
/// \param lhs null-terminated string that has to be matched in full.
/// \param rhs string of which only the first \p n characters are considered.
/// \param n number of characters of \p rhs to compare.
/// \return true only if \p lhs is exactly \p n characters long and each of
/// them equals the corresponding character of \p rhs.
constexpr bool prefix_equals(const char* lhs, const char* rhs, std::size_t n)
{
    std::size_t matched = 0;
    // Advance while inside the prefix, `lhs` has not ended, and the characters agree.
    while(matched < n && lhs[matched] != '\0' && lhs[matched] == rhs[matched])
    {
        ++matched;
    }

    // The whole prefix must have been consumed, and `lhs` must end right there.
    return matched == n && lhs[matched] == '\0';
}
198 
199 constexpr target_arch get_target_arch_from_name(const char* const arch_name, const std::size_t n)
200 {
201  constexpr const char* target_names[]
202  = {"gfx803", "gfx900", "gfx906", "gfx908", "gfx90a", "gfx1030", "gfx1102"};
203  constexpr target_arch target_architectures[] = {
204  target_arch::gfx803,
205  target_arch::gfx900,
206  target_arch::gfx906,
207  target_arch::gfx908,
208  target_arch::gfx90a,
209  target_arch::gfx1030,
210  target_arch::gfx1102,
211  };
212  static_assert(sizeof(target_names) / sizeof(target_names[0])
213  == sizeof(target_architectures) / sizeof(target_architectures[0]),
214  "target_names and target_architectures should have the same number of elements");
215  constexpr auto num_architectures = sizeof(target_names) / sizeof(target_names[0]);
216 
217  for(unsigned int i = 0; i < num_architectures; ++i)
218  {
219  if(prefix_equals(target_names[i], arch_name, n))
220  {
221  return target_architectures[i];
222  }
223  }
224  return target_arch::unknown;
225 }
226 
235 constexpr target_arch device_target_arch()
236 {
237 #if defined(__amdgcn_processor__)
238  // The terminating zero is not counted in the length of the string
239  return get_target_arch_from_name(__amdgcn_processor__,
240  sizeof(__amdgcn_processor__) - sizeof('\0'));
241 #else
242  return target_arch::unknown;
243 #endif
244 }
245 
246 template<class Config>
247 auto dispatch_target_arch(const target_arch target_arch)
248 {
249  switch(target_arch)
250  {
251  case target_arch::unknown:
252  return Config::template architecture_config<target_arch::unknown>::params;
253  case target_arch::gfx803:
254  return Config::template architecture_config<target_arch::gfx803>::params;
255  case target_arch::gfx900:
256  return Config::template architecture_config<target_arch::gfx900>::params;
257  case target_arch::gfx906:
258  return Config::template architecture_config<target_arch::gfx906>::params;
259  case target_arch::gfx908:
260  return Config::template architecture_config<target_arch::gfx908>::params;
261  case target_arch::gfx90a:
262  return Config::template architecture_config<target_arch::gfx90a>::params;
263  case target_arch::gfx1030:
264  return Config::template architecture_config<target_arch::gfx1030>::params;
265  case target_arch::gfx1102:
266  return Config::template architecture_config<target_arch::gfx1102>::params;
267  case target_arch::invalid:
268  assert(false && "Invalid target architecture selected at runtime.");
269  }
270  return Config::template architecture_config<target_arch::unknown>::params;
271 }
272 
273 template<typename Config>
274 constexpr auto device_params()
275 {
276  return Config::template architecture_config<device_target_arch()>::params;
277 }
278 
279 inline target_arch parse_gcn_arch(const char* arch_name)
280 {
281  static constexpr auto length = sizeof(hipDeviceProp_t::gcnArchName);
282 
283  const char* arch_end = std::find_if(arch_name,
284  arch_name + length,
285  [](const char& val) { return val == ':' || val == '\0'; });
286 
287  return get_target_arch_from_name(arch_name, arch_end - arch_name);
288 }
289 
290 inline hipError_t get_device_arch(int device_id, target_arch& arch)
291 {
292  static constexpr unsigned int device_arch_cache_size = 512;
293  static std::atomic<target_arch> arch_cache[device_arch_cache_size] = {};
294 
295  assert(device_id >= 0);
296  if(static_cast<unsigned int>(device_id) >= device_arch_cache_size)
297  {
298  // Device architecture cache is too small.
299  return hipErrorUnknown;
300  }
301 
302  arch = arch_cache[device_id].load(std::memory_order_relaxed);
303  if(arch != target_arch::invalid)
304  {
305  return hipSuccess;
306  }
307 
308  hipDeviceProp_t device_props;
309  const hipError_t result = hipGetDeviceProperties(&device_props, device_id);
310  if(result != hipSuccess)
311  {
312  return result;
313  }
314 
315  arch = parse_gcn_arch(device_props.gcnArchName);
316  arch_cache[device_id].exchange(arch, std::memory_order_relaxed);
317 
318  return hipSuccess;
319 }
320 
321 #ifndef _WIN32
322 inline hipError_t get_device_from_stream(const hipStream_t stream, int& device_id)
323 {
324  static constexpr hipStream_t default_stream = 0;
325  if(stream == default_stream || stream == hipStreamPerThread)
326  {
327  const hipError_t result = hipGetDevice(&device_id);
328  if(result != hipSuccess)
329  {
330  return result;
331  }
332  return hipSuccess;
333  }
334 
335 #ifdef __HIP_PLATFORM_AMD__
336  device_id = hipGetStreamDeviceId(stream);
337  if(device_id < 0)
338  {
339  return hipErrorInvalidHandle;
340  }
341 #else
342  #error("Getting the current device from a stream is not implemented for this platform");
343 #endif
344  return hipSuccess;
345 }
346 #endif
347 
348 inline hipError_t host_target_arch(const hipStream_t stream, target_arch& arch)
349 {
350 #ifdef _WIN32
351  (void)stream;
352  arch = target_arch::unknown;
353  return hipSuccess;
354 #else
355  int device_id;
356  const hipError_t result = get_device_from_stream(stream, device_id);
357  if(result != hipSuccess)
358  {
359  return result;
360  }
361 
362  return get_device_arch(device_id, arch);
363 #endif
364 }
365 
366 } // end namespace detail
367 
368 END_ROCPRIM_NAMESPACE
369 
371 // end of group primitivesmodule_deviceconfigs
372 
373 #endif // ROCPRIM_DEVICE_CONFIG_TYPES_HPP_
// ---------------------------------------------------------------------------
// Cross-reference notes carried over from the generated documentation page.
// They are not part of the header itself.
//
// ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
//     Returns the maximum of its arguments.
//     Definition: functional.hpp:55
// constexpr bool prefix_equals(const char *lhs, const char *rhs, std::size_t n)
//     Checks if the first n characters of rhs are equal to lhs
//     Definition: config_types.hpp:182
// Definition: config_types.hpp:65
// Definition: config_types.hpp:109
// Special type used to show that the given device-level operation will be
// executed with optimal configuration... (description truncated in the original)
//     Definition: config_types.hpp:45
// Deprecated: Configuration of device-level scan primitives.
//     Definition: block_histogram.hpp:62
// Definition: config_types.hpp:140
// Configuration of particular kernels launched by device-level operation.
//     Definition: config_types.hpp:84
// Definition: config_types.hpp:133
// constexpr target_arch device_target_arch()
//     Get the current architecture in device compilation.
//     Definition: config_types.hpp:235
// ---------------------------------------------------------------------------