21 #ifndef ROCPRIM_DEVICE_CONFIG_TYPES_HPP_ 22 #define ROCPRIM_DEVICE_CONFIG_TYPES_HPP_ 27 #include <type_traits> 31 #include "../config.hpp" 32 #include "../intrinsics/thread.hpp" 33 #include "../detail/various.hpp" 38 BEGIN_ROCPRIM_NAMESPACE
47 #ifndef DOXYGEN_SHOULD_SKIP_THIS 68 unsigned int block_size = 64;
70 unsigned int items_per_thread = 1;
72 unsigned int size_limit = ROCPRIM_GRID_SIZE_LIMIT;
81 template<
unsigned int BlockSize,
82 unsigned int ItemsPerThread,
83 unsigned int SizeLimit = ROCPRIM_GRID_SIZE_LIMIT>
89 static constexpr
unsigned int block_size = BlockSize;
91 static constexpr
unsigned int items_per_thread = ItemsPerThread;
93 static constexpr
unsigned int size_limit = SizeLimit;
100 unsigned int MaxBlockSize,
101 unsigned int SharedMemoryPerThread,
103 unsigned int MinBlockSize,
107 bool = (MaxBlockSize * SharedMemoryPerThread <= (1u << 15))
112 static constexpr
unsigned int value =
114 detail::next_power_of_two(MaxBlockSize) / 2,
115 SharedMemoryPerThread,
121 unsigned int MaxBlockSize,
122 unsigned int SharedMemoryPerThread,
123 unsigned int MinBlockSize
127 static_assert(MaxBlockSize >= MinBlockSize,
"Data is too large, it cannot fit in shared memory");
129 static constexpr
unsigned int value = MaxBlockSize;
132 template<
unsigned int Arch,
class T>
135 static constexpr
unsigned int arch = Arch;
139 template<
unsigned int TargetArch,
class Case,
class... OtherCases>
142 Case::arch == TargetArch,
143 extract_type<typename Case::type>,
144 select_arch<TargetArch, OtherCases...>
147 template<
unsigned int TargetArch,
class Universal>
148 struct select_arch<TargetArch, Universal> : extract_type<Universal> { };
150 template<
class Config,
class Default>
151 using default_or_custom_config =
152 typename std::conditional<
153 std::is_same<Config, default_config>::value,
158 #ifndef DOXYGEN_SHOULD_SKIP_THIS 159 enum class target_arch : unsigned int
172 #endif // DOXYGEN_SHOULD_SKIP_THIS 182 constexpr
bool prefix_equals(
const char* lhs,
const char* rhs, std::size_t n)
187 if(*lhs != *rhs || *lhs ==
'\0')
196 return i == n && *lhs ==
'\0';
199 constexpr target_arch get_target_arch_from_name(
const char*
const arch_name,
const std::size_t n)
201 constexpr
const char* target_names[]
202 = {
"gfx803",
"gfx900",
"gfx906",
"gfx908",
"gfx90a",
"gfx1030",
"gfx1102"};
203 constexpr target_arch target_architectures[] = {
209 target_arch::gfx1030,
210 target_arch::gfx1102,
212 static_assert(
sizeof(target_names) /
sizeof(target_names[0])
213 ==
sizeof(target_architectures) /
sizeof(target_architectures[0]),
214 "target_names and target_architectures should have the same number of elements");
215 constexpr
auto num_architectures =
sizeof(target_names) /
sizeof(target_names[0]);
217 for(
unsigned int i = 0; i < num_architectures; ++i)
221 return target_architectures[i];
224 return target_arch::unknown;
237 #if defined(__amdgcn_processor__) 239 return get_target_arch_from_name(__amdgcn_processor__,
240 sizeof(__amdgcn_processor__) -
sizeof(
'\0'));
242 return target_arch::unknown;
246 template<
class Config>
247 auto dispatch_target_arch(
const target_arch target_arch)
251 case target_arch::unknown:
252 return Config::template architecture_config<target_arch::unknown>::params;
253 case target_arch::gfx803:
254 return Config::template architecture_config<target_arch::gfx803>::params;
255 case target_arch::gfx900:
256 return Config::template architecture_config<target_arch::gfx900>::params;
257 case target_arch::gfx906:
258 return Config::template architecture_config<target_arch::gfx906>::params;
259 case target_arch::gfx908:
260 return Config::template architecture_config<target_arch::gfx908>::params;
261 case target_arch::gfx90a:
262 return Config::template architecture_config<target_arch::gfx90a>::params;
263 case target_arch::gfx1030:
264 return Config::template architecture_config<target_arch::gfx1030>::params;
265 case target_arch::gfx1102:
266 return Config::template architecture_config<target_arch::gfx1102>::params;
267 case target_arch::invalid:
268 assert(
false &&
"Invalid target architecture selected at runtime.");
270 return Config::template architecture_config<target_arch::unknown>::params;
273 template<
typename Config>
274 constexpr
auto device_params()
276 return Config::template architecture_config<device_target_arch()>::params;
279 inline target_arch parse_gcn_arch(
const char* arch_name)
281 static constexpr
auto length =
sizeof(hipDeviceProp_t::gcnArchName);
283 const char* arch_end = std::find_if(arch_name,
285 [](
const char& val) {
return val ==
':' || val ==
'\0'; });
287 return get_target_arch_from_name(arch_name, arch_end - arch_name);
290 inline hipError_t get_device_arch(
int device_id, target_arch& arch)
292 static constexpr
unsigned int device_arch_cache_size = 512;
293 static std::atomic<target_arch> arch_cache[device_arch_cache_size] = {};
295 assert(device_id >= 0);
296 if(static_cast<unsigned int>(device_id) >= device_arch_cache_size)
299 return hipErrorUnknown;
302 arch = arch_cache[device_id].load(std::memory_order_relaxed);
303 if(arch != target_arch::invalid)
308 hipDeviceProp_t device_props;
309 const hipError_t result = hipGetDeviceProperties(&device_props, device_id);
310 if(result != hipSuccess)
315 arch = parse_gcn_arch(device_props.gcnArchName);
316 arch_cache[device_id].exchange(arch, std::memory_order_relaxed);
322 inline hipError_t get_device_from_stream(
const hipStream_t stream,
int& device_id)
324 static constexpr hipStream_t default_stream = 0;
325 if(stream == default_stream || stream == hipStreamPerThread)
327 const hipError_t result = hipGetDevice(&device_id);
328 if(result != hipSuccess)
335 #ifdef __HIP_PLATFORM_AMD__ 336 device_id = hipGetStreamDeviceId(stream);
339 return hipErrorInvalidHandle;
342 #error("Getting the current device from a stream is not implemented for this platform"); 348 inline hipError_t host_target_arch(
const hipStream_t stream, target_arch& arch)
352 arch = target_arch::unknown;
356 const hipError_t result = get_device_from_stream(stream, device_id);
357 if(result != hipSuccess)
362 return get_device_arch(device_id, arch);
368 END_ROCPRIM_NAMESPACE
373 #endif // ROCPRIM_DEVICE_CONFIG_TYPES_HPP_ ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
constexpr bool prefix_equals(const char *lhs, const char *rhs, std::size_t n)
Checks if the first n characters of rhs are equal to lhs.
Definition: config_types.hpp:182
Definition: config_types.hpp:65
Definition: config_types.hpp:109
Special type used to show that the given device-level operation will be executed with optimal configuration dependent on types of the function's parameters.
Definition: config_types.hpp:45
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: config_types.hpp:140
Configuration of particular kernels launched by device-level operation.
Definition: config_types.hpp:84
Definition: config_types.hpp:133
constexpr target_arch device_target_arch()
Get the current architecture in device compilation.
Definition: config_types.hpp:235