10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ 11 #define CUDA_API_WRAPPERS_KERNEL_HPP_ 18 #include <cuda_runtime.h> 21 #if CUDA_VERSION < 11000 22 #define CAN_GET_APRIORI_KERNEL_HANDLE 0 23 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual 25 #define CAN_GET_APRIORI_KERNEL_HANDLE 1 26 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE 38 using shared_memory_size_determiner_t = size_t (*)(
int block_size);
53 context::handle_t context_id,
58 inline ::std::string identify(
const kernel_t& kernel);
/// Maps a kernel attribute index (the numeric value of a CU_FUNC_ATTRIBUTE_*
/// enumerator) to a human-readable description, for use in error messages.
///
/// @param attribute_index numeric value of a kernel attribute enumerator;
///        the table covers the first 10 attributes (indices 0 through 9).
/// @return a pointer to a statically-allocated description string; a fallback
///         string for indices outside the table (previously, an out-of-range
///         index meant undefined behavior via out-of-bounds array access).
static const char* attribute_name(int attribute_index)
{
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	constexpr int num_names = static_cast<int>(sizeof(names) / sizeof(names[0]));
	// Newer CUDA versions add further CU_FUNC_ATTRIBUTE_* enumerators beyond
	// this table; guard instead of indexing unconditionally.
	if (attribute_index < 0 || attribute_index >= num_names) {
		return "(unknown kernel attribute)";
	}
	return names[attribute_index];
}
80 inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
83 auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
85 ::std::string(
"Failed obtaining attribute ") +
87 ::std::to_string(
static_cast<::std::underlying_type<kernel::attribute_t>::type
>(attribute))
89 attribute_name(attribute)
92 return attribute_value;
// ---------------------------------------------------------------------------
// NOTE(review): the span below is the interior of the kernel_t proxy class as
// mangled by documentation extraction: each fragment carries its original
// source line number, and many in-between lines were dropped. The class header
// ("class kernel_t { ...") is not visible here. The comments below annotate
// the surviving fragments; anything that cannot be grounded in the visible
// text is marked as an assumption to confirm.
// ---------------------------------------------------------------------------
// Rich-proxy getters, declared here and defined elsewhere (they need the full
// context_t / device_t class definitions):
119 context_t context() const noexcept;
120 device_t device() const noexcept;
// Trivial inline getters for the raw device-ID and context-handle members:
122 device::
id_t device_id() const noexcept {
return device_id_; }
123 context::handle_t context_handle() const noexcept {
return context_handle_; }
// When the driver handle cannot be obtained a-priori (CUDA < 11.0; see the
// CAN_GET_APRIORI_KERNEL_HANDLE macro near the top of this header), handle()
// must check whether the handle has been resolved, and throws if it is null:
124 #if ! CAN_GET_APRIORI_KERNEL_HANDLE 125 kernel::handle_t handle()
const 128 if (handle_ ==
nullptr) {
129 throw runtime_error(status::named_t::invalid_resource_handle,
130 "CUDA driver handle unavailable for kernel");
// ...otherwise (CUDA >= 11.0) the handle is always available, and the
// accessor is a trivial noexcept getter:
136 kernel::handle_t handle() const noexcept {
return handle_; }
// Attribute getter: temporarily makes this kernel's context current, then
// reads a CU_FUNC_ATTRIBUTE_* value via the detail_ helper defined above.
// Virtual only when the handle may need run-time resolution:
141 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
144 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
145 return kernel::detail_::get_attribute_in_current_context(handle(), attribute);
// PTX-virtual-architecture query (method signature dropped by the
// extraction); decodes the combined-number attribute value:
148 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
151 auto raw_attribute =
get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
152 return device::compute_capability_t::from_combined_number(raw_attribute);
// Binary (SASS) architecture query, decoded the same way:
155 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
157 auto raw_attribute =
get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
158 return device::compute_capability_t::from_combined_number(raw_attribute);
// Convenience wrapper over get_attribute() for the per-block thread limit:
168 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
169 grid::block_dimension_t maximum_threads_per_block()
const 171 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
// Occupancy queries (CUDA >= 10.0 only). Two overloads: one taking a fixed
// dynamic-shared-memory size, one taking a size-determiner function. Both
// are defined out-of-class further below in this header:
174 #if CUDA_VERSION >= 10000 201 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
203 grid::composite_dimensions_t min_grid_params_for_max_occupancy(
204 memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
205 grid::block_dimension_t block_size_limit = 0,
206 bool disable_caching_override =
false)
const;
208 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
209 grid::composite_dimensions_t min_grid_params_for_max_occupancy(
210 kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
211 grid::block_dimension_t block_size_limit = 0,
212 bool disable_caching_override =
false)
const;
// Per-multiprocessor active-block-count query; defined out-of-class below:
214 #endif // CUDA_VERSION >= 10000 234 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
235 grid::dimension_t max_active_blocks_per_multiprocessor(
236 grid::block_dimension_t block_size_in_threads,
237 memory::shared::size_t dynamic_shared_memory_per_block,
238 bool disable_caching_override =
false)
const;
// Setter for the opt-in maximum of dynamically-allocated shared memory. The
// surviving fragment shows a range check (attribute values are ints) and the
// set_attribute call; the method signature itself was dropped by the
// extraction -- TODO confirm against the original header:
244 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
262 throw ::std::invalid_argument(
"Requested amount of maximum shared memory exceeds the " 263 "representation range for kernel attribute values");
266 set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
// Matching getter for the same attribute:
269 memory::shared::size_t get_maximum_dynamic_shared_memory_per_block()
const 271 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
// Cache-preference setter: overrides the current context, then calls the
// driver's cuFuncSetCacheConfig (the throw_if_error call's opening was
// dropped; only its message string survives):
291 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
294 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
295 auto result = cuFuncSetCacheConfig(handle(), (CUfunc_cache) preference);
297 "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " 298 "CUDA device function");
// Shared-memory bank-size setter, via cuFuncSetSharedMemConfig:
306 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
310 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
311 auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
312 throw_if_error(result,
"Failed setting the shared memory bank size");
// Constructor taking the raw (device, context, handle) triplet; presumably
// non-public, with the kernel::wrap() friend below as the intended caller --
// the access specifier was dropped by the extraction, confirm:
316 kernel_t(
device::id_t device_id, context::handle_t context_handle, kernel::handle_t handle)
317 : device_id_(device_id), context_handle_(context_handle), handle_(handle) { }
320 friend kernel_t kernel::wrap(
device::id_t, context::handle_t, kernel::handle_t);
// Copy and move constructors are the compiler defaults:
322 kernel_t(
const kernel_t& other) =
default;
323 kernel_t(kernel_t&& other) =
default;
// A virtual destructor is needed only when this class serves as a
// polymorphic base (i.e. when handles cannot be obtained a-priori):
326 #if ! CAN_GET_APRIORI_KERNEL_HANDLE 327 virtual ~kernel_t() =
default;
// Data members (the device-ID member's line was dropped by the extraction;
// handle_ is mutable to allow lazy resolution through const accessors --
// TODO confirm):
332 context::handle_t context_handle_;
333 mutable kernel::handle_t handle_;
338 inline kernel_t
wrap(
340 context::handle_t context_id,
343 return kernel_t{ device_id, context_id, f };
346 namespace occupancy {
350 inline grid::dimension_t max_active_blocks_per_multiprocessor(
352 grid::block_dimension_t block_size_in_threads,
353 memory::shared::size_t dynamic_shared_memory_per_block,
354 bool disable_caching_override)
359 auto flags = (unsigned) disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
360 status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
361 &result, handle, (
int) block_size_in_threads, dynamic_shared_memory_per_block, flags);
363 "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dyanmic memory per block");
367 #if CUDA_VERSION >= 10000 370 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
371 CUfunction kernel_handle,
373 CUoccupancyB2DSize determine_shared_mem_by_block_size,
376 bool disable_caching_override)
378 int min_grid_size_in_blocks { 0 };
379 int block_size { 0 };
383 auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
384 &min_grid_size_in_blocks, &block_size,
386 determine_shared_mem_by_block_size,
387 fixed_shared_mem_size,
388 static_cast<int>(block_size_limit),
389 disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
393 "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
394 +
" with maximum occupancy given dynamic shared memory and block size data");
395 return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
397 #endif // CUDA_VERSION >= 10000 405 #if CUDA_VERSION < 11000 406 inline memory::shared::size_t max_dynamic_shared_memory_per_block(
409 grid::block_dimension_t)
412 "cuOccupancyAvailableDynamicSMemPerBlock() requires CUDA 11.0 or later");
415 inline memory::shared::size_t max_dynamic_shared_memory_per_block(
416 const kernel_t &kernel,
417 grid::dimension_t blocks_on_multiprocessor,
418 grid::block_dimension_t block_size_in_threads)
421 auto status = cuOccupancyAvailableDynamicSMemPerBlock(
422 &result, kernel.handle(), (int) blocks_on_multiprocessor, (
int) block_size_in_threads);
424 "Determining the available dynamic memory per block, given the number of blocks on a multiprocessor and their size");
425 return (memory::shared::size_t) result;
427 #endif // CUDA_VERSION < 11000 432 inline grid::dimension_t max_active_blocks_per_multiprocessor(
433 const kernel_t &kernel,
434 grid::block_dimension_t block_size_in_threads,
435 memory::shared::size_t dynamic_shared_memory_per_block,
436 bool disable_caching_override =
false);
442 inline ::std::string identify(
const kernel_t& kernel)
444 return kernel::detail_::identify(kernel.handle()) +
" in " + context::detail_::identify(kernel.context());
450 #if CUDA_VERSION >= 10000 451 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
452 memory::shared::size_t dynamic_shared_memory_size,
453 grid::block_dimension_t block_size_limit,
454 bool disable_caching_override)
const 456 kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner {
nullptr };
457 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
458 handle(), device_id(), no_shared_memory_size_determiner,
459 dynamic_shared_memory_size, block_size_limit, disable_caching_override);
462 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
463 kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
465 bool disable_caching_override)
const 467 size_t no_fixed_dynamic_shared_memory_size { 0 };
468 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
469 handle(), device_id(), shared_memory_size_determiner,
470 no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
472 #endif // CUDA_VERSION >= 10000 474 inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
475 grid::block_dimension_t block_size_in_threads,
476 memory::shared::size_t dynamic_shared_memory_per_block,
477 bool disable_caching_override)
const 479 return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
480 handle(), block_size_in_threads,
481 dynamic_shared_memory_per_block, disable_caching_override);
486 #endif // CUDA_API_WRAPPERS_KERNEL_HPP_ int attribute_value_t
All CUDA device attributes (cuda::device::attribute_t) have a value of this type. ...
Definition: types.hpp:686
array_t< T, NumDimensions > wrap(device::id_t device_id, context::handle_t context_handle, handle_t handle, dimensions_t< NumDimensions > dimensions) noexcept
Wrap an existing CUDA array in an array_t instance.
Definition: array.hpp:196
attribute_value_t get_attribute(attribute_t attribute, device_t first, device_t second)
Get one of the numeric attributes for a(n ordered) pair of devices, relating to their interaction...
Definition: device.hpp:97
All definitions and functionality wrapping the CUDA Runtime API.
Definition: array.hpp:22
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:297
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:676
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:74
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Does nothing if the status indicates success; otherwise, throws a cuda::runtime_error carrying the given message.
Definition: error.hpp:313
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:656
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:600
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:269
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:630
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
Fundamental CUDA-related type definitions.
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:116