10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ 11 #define CUDA_API_WRAPPERS_KERNEL_HPP_ 20 #if CUDA_VERSION < 11000 21 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 0 22 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual 24 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 1 25 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE 63 kernel::handle_t handle,
64 bool hold_primary_context_refcount_unit =
false);
68 inline ::std::string identify(
const kernel_t& kernel);
// Returns a human-readable name for a CUDA kernel (CUfunction) attribute.
//
// @param attribute_index the numeric value of a CUfunction_attribute
//        enumerator; only the first 10 attributes (through the preferred
//        shared-memory carve-out) have entries in the table below.
// @return a pointer to a statically-allocated name string; never nullptr —
//         out-of-range indices yield a placeholder instead of reading past
//         the end of the table (newer CUDA versions define additional
//         attribute enumerators beyond those named here).
static const char* attribute_name(int attribute_index)
{
	// Ordered to match the CUfunction_attribute enumerator values,
	// CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK (0) through
	// CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT (9).
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	// BUG FIX: the table was previously indexed unconditionally, so an
	// attribute enumerator not covered above (e.g. the cluster-related
	// attributes added in newer CUDA releases) caused an out-of-bounds read.
	constexpr int num_names = static_cast<int>(sizeof(names) / sizeof(names[0]));
	if (attribute_index < 0 or attribute_index >= num_names) {
		return "(unrecognized kernel attribute)";
	}
	return names[attribute_index];
}
91 auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
92 throw_if_error_lazy(result, ::std::string(
"Failed obtaining attribute ") + attribute_name(attribute));
93 return attribute_value;
98 #if CUDA_VERSION >= 9000 99 auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
101 "Setting CUDA device function attribute " +
102 ::std::string(kernel::detail_::attribute_name(attribute)) +
" of function at " 103 + cuda::kernel::detail_::identify(handle) +
" to value " + ::std::to_string(value));
109 #if CUDA_VERSION >= 12030 110 inline const char * get_name_in_current_context(handle_t handle)
113 auto status = cuFuncGetName(&result, handle);
// Retrieve the (mangled) name of a kernel given its raw handle, from within
// an arbitrary context: temporarily makes `context_handle` current (via the
// scoping macro), then queries the driver through the current-context
// variant above. Requires CUDA 12.3+ (cuFuncGetName).
// NOTE(review): the extraction dropped this function's brace lines; the two
// statements below are its entire body.
118 inline const char * get_name(
context::handle_t context_handle, handle_t kernel_handle)
120 CAW_SET_SCOPE_CONTEXT(context_handle);
121 return get_name_in_current_context(kernel_handle);
// get_module_in_current_context: asks the driver for the module containing
// the kernel; the declaration of `result` is on a line missing from this
// extract. cuFuncGetModule is only available from CUDA 12.3 on.
127 auto status = cuFuncGetModule(&result, handle);
// get_module: same pattern as get_name above — switch to the kernel's
// context, then delegate to the current-context variant.
134 CAW_SET_SCOPE_CONTEXT(context_handle);
135 return get_module_in_current_context(kernel_handle);
138 #endif // CUDA_VERSION >= 12030
const noexcept {
return handle_; }
184 if (handle_ ==
nullptr) {
185 throw runtime_error(status::named_t::invalid_resource_handle,
186 "CUDA driver handle unavailable for kernel");
// The kernel's mangled (linker-level) name, as recorded by the CUDA driver.
// Only available with CUDA 12.3+, since it relies on cuFuncGetName via the
// detail_::get_name helper (which switches to this kernel's context first).
193 #if CUDA_VERSION >= 12030 194 const char *mangled_name()
const {
return cuda::kernel::detail_::get_name(context_handle_, handle_); }
205 ::std::swap(device_id_, other.device_id_);
206 ::std::swap(context_handle_, other.context_handle_);
207 ::std::swap(handle_, other.handle_);
208 ::std::swap(holds_pc_refcount_unit, holds_pc_refcount_unit);
215 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
218 return kernel::get_attribute(*
this, attribute);
222 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
225 auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
230 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
232 auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
243 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
246 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
249 #if CUDA_VERSION >= 10000 276 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
281 bool disable_caching_override =
false)
const;
283 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
287 bool disable_caching_override =
false)
const;
289 #endif // CUDA_VERSION >= 10000 309 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
313 bool disable_caching_override =
false)
const;
319 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
336 if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
337 throw ::std::invalid_argument(
"Requested amount of maximum shared memory exceeds the " 338 "representation range for kernel attribute values");
341 set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
346 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
366 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
369 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
370 auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
372 "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " 373 "CUDA device function");
376 #if CUDA_VERSION < 12030 382 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
386 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
387 auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
390 #endif // CUDA_VERSION < 12030 397 kernel::handle_t handle,
398 bool hold_primary_context_refcount_unit)
400 device_id_(device_id),
401 context_handle_(context_handle),
403 holds_pc_refcount_unit(hold_primary_context_refcount_unit)
410 kernel_t(other.device_id_, other.context_handle_, other.handle_,
false) { }
413 kernel_t(other.device_id_, other.context_handle_, other.handle_,
false)
415 ::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
419 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
420 ~
kernel_t() DESTRUCTOR_EXCEPTION_SPEC
423 if (holds_pc_refcount_unit) {
424 device::primary_context::detail_::decrease_refcount_in_dtor(device_id_);
431 mutable kernel::handle_t handle_;
432 bool holds_pc_refcount_unit;
440 kernel::handle_t handle,
441 bool hold_primary_context_refcount_unit)
443 return kernel_t{device_id, context_handle, handle, hold_primary_context_refcount_unit };
449 return detail_::get_attribute_in_current_context(kernel.
handle(), attribute);
455 return detail_::set_attribute_in_current_context(kernel.
handle(), attribute, value);
458 namespace occupancy {
466 bool disable_caching_override)
470 auto flags =
static_cast<unsigned>(disable_caching_override) ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
471 cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
472 &result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
474 "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
478 #if CUDA_VERSION >= 10000 482 CUfunction kernel_handle,
484 CUoccupancyB2DSize determine_shared_mem_by_block_size,
487 bool disable_caching_override)
489 int min_grid_size_in_blocks { 0 };
490 int block_size { 0 };
494 auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
495 &min_grid_size_in_blocks, &block_size,
497 determine_shared_mem_by_block_size,
498 fixed_shared_mem_size,
499 static_cast<int>(block_size_limit),
500 disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
504 "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
505 +
" with maximum occupancy given dynamic shared memory and block size data");
506 return {
static_cast<grid::dimension_t>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
508 #endif // CUDA_VERSION >= 10000 512 #if CUDA_VERSION >= 11000 522 auto status = cuOccupancyAvailableDynamicSMemPerBlock(
523 &result, kernel.
handle(),
static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
525 "the number of blocks on a multiprocessor and their size");
528 #endif // CUDA_VERSION >= 11000 537 bool disable_caching_override =
false);
// Produce a human-readable description of a kernel for use in error
// messages: combines the kernel's own identification (by raw handle) with
// the identification of the context in which it is defined.
543 inline ::std::string identify(
const kernel_t& kernel)
545 return kernel::detail_::identify(kernel.
handle()) +
" in " + context::detail_::identify(kernel.
context());
552 #if CUDA_VERSION >= 10000 556 bool disable_caching_override)
const 559 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
560 handle(), device_id(), no_shared_memory_size_determiner,
561 dynamic_shared_memory_size, block_size_limit, disable_caching_override);
567 bool disable_caching_override)
const 570 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
571 handle(), device_id(), shared_memory_size_determiner,
572 no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
574 #endif // CUDA_VERSION >= 10000 579 bool disable_caching_override)
const 581 return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
582 handle(), block_size_in_threads,
583 dynamic_shared_memory_per_block, disable_caching_override);
596 return not (lhs == rhs);
601 #endif // CUDA_API_WRAPPERS_KERNEL_HPP_ context::handle_t context_handle() const noexcept
Get the raw handle of the context in which this kernel is defined.
Definition: kernel.hpp:171
int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:990
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t ptx_version() const
Definition: kernel.hpp:223
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
Wrapper class for a CUDA context.
Definition: context.hpp:249
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t binary_compilation_target_architecture() const
Definition: kernel.hpp:231
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
Wrapper class for a CUDA code module.
Definition: module.hpp:126
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:832
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:78
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
Sets a device function's preference of shared memory bank size.
Definition: kernel.hpp:383
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:806
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:768
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
Change the hardware resource carve-out between L1 cache and shared memory for launches of the kernel ...
Definition: kernel.hpp:333
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
CUmodule handle_t
Raw CUDA driver handle of a module of compiled code; see module_t.
Definition: module.hpp:32
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_cache_preference(multiprocessor_cache_preference_t preference) const
Indicate the desired carve-out between shared memory and L1 cache when launching this kernel - with c...
Definition: kernel.hpp:367
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: kernel.hpp:576
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:985
Classes representing specific and overall properties of CUDA devices.