10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ 11 #define CUDA_API_WRAPPERS_KERNEL_HPP_ 20 #if CUDA_VERSION < 11000 21 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 0 22 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual 24 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 1 25 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE 63 kernel::handle_t handle,
64 bool hold_primary_context_refcount_unit =
false);
68 inline ::std::string identify(
const kernel_t& kernel);
/// Obtain a human-readable name for a CUDA kernel (device function)
/// attribute, given its numeric value within the `CUfunction_attribute` enum.
///
/// @param attribute_index numeric value of a `CUfunction_attribute`
///     enumerator; the driver defines these as consecutive values from 0
/// @return pointer to a statically-allocated name string; for indices outside
///     the known range (e.g. attributes introduced by newer CUDA versions),
///     a fixed placeholder string rather than reading out of bounds
static const char* attribute_name(int attribute_index)
{
	// Order must match the numeric values of the CUfunction_attribute enum
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	constexpr const int num_known_attributes =
		static_cast<int>(sizeof(names) / sizeof(names[0]));
	// Guard against attribute values we have no name for, instead of
	// invoking undefined behavior by indexing past the end of the array
	if (attribute_index < 0 or attribute_index >= num_known_attributes) {
		return "(unknown attribute)";
	}
	return names[attribute_index];
}
91 auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
92 throw_if_error_lazy(result, ::std::string(
"Failed obtaining attribute ") + attribute_name(attribute));
93 return attribute_value;
98 #if CUDA_VERSION >= 9000 99 auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
101 "Setting CUDA device function attribute " +
102 ::std::string(kernel::detail_::attribute_name(attribute)) +
" of function at " 103 + cuda::kernel::detail_::identify(handle) +
" to value " + ::std::to_string(value));
109 #if CUDA_VERSION >= 12030 110 inline const char * get_name_in_current_context(handle_t handle)
113 auto status = cuFuncGetName(&result, handle);
118 inline const char * get_name(
context::handle_t context_handle, handle_t kernel_handle)
120 CAW_SET_SCOPE_CONTEXT(context_handle);
121 return get_name_in_current_context(kernel_handle);
127 auto status = cuFuncGetModule(&result, handle);
134 CAW_SET_SCOPE_CONTEXT(context_handle);
135 return get_module_in_current_context(kernel_handle);
138 #endif // CUDA_VERSION >= 12300 178 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE 179 kernel::handle_t handle()
const noexcept {
return handle_; }
184 if (handle_ ==
nullptr) {
185 throw runtime_error(status::named_t::invalid_resource_handle,
186 "CUDA driver handle unavailable for kernel");
193 #if CUDA_VERSION >= 12030 194 const char *mangled_name()
const {
return cuda::kernel::detail_::get_name(context_handle_, handle_); }
205 ::std::swap(device_id_, other.device_id_);
206 ::std::swap(context_handle_, other.context_handle_);
207 ::std::swap(handle_, other.handle_);
208 ::std::swap(holds_pc_refcount_unit, holds_pc_refcount_unit);
215 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
218 return kernel::get_attribute(*
this, attribute);
222 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
225 auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
230 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
232 auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
243 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
246 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
249 #if CUDA_VERSION >= 10000 276 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
281 bool disable_caching_override =
false)
const;
283 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
287 bool disable_caching_override =
false)
const;
289 #endif // CUDA_VERSION >= 10000 309 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
313 bool disable_caching_override =
false)
const;
319 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
336 if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
337 throw ::std::invalid_argument(
"Requested amount of maximum shared memory exceeds the " 338 "representation range for kernel attribute values");
341 set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
346 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
366 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
369 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
370 auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
372 "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " 373 "CUDA device function");
376 #if CUDA_VERSION < 12030 382 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
386 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
387 auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
390 #endif // CUDA_VERSION < 12030 397 kernel::handle_t handle,
398 bool hold_primary_context_refcount_unit)
400 device_id_(device_id),
401 context_handle_(context_handle),
403 holds_pc_refcount_unit(hold_primary_context_refcount_unit)
410 kernel_t(other.device_id_, other.context_handle_, other.handle_,
false) { }
413 kernel_t(other.device_id_, other.context_handle_, other.handle_,
false)
415 ::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
419 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
423 if (holds_pc_refcount_unit) {
425 device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
429 device::primary_context::detail_::decrease_refcount(device_id_);
437 mutable kernel::handle_t handle_;
438 bool holds_pc_refcount_unit;
446 kernel::handle_t handle,
447 bool hold_primary_context_refcount_unit)
449 return kernel_t{device_id, context_handle, handle, hold_primary_context_refcount_unit };
455 return detail_::get_attribute_in_current_context(kernel.
handle(), attribute);
461 return detail_::set_attribute_in_current_context(kernel.
handle(), attribute, value);
464 namespace occupancy {
472 bool disable_caching_override)
476 auto flags =
static_cast<unsigned>(disable_caching_override) ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
477 cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
478 &result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
480 "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
484 #if CUDA_VERSION >= 10000 488 CUfunction kernel_handle,
490 CUoccupancyB2DSize determine_shared_mem_by_block_size,
493 bool disable_caching_override)
495 int min_grid_size_in_blocks { 0 };
496 int block_size { 0 };
500 auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
501 &min_grid_size_in_blocks, &block_size,
503 determine_shared_mem_by_block_size,
504 fixed_shared_mem_size,
505 static_cast<int>(block_size_limit),
506 disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
510 "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
511 +
" with maximum occupancy given dynamic shared memory and block size data");
512 return {
static_cast<grid::dimension_t>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
514 #endif // CUDA_VERSION >= 10000 518 #if CUDA_VERSION >= 11000 528 auto status = cuOccupancyAvailableDynamicSMemPerBlock(
529 &result, kernel.
handle(),
static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
531 "the number of blocks on a multiprocessor and their size");
534 #endif // CUDA_VERSION >= 11000 543 bool disable_caching_override =
false);
549 inline ::std::string identify(
const kernel_t& kernel)
551 return kernel::detail_::identify(kernel.
handle()) +
" in " + context::detail_::identify(kernel.
context());
558 #if CUDA_VERSION >= 10000 562 bool disable_caching_override)
const 565 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
566 handle(), device_id(), no_shared_memory_size_determiner,
567 dynamic_shared_memory_size, block_size_limit, disable_caching_override);
573 bool disable_caching_override)
const 576 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
577 handle(), device_id(), shared_memory_size_determiner,
578 no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
580 #endif // CUDA_VERSION >= 10000 585 bool disable_caching_override)
const 587 return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
588 handle(), block_size_in_threads,
589 dynamic_shared_memory_per_block, disable_caching_override);
602 return not (lhs == rhs);
607 #endif // CUDA_API_WRAPPERS_KERNEL_HPP_ context::handle_t context_handle() const noexcept
Get the raw handle of the context in which this kernel is defined.
Definition: kernel.hpp:171
int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:988
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:299
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t ptx_version() const
Definition: kernel.hpp:223
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t binary_compilation_target_architecture() const
Definition: kernel.hpp:231
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
Wrapper class for a CUDA code module.
Definition: module.hpp:123
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
Sets a device function's preference of shared memory bank size.
Definition: kernel.hpp:383
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
Change the hardware resource carve-out between L1 cache and shared memory for launches of the kernel ...
Definition: kernel.hpp:333
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
CUmodule handle_t
Raw CUDA driver handle of a module of compiled code; see module_t.
Definition: module.hpp:34
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_cache_preference(multiprocessor_cache_preference_t preference) const
Indicate the desired carve-out between shared memory and L1 cache when launching this kernel - with c...
Definition: kernel.hpp:367
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: kernel.hpp:582
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:983
Classes representing specific and overall properties of CUDA devices.