10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_ 11 #define CUDA_API_WRAPPERS_KERNEL_HPP_ 18 #if CUDA_VERSION < 11000 19 #define CAN_GET_APRIORI_KERNEL_HANDLE 0 20 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual 22 #define CAN_GET_APRIORI_KERNEL_HANDLE 1 23 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE 34 using shared_memory_size_determiner_t = size_t (CUDA_CB *)(
int block_size);
49 context::handle_t context_id,
51 bool hold_primary_context_refcount_unit =
false);
55 inline ::std::string identify(
const kernel_t&
kernel);
// Human-readable name for a CUfunction_attribute index, for error messages.
// The table order follows the CU_FUNC_ATTRIBUTE_* enumerators (0..9).
static const char* attribute_name(int attribute_index)
{
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	// Fix: guard against out-of-bounds reads. CUDA versions past 11.0 define
	// attribute indices beyond 9 (e.g. the cluster-related attributes), and the
	// original indexed the array unconditionally.
	constexpr int num_names = static_cast<int>(sizeof(names) / sizeof(names[0]));
	if (attribute_index < 0 || attribute_index >= num_names) {
		return "(unrecognized function attribute)";
	}
	return names[attribute_index];
}
75 inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
77 kernel::attribute_value_t attribute_value;
78 auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
79 throw_if_error_lazy(result, ::std::string(
"Failed obtaining attribute ") + attribute_name(attribute));
80 return attribute_value;
83 inline void set_attribute_in_current_context(handle_t handle, attribute_t attribute, attribute_value_t value)
85 #if CUDA_VERSION >= 9000 86 auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
87 throw_if_error_lazy(result,
88 "Setting CUDA device function attribute " +
89 ::std::string(kernel::detail_::attribute_name(attribute)) +
" of function at " 90 + cuda::kernel::detail_::identify(handle) +
" to value " + ::std::to_string(value));
118 context_t context() const noexcept;
119 device_t device() const noexcept;
121 device::
id_t device_id() const noexcept {
return device_id_; }
122 context::handle_t context_handle() const noexcept {
return context_handle_; }
123 #if CAN_GET_APRIORI_KERNEL_HANDLE 124 kernel::handle_t handle() const noexcept {
return handle_; }
126 kernel::handle_t handle()
const 129 if (handle_ ==
nullptr) {
130 throw runtime_error(status::named_t::invalid_resource_handle,
131 "CUDA driver handle unavailable for kernel");
140 kernel_t& operator=(
const kernel_t&) =
delete;
141 kernel_t& operator=(kernel_t&& other) noexcept
143 ::std::swap(device_id_, other.device_id_);
144 ::std::swap(context_handle_, other.context_handle_);
145 ::std::swap(handle_, other.handle_);
146 ::std::swap(holds_pc_refcount_unit, holds_pc_refcount_unit);
153 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
154 kernel::attribute_value_t
get_attribute(kernel::attribute_t attribute)
const 156 return kernel::get_attribute(*
this, attribute);
159 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
162 auto raw_attribute =
get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
163 return device::compute_capability_t::from_combined_number(raw_attribute);
166 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
168 auto raw_attribute =
get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
169 return device::compute_capability_t::from_combined_number(raw_attribute);
179 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
180 grid::block_dimension_t maximum_threads_per_block()
const 182 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
#if CUDA_VERSION >= 10000
// Determine minimum grid parameters achieving maximum occupancy, given a
// fixed dynamic shared memory size per block.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
grid::composite_dimensions_t min_grid_params_for_max_occupancy(
	memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
	grid::block_dimension_t block_size_limit = 0,
	bool disable_caching_override = false) const;

// Overload: dynamic shared memory size is computed per candidate block size
// by the supplied callback.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
grid::composite_dimensions_t min_grid_params_for_max_occupancy(
	kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
	grid::block_dimension_t block_size_limit = 0,
	bool disable_caching_override = false) const;
#endif // CUDA_VERSION >= 10000
246 grid::dimension_t max_active_blocks_per_multiprocessor(
247 grid::block_dimension_t block_size_in_threads,
248 memory::shared::size_t dynamic_shared_memory_per_block,
249 bool disable_caching_override =
false)
const;
255 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
256 void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value)
const;
271 auto amount_required_by_kernel_ =
static_cast<kernel::attribute_value_t
>(amount_required_by_kernel);
272 if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
273 throw ::std::invalid_argument(
"Requested amount of maximum shared memory exceeds the " 274 "representation range for kernel attribute values");
277 set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
280 memory::shared::size_t get_maximum_dynamic_shared_memory_per_block()
const 282 return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
302 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
305 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
306 auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
307 throw_if_error_lazy(result,
308 "Setting the multiprocessor L1/Shared Memory cache distribution preference for a " 309 "CUDA device function");
317 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
321 context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
322 auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
323 throw_if_error_lazy(result,
"Failed setting the shared memory bank size");
329 context::handle_t context_handle,
330 kernel::handle_t handle,
331 bool hold_primary_context_refcount_unit)
333 device_id_(device_id),
334 context_handle_(context_handle),
336 holds_pc_refcount_unit(hold_primary_context_refcount_unit)
340 friend kernel_t kernel::wrap(
device::id_t, context::handle_t, kernel::handle_t,
bool);
342 kernel_t(
const kernel_t& other) :
343 kernel_t(other.device_id_, other.context_handle_, other.handle_, false) { }
345 kernel_t(kernel_t&& other) :
346 kernel_t(other.device_id_, other.context_handle_, other.handle_, false)
348 ::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
352 VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
353 ~kernel_t() NOEXCEPT_IF_NDEBUG
356 if (holds_pc_refcount_unit) {
358 device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
362 device::primary_context::detail_::decrease_refcount(device_id_);
369 context::handle_t context_handle_;
370 mutable kernel::handle_t handle_;
371 bool holds_pc_refcount_unit;
376 inline kernel_t
wrap(
378 context::handle_t context_id,
380 bool hold_primary_context_refcount_unit)
382 return kernel_t{ device_id, context_id, f, hold_primary_context_refcount_unit };
387 CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
388 return detail_::get_attribute_in_current_context(kernel.handle(), attribute);
391 inline void set_attribute(
const kernel_t& kernel, attribute_t attribute, attribute_value_t value)
393 CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
394 return detail_::set_attribute_in_current_context(kernel.handle(), attribute, value);
397 inline attribute_value_t set_attribute(
const kernel_t& kernel, attribute_t attribute)
399 CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
400 return kernel::detail_::get_attribute_in_current_context(kernel.handle(), attribute);
403 namespace occupancy {
407 inline grid::dimension_t max_active_blocks_per_multiprocessor(
409 grid::block_dimension_t block_size_in_threads,
410 memory::shared::size_t dynamic_shared_memory_per_block,
411 bool disable_caching_override)
415 auto flags =
static_cast<unsigned>(disable_caching_override) ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
416 cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
417 &result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
418 throw_if_error_lazy(status,
419 "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
423 #if CUDA_VERSION >= 10000 426 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
427 CUfunction kernel_handle,
429 CUoccupancyB2DSize determine_shared_mem_by_block_size,
432 bool disable_caching_override)
434 int min_grid_size_in_blocks { 0 };
435 int block_size { 0 };
439 auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
440 &min_grid_size_in_blocks, &block_size,
442 determine_shared_mem_by_block_size,
443 fixed_shared_mem_size,
444 static_cast<int>(block_size_limit),
445 disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
448 throw_if_error_lazy(result,
449 "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
450 +
" with maximum occupancy given dynamic shared memory and block size data");
451 return {
static_cast<grid::dimension_t
>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
453 #endif // CUDA_VERSION >= 10000 457 #if CUDA_VERSION >= 11000 461 inline memory::shared::size_t max_dynamic_shared_memory_per_block(
462 const kernel_t &kernel,
463 grid::dimension_t blocks_on_multiprocessor,
464 grid::block_dimension_t block_size_in_threads)
467 auto status = cuOccupancyAvailableDynamicSMemPerBlock(
468 &result, kernel.handle(),
static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
469 throw_if_error_lazy(status,
"Determining the available dynamic memory per block, given " 470 "the number of blocks on a multiprocessor and their size");
471 return static_cast<memory::shared::size_t
>(result);
473 #endif // CUDA_VERSION >= 11000 478 inline grid::dimension_t max_active_blocks_per_multiprocessor(
479 const kernel_t &kernel,
480 grid::block_dimension_t block_size_in_threads,
481 memory::shared::size_t dynamic_shared_memory_per_block,
482 bool disable_caching_override =
false);
488 inline ::std::string identify(
const kernel_t& kernel)
490 return kernel::detail_::identify(kernel.handle()) +
" in " + context::detail_::identify(kernel.context());
497 #if CUDA_VERSION >= 10000 498 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
499 memory::shared::size_t dynamic_shared_memory_size,
500 grid::block_dimension_t block_size_limit,
501 bool disable_caching_override)
const 503 kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner {
nullptr };
504 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
505 handle(), device_id(), no_shared_memory_size_determiner,
506 dynamic_shared_memory_size, block_size_limit, disable_caching_override);
509 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
510 kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
512 bool disable_caching_override)
const 514 memory::shared::size_t no_fixed_dynamic_shared_memory_size{ 0 };
515 return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
516 handle(), device_id(), shared_memory_size_determiner,
517 no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
519 #endif // CUDA_VERSION >= 10000 521 inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
522 grid::block_dimension_t block_size_in_threads,
523 memory::shared::size_t dynamic_shared_memory_per_block,
524 bool disable_caching_override)
const 526 return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
527 handle(), block_size_in_threads,
528 dynamic_shared_memory_per_block, disable_caching_override);
#endif // CUDA_API_WRAPPERS_KERNEL_HPP_

/* NOTE(review): everything below is Doxygen cross-reference tooltip text that
 * was captured along with the source listing (extraction residue, not part of
 * the original header). Preserved here as a comment:
 *
 * array_t<T, NumDimensions> wrap(device::id_t device_id, context::handle_t
 *   context_handle, handle_t handle, dimensions_t<NumDimensions> dimensions)
 *   noexcept - Wrap an existing CUDA array in an array_t instance.
 *   Definition: array.hpp:248
 * All definitions and functionality wrapping the CUDA Runtime API.
 *   Definition: array.hpp:22
 * Definition: kernel_launch.hpp:238
 * dimension_t block_dimension_t - CUDA kernels are launched in grids of
 *   blocks of threads, in 3 dimensions. Definition: types.hpp:332
 * CUdevice id_t - Numeric ID of a CUDA device used by the CUDA Runtime API.
 *   Definition: types.hpp:752
 * A numeric designator of the computational capabilities of a CUDA device.
 *   Definition: device_properties.hpp:74
 * Definition: kernel_launch.hpp:77
 * multiprocessor_shared_memory_bank_size_option_t - A physical core (SM)'s
 *   shared memory has multiple "banks"; at most one datum per bank may be
 *   accessed... Definition: types.hpp:732
 * unsigned size_t - Each physical core ("Symmetric Multiprocessor") on an
 *   nVIDIA GPU has a space of shared memory (see th... Definition: types.hpp:649
 * A (base?) class for exceptions raised by CUDA code; these errors are thrown
 *   by essentially all CUDA R... Definition: error.hpp:280
 * multiprocessor_cache_preference_t - L1-vs-shared-memory balance option.
 *   Definition: types.hpp:706
 * attribute_value_t get_attribute(attribute_t attribute, const device_t
 *   &first, const device_t &second) - Get one of the numeric attributes for
 *   a(n ordered) pair of devices, relating to their interaction...
 *   Definition: device.hpp:110
 * Facilities for exception-based handling of Runtime and Driver API errors,
 *   including a basic exception...
 * Fundamental CUDA-related type definitions.
 * CUresult status_t - Indicates either the result (success or error index) of
 *   a CUDA Runtime or Driver API call... Definition: types.hpp:136
 */