cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel.hpp
Go to the documentation of this file.
1 
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_
11 #define CUDA_API_WRAPPERS_KERNEL_HPP_
12 
13 #include "primary_context.hpp"
14 #include "current_context.hpp"
15 #include "device_properties.hpp"
16 #include "error.hpp"
17 #include "types.hpp"
18 
20 #if CUDA_VERSION < 11000
21 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 0
22 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual
23 #else
24 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 1
25 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
26 #endif
27 
29 namespace cuda {
30 
32 class kernel_t;
34 
35 namespace kernel {
36 
44 using shared_memory_size_determiner_t = size_t (CUDA_CB *)(int block_size);
45 
60 kernel_t wrap(
61  device::id_t device_id,
62  context::handle_t context_handle,
63  kernel::handle_t handle,
64  bool hold_primary_context_refcount_unit = false);
65 
66 namespace detail_ {
67 
68 inline ::std::string identify(const kernel_t& kernel);
69 
/// @returns a human-readable description of a kernel (CUfunction) attribute,
/// for use in error messages
///
/// @note the entries correspond, in order, to the values of
/// enum CUfunction_attribute_enum
static const char* attribute_name(int attribute_index)
{
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	constexpr int num_names = sizeof(names) / sizeof(names[0]);
	// Guard against indices outside the table rather than reading past the array
	if (attribute_index < 0 or attribute_index >= num_names) {
		return "(unknown kernel attribute)";
	}
	return names[attribute_index];
}
87 
88 inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
89 {
90  kernel::attribute_value_t attribute_value;
91  auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
92  throw_if_error_lazy(result, ::std::string("Failed obtaining attribute ") + attribute_name(attribute));
93  return attribute_value;
94 }
95 
96 inline void set_attribute_in_current_context(handle_t handle, attribute_t attribute, attribute_value_t value)
97 {
98 #if CUDA_VERSION >= 9000
99  auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
100  throw_if_error_lazy(result,
101  "Setting CUDA device function attribute " +
102  ::std::string(kernel::detail_::attribute_name(attribute)) + " of function at "
103  + cuda::kernel::detail_::identify(handle) + " to value " + ::std::to_string(value));
104 #else
105  throw(cuda::runtime_error {cuda::status::not_yet_implemented});
106 #endif
107 }
108 
109 #if CUDA_VERSION >= 12030
110 inline const char * get_name_in_current_context(handle_t handle)
111 {
112  const char* result;
113  auto status = cuFuncGetName(&result, handle);
114  throw_if_error_lazy(status, "Failed obtaining the name for " + identify(handle));
115  return result;
116 }
117 
118 inline const char * get_name(context::handle_t context_handle, handle_t kernel_handle)
119 {
120  CAW_SET_SCOPE_CONTEXT(context_handle);
121  return get_name_in_current_context(kernel_handle);
122 }
123 
124 inline module::handle_t get_module_in_current_context(handle_t handle)
125 {
126  module::handle_t result;
127  auto status = cuFuncGetModule(&result, handle);
128  throw_if_error_lazy(status, "Failed obtaining the module containing " + identify(handle));
129  return result;
130 }
131 
132 inline module::handle_t get_module(context::handle_t context_handle, handle_t kernel_handle)
133 {
134  CAW_SET_SCOPE_CONTEXT(context_handle);
135  return get_module_in_current_context(kernel_handle);
136 }
137 
138 #endif // CUDA_VERSION >= 12030
139 
140 } // namespace detail_
141 
142 inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute);
143 
144 } // namespace kernel
145 
159 class kernel_t {
160 
161 public: // getters
162 
164  context_t context() const noexcept;
166  device_t device() const noexcept;
167 
169  device::id_t device_id() const noexcept { return device_id_; }
171  context::handle_t context_handle() const noexcept { return context_handle_; }
178 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
179  kernel::handle_t handle() const noexcept { return handle_; }
180 #else
181  kernel::handle_t handle() const
182  {
183 #ifndef NDEBUG
184  if (handle_ == nullptr) {
185  throw runtime_error(status::named_t::invalid_resource_handle,
186  "CUDA driver handle unavailable for kernel");
187  }
188 #endif
189  return handle_;
190  }
191 #endif
192 
193 #if CUDA_VERSION >= 12030
194  const char *mangled_name() const { return cuda::kernel::detail_::get_name(context_handle_, handle_); }
197  module_t module() const;
198 #endif
199 
200 public: // operators
201 
202  kernel_t& operator=(const kernel_t&) = delete;
203  kernel_t& operator=(kernel_t&& other) noexcept
204  {
205  ::std::swap(device_id_, other.device_id_);
206  ::std::swap(context_handle_, other.context_handle_);
207  ::std::swap(handle_, other.handle_);
208  ::std::swap(holds_pc_refcount_unit, holds_pc_refcount_unit);
209  return *this;
210  }
211 
212 
213 public: // non-mutators
214 
215  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
216  kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const
217  {
218  return kernel::get_attribute(*this, attribute);
219  }
220 
222  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
224  {
225  auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
227  }
228 
230  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
232  auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
234  }
235 
243  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
245  {
246  return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
247  }
248 
249 #if CUDA_VERSION >= 10000
250 
276  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
278  grid::composite_dimensions_t min_grid_params_for_max_occupancy(
279  memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
280  grid::block_dimension_t block_size_limit = 0,
281  bool disable_caching_override = false) const;
282 
283  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
284  grid::composite_dimensions_t min_grid_params_for_max_occupancy(
285  kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
286  grid::block_dimension_t block_size_limit = 0,
287  bool disable_caching_override = false) const;
289 #endif // CUDA_VERSION >= 10000
290 
309  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
310  grid::dimension_t max_active_blocks_per_multiprocessor(
311  grid::block_dimension_t block_size_in_threads,
312  memory::shared::size_t dynamic_shared_memory_per_block,
313  bool disable_caching_override = false) const;
314 
315 
316 
317 public: // methods mutating the kernel-in-context, but not this reference object
318 
319  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
320  void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const;
321 
334  {
335  auto amount_required_by_kernel_ = static_cast<kernel::attribute_value_t>(amount_required_by_kernel);
336  if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
337  throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the "
338  "representation range for kernel attribute values");
339  }
340  // TODO: Consider a check in debug mode for the value being within range
341  set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
342  }
343 
344  memory::shared::size_t get_maximum_dynamic_shared_memory_per_block() const
345  {
346  return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
347  }
348 
366  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
368  {
369  context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
370  auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
371  throw_if_error_lazy(result,
372  "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
373  "CUDA device function");
374  }
375 
376 #if CUDA_VERSION < 12030
377 
382  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
384  {
385  // TODO: Need to set a context, not a device
386  context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
387  auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
388  throw_if_error_lazy(result, "Failed setting the shared memory bank size");
389  }
390 #endif // CUDA_VERSION < 12030
391 
392 
393 protected: // ctors & dtor
394  kernel_t(
395  device::id_t device_id,
396  context::handle_t context_handle,
397  kernel::handle_t handle,
398  bool hold_primary_context_refcount_unit)
399  :
400  device_id_(device_id),
401  context_handle_(context_handle),
402  handle_(handle),
403  holds_pc_refcount_unit(hold_primary_context_refcount_unit)
404  { }
405 
406 public: // ctors & dtor
407  friend kernel_t kernel::wrap(device::id_t, context::handle_t, kernel::handle_t, bool);
408 
409  kernel_t(const kernel_t& other) :
410  kernel_t(other.device_id_, other.context_handle_, other.handle_, false) { }
411 
412  kernel_t(kernel_t&& other) :
413  kernel_t(other.device_id_, other.context_handle_, other.handle_, false)
414  {
415  ::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
416  }
417 
418 public: // ctors & dtor
419  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
420  ~kernel_t() NOEXCEPT_IF_NDEBUG
421  {
422  // TODO: DRY
423  if (holds_pc_refcount_unit) {
424 #ifdef NDEBUG
425  device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
426  // Note: "Swallowing" any potential error to avoid ::std::terminate(); also,
427  // because a failure probably means the primary context is inactive already
428 #else
429  device::primary_context::detail_::decrease_refcount(device_id_);
430 #endif
431  }
432  }
433 
434 protected: // data members
435  device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can?
436  context::handle_t context_handle_;
437  mutable kernel::handle_t handle_;
438  bool holds_pc_refcount_unit;
439 }; // kernel_t
440 
441 namespace kernel {
442 
443 inline kernel_t wrap(
444  device::id_t device_id,
445  context::handle_t context_handle,
446  kernel::handle_t handle,
447  bool hold_primary_context_refcount_unit)
448 {
449  return kernel_t{device_id, context_handle, handle, hold_primary_context_refcount_unit };
450 }
451 
452 inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute)
453 {
454  CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
455  return detail_::get_attribute_in_current_context(kernel.handle(), attribute);
456 }
457 
458 inline void set_attribute(const kernel_t& kernel, attribute_t attribute, attribute_value_t value)
459 {
460  CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
461  return detail_::set_attribute_in_current_context(kernel.handle(), attribute, value);
462 }
463 
464 namespace occupancy {
465 
466 namespace detail_ {
467 
468 inline grid::dimension_t max_active_blocks_per_multiprocessor(
469  handle_t handle,
470  grid::block_dimension_t block_size_in_threads,
471  memory::shared::size_t dynamic_shared_memory_per_block,
472  bool disable_caching_override)
473 {
474  int result;
475  // We don't need the initialization, but NVCC backed by GCC 8 warns us about it.
476  auto flags = static_cast<unsigned>(disable_caching_override) ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
477  cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
478  &result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
479  throw_if_error_lazy(status,
480  "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
481  return result;
482 }
483 
484 #if CUDA_VERSION >= 10000
485 // Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
486 // if block_size_limit is 0, it is ignored.
487 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
488  CUfunction kernel_handle,
489  cuda::device::id_t device_id,
490  CUoccupancyB2DSize determine_shared_mem_by_block_size,
491  cuda::memory::shared::size_t fixed_shared_mem_size,
492  cuda::grid::block_dimension_t block_size_limit,
493  bool disable_caching_override)
494 {
495  int min_grid_size_in_blocks { 0 };
496  int block_size { 0 };
497  // Note: only initializing the values her because of a
498  // spurious (?) compiler warning about potential uninitialized use.
499 
500  auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
501  &min_grid_size_in_blocks, &block_size,
502  kernel_handle,
503  determine_shared_mem_by_block_size,
504  fixed_shared_mem_size,
505  static_cast<int>(block_size_limit),
506  disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
507  );
508 
509  throw_if_error_lazy(result,
510  "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
511  + " with maximum occupancy given dynamic shared memory and block size data");
512  return { static_cast<grid::dimension_t>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
513 }
514 #endif // CUDA_VERSION >= 10000
515 
516 } // namespace detail_
517 
518 #if CUDA_VERSION >= 11000
519 
522 inline memory::shared::size_t max_dynamic_shared_memory_per_block(
523  const kernel_t &kernel,
524  grid::dimension_t blocks_on_multiprocessor,
525  grid::block_dimension_t block_size_in_threads)
526 {
527  size_t result;
528  auto status = cuOccupancyAvailableDynamicSMemPerBlock(
529  &result, kernel.handle(), static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
530  throw_if_error_lazy(status, "Determining the available dynamic memory per block, given "
531  "the number of blocks on a multiprocessor and their size");
532  return static_cast<memory::shared::size_t>(result);
533 }
534 #endif // CUDA_VERSION >= 11000
535 
539 inline grid::dimension_t max_active_blocks_per_multiprocessor(
540  const kernel_t &kernel,
541  grid::block_dimension_t block_size_in_threads,
542  memory::shared::size_t dynamic_shared_memory_per_block,
543  bool disable_caching_override = false);
544 
545 } // namespace occupancy
546 
547 namespace detail_ {
548 
549 inline ::std::string identify(const kernel_t& kernel)
550 {
551  return kernel::detail_::identify(kernel.handle()) + " in " + context::detail_::identify(kernel.context());
552 }
553 
554 } // namespace detail_
555 
556 } // namespace kernel
557 
558 #if CUDA_VERSION >= 10000
559 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
560  memory::shared::size_t dynamic_shared_memory_size,
561  grid::block_dimension_t block_size_limit,
562  bool disable_caching_override) const
563 {
564  kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner { nullptr };
565  return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
566  handle(), device_id(), no_shared_memory_size_determiner,
567  dynamic_shared_memory_size, block_size_limit, disable_caching_override);
568 }
569 
570 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
571  kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
572  cuda::grid::block_dimension_t block_size_limit,
573  bool disable_caching_override) const
574 {
575  memory::shared::size_t no_fixed_dynamic_shared_memory_size{ 0 };
576  return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
577  handle(), device_id(), shared_memory_size_determiner,
578  no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
579 }
580 #endif // CUDA_VERSION >= 10000
581 
583  grid::block_dimension_t block_size_in_threads,
584  memory::shared::size_t dynamic_shared_memory_per_block,
585  bool disable_caching_override) const
586 {
587  return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
588  handle(), block_size_in_threads,
589  dynamic_shared_memory_per_block, disable_caching_override);
590 }
591 
592 inline bool operator==(const kernel_t& lhs, const kernel_t& rhs) noexcept
593 {
594  return
595  lhs.device_id() == rhs.device_id()
596  and lhs.context_handle() == rhs.context_handle()
597  and lhs.handle() == rhs.handle();
598 }
599 
600 inline bool operator!=(const kernel_t& lhs, const kernel_t& rhs) noexcept
601 {
602  return not (lhs == rhs);
603 }
604 
605 } // namespace cuda
606 
607 #endif // CUDA_API_WRAPPERS_KERNEL_HPP_
context::handle_t context_handle() const noexcept
Get the raw handle of the context in which this kernel is defined.
Definition: kernel.hpp:171
int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:988
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:299
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t ptx_version() const
Definition: kernel.hpp:223
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t binary_compilation_target_architecture() const
Definition: kernel.hpp:231
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
Wrapper class for a CUDA code module.
Definition: module.hpp:123
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
Sets a device function's preference of shared memory bank size.
Definition: kernel.hpp:383
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
Change the hardware resource carve-out between L1 cache and shared memory for launches of the kernel ...
Definition: kernel.hpp:333
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
CUmodule handle_t
Raw CUDA driver handle of a module of compiled code; see module_t.
Definition: module.hpp:34
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_cache_preference(multiprocessor_cache_preference_t preference) const
Indicate the desired carve-out between shared memory and L1 cache when launching this kernel - with c...
Definition: kernel.hpp:367
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: kernel.hpp:582
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:983
Classes representing specific and overall properties of CUDA devices.