/**
 * @file kernel.hpp
 *
 * Part of cuda-api-wrappers: Thin C++-flavored wrappers for the CUDA Runtime API.
 */
#pragma once
#ifndef CUDA_API_WRAPPERS_KERNEL_HPP_
#define CUDA_API_WRAPPERS_KERNEL_HPP_

#include "primary_context.hpp"
#include "current_context.hpp"
#include "error.hpp"
#include "types.hpp"

#if CUDA_VERSION < 11000
#define CAN_GET_APRIORI_KERNEL_HANDLE 0
#define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual
#else
#define CAN_GET_APRIORI_KERNEL_HANDLE 1
#define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
#endif

namespace cuda {

class kernel_t;

namespace kernel {

using shared_memory_size_determiner_t = size_t (CUDA_CB *)(int block_size);
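
// A possible shared-memory-size determiner - a minimal sketch, not part of the
// library: given a prospective block size, it reports how much dynamic shared
// memory the kernel would need for it (here, hypothetically, one int per thread):
//
//     size_t CUDA_CB shared_mem_for_block_size(int block_size)
//     {
//         return static_cast<size_t>(block_size) * sizeof(int);
//     }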

kernel_t wrap(
	device::id_t device_id,
	context::handle_t context_id,
	kernel::handle_t f,
	bool hold_primary_context_refcount_unit = false);
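
// Usage sketch (illustrative only; `module_handle`, `dev_id` and `ctx_handle`
// are hypothetical values obtained elsewhere through the driver API):
//
//     CUfunction raw_kernel_handle;
//     cuModuleGetFunction(&raw_kernel_handle, module_handle, "my_kernel");
//     auto kernel = cuda::kernel::wrap(dev_id, ctx_handle, raw_kernel_handle);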

namespace detail_ {

inline ::std::string identify(const kernel_t& kernel);

static const char* attribute_name(int attribute_index)
{
	// Note: These correspond to the values of enum CUfunction_attribute_enum
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication of whether the function was compiled with cache mode CA",
		"Maximum allowed size, in bytes, of dynamically-allocated shared memory",
		"Preferred shared memory carve-out, as a percentage of the maximum shared memory"
	};
	return names[attribute_index];
}

inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
{
	kernel::attribute_value_t attribute_value;
	auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
	throw_if_error_lazy(result, ::std::string("Failed obtaining attribute ") + attribute_name(attribute));
	return attribute_value;
}

inline void set_attribute_in_current_context(handle_t handle, attribute_t attribute, attribute_value_t value)
{
#if CUDA_VERSION >= 9000
	auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
	throw_if_error_lazy(result,
		"Setting CUDA device function attribute " +
		::std::string(kernel::detail_::attribute_name(attribute)) + " of function at "
		+ cuda::kernel::detail_::identify(handle) + " to value " + ::std::to_string(value));
#else
	throw cuda::runtime_error { cuda::status::not_yet_implemented };
#endif
}

} // namespace detail_

inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute);
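
// Example (illustrative): reading a wrapped kernel's register usage, passing one
// of the driver's CUfunction_attribute enumerators directly:
//
//     auto num_regs = cuda::kernel::get_attribute(kernel, CU_FUNC_ATTRIBUTE_NUM_REGS);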

} // namespace kernel

class kernel_t {

public: // getters
	context_t context() const noexcept;
	device_t device() const noexcept;

	device::id_t device_id() const noexcept { return device_id_; }
	context::handle_t context_handle() const noexcept { return context_handle_; }
#if CAN_GET_APRIORI_KERNEL_HANDLE
	kernel::handle_t handle() const noexcept { return handle_; }
#else
	kernel::handle_t handle() const
	{
#ifndef NDEBUG
		if (handle_ == nullptr) {
			throw runtime_error(status::named_t::invalid_resource_handle,
				"CUDA driver handle unavailable for kernel");
		}
#endif
		return handle_;
	}
#endif

public: // operators

	kernel_t& operator=(const kernel_t&) = delete;
	kernel_t& operator=(kernel_t&& other) noexcept
	{
		::std::swap(device_id_, other.device_id_);
		::std::swap(context_handle_, other.context_handle_);
		::std::swap(handle_, other.handle_);
		::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
		return *this;
	}


public: // non-mutators

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const
	{
		return kernel::get_attribute(*this, attribute);
	}

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	cuda::device::compute_capability_t ptx_version() const
	{
		auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
		return device::compute_capability_t::from_combined_number(raw_attribute);
	}
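
	// Example (illustrative): for a kernel compiled targeting the PTX ISA of
	// compute capability 7.5, the driver reports the combined value 75, which
	// from_combined_number() splits into major 7, minor 5:
	//
	//     auto ptx_cc = kernel.ptx_version(); // e.g. compute capability 7.5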

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	cuda::device::compute_capability_t binary_compilation_target_architecture() const
	{
		auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
		return device::compute_capability_t::from_combined_number(raw_attribute);
	}

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	grid::block_dimension_t maximum_threads_per_block() const
	{
		return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
	}

#if CUDA_VERSION >= 10000

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	grid::composite_dimensions_t min_grid_params_for_max_occupancy(
		memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
		grid::block_dimension_t block_size_limit = 0,
		bool disable_caching_override = false) const;

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	grid::composite_dimensions_t min_grid_params_for_max_occupancy(
		kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
		grid::block_dimension_t block_size_limit = 0,
		bool disable_caching_override = false) const;
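
	// Usage sketch (illustrative): choose launch parameters maximizing occupancy
	// for a kernel requiring 1 KiB of dynamic shared memory per block; the `grid`
	// and `block` member names on the result are assumptions here:
	//
	//     auto dims = kernel.min_grid_params_for_max_occupancy(1024);
	//     // ... use dims.grid and dims.block to build a launch configuration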
#endif // CUDA_VERSION >= 10000

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	grid::dimension_t max_active_blocks_per_multiprocessor(
		grid::block_dimension_t block_size_in_threads,
		memory::shared::size_t dynamic_shared_memory_per_block,
		bool disable_caching_override = false) const;
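
	// Example (illustrative): estimate how many threads can be resident on one
	// multiprocessor when launching 256-thread blocks using no dynamic shared memory:
	//
	//     auto blocks_per_sm = kernel.max_active_blocks_per_multiprocessor(256, 0);
	//     auto threads_per_sm = blocks_per_sm * 256;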


public: // methods mutating the kernel-in-context, but not this reference object

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const;

	void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
	{
		auto amount_required_by_kernel_ = static_cast<kernel::attribute_value_t>(amount_required_by_kernel);
		if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
			throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the "
				"representation range for kernel attribute values");
		}
		// TODO: Consider a check in debug mode for the value being within range
		set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, amount_required_by_kernel_);
	}
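
	// Example (illustrative): opting this kernel into 64 KiB of dynamic shared
	// memory per block - beyond the default 48 KiB limit - on hardware which
	// supports such an amount:
	//
	//     kernel.set_maximum_dynamic_shared_memory_per_block(64 * 1024);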

	memory::shared::size_t get_maximum_dynamic_shared_memory_per_block() const
	{
		return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
	}

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	void set_cache_preference(multiprocessor_cache_preference_t preference) const
	{
		context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
		auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
		throw_if_error_lazy(result,
			"Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
			"CUDA device function");
	}
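
	// Example (illustrative; assumes multiprocessor_cache_preference_t, defined in
	// types.hpp, offers a shared-memory-favoring enumerator named along these lines):
	//
	//     kernel.set_cache_preference(
	//         multiprocessor_cache_preference_t::prefer_shared_memory_over_l1);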

	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
	{
		// TODO: Need to set a context, not a device
		context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
		auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config));
		throw_if_error_lazy(result, "Failed setting the shared memory bank size");
	}
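
	// Example (illustrative; the enumerator name is hypothetical - the actual
	// values of multiprocessor_shared_memory_bank_size_option_t, defined in
	// types.hpp, mirror the driver's CUsharedconfig options):
	//
	//     kernel.set_shared_memory_bank_size(
	//         multiprocessor_shared_memory_bank_size_option_t::eight_bytes_per_bank);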

protected: // ctors & dtor
	kernel_t(
		device::id_t device_id,
		context::handle_t context_handle,
		kernel::handle_t handle,
		bool hold_primary_context_refcount_unit)
	:
		device_id_(device_id),
		context_handle_(context_handle),
		handle_(handle),
		holds_pc_refcount_unit(hold_primary_context_refcount_unit)
	{ }

public: // ctors & dtor
	friend kernel_t kernel::wrap(device::id_t, context::handle_t, kernel::handle_t, bool);

	kernel_t(const kernel_t& other) :
		kernel_t(other.device_id_, other.context_handle_, other.handle_, false) { }

	kernel_t(kernel_t&& other) :
		kernel_t(other.device_id_, other.context_handle_, other.handle_, false)
	{
		::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
	}

public: // ctors & dtor
	VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
	~kernel_t() NOEXCEPT_IF_NDEBUG
	{
		// TODO: DRY
		if (holds_pc_refcount_unit) {
#ifdef NDEBUG
			device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
			// Note: "Swallowing" any potential error to avoid ::std::terminate(); also,
			// because a failure probably means the primary context is inactive already
#else
			device::primary_context::detail_::decrease_refcount(device_id_);
#endif
		}
	}

protected: // data members
	device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can?
	context::handle_t context_handle_;
	mutable kernel::handle_t handle_;
	bool holds_pc_refcount_unit;
}; // kernel_t

namespace kernel {

inline kernel_t wrap(
	device::id_t device_id,
	context::handle_t context_id,
	kernel::handle_t f,
	bool hold_primary_context_refcount_unit)
{
	return kernel_t{ device_id, context_id, f, hold_primary_context_refcount_unit };
}

inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute)
{
	CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
	return detail_::get_attribute_in_current_context(kernel.handle(), attribute);
}

inline void set_attribute(const kernel_t& kernel, attribute_t attribute, attribute_value_t value)
{
	CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
	return detail_::set_attribute_in_current_context(kernel.handle(), attribute, value);
}

namespace occupancy {

namespace detail_ {

inline grid::dimension_t max_active_blocks_per_multiprocessor(
	handle_t handle,
	grid::block_dimension_t block_size_in_threads,
	memory::shared::size_t dynamic_shared_memory_per_block,
	bool disable_caching_override)
{
	// The initialization is not strictly necessary, but NVCC backed by GCC 8
	// warns about potential uninitialized use without it.
	int result { 0 };
	auto flags = disable_caching_override ?
		CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
	cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
		&result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
	throw_if_error_lazy(status,
		"Determining the maximum occupancy in blocks per multiprocessor, given the block size "
		"and the amount of dynamic memory per block");
	return result;
}

#if CUDA_VERSION >= 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
// if block_size_limit is 0, it is ignored.
inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
	CUfunction kernel_handle,
	cuda::device::id_t device_id,
	CUoccupancyB2DSize determine_shared_mem_by_block_size,
	cuda::memory::shared::size_t fixed_shared_mem_size,
	cuda::grid::block_dimension_t block_size_limit,
	bool disable_caching_override)
{
	// Note: Only initializing the values here because of a
	// spurious (?) compiler warning about potential uninitialized use.
	int min_grid_size_in_blocks { 0 };
	int block_size { 0 };

	auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
		&min_grid_size_in_blocks, &block_size,
		kernel_handle,
		determine_shared_mem_by_block_size,
		fixed_shared_mem_size,
		static_cast<int>(block_size_limit),
		disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
	);

	throw_if_error_lazy(result,
		"Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
		+ " with maximum occupancy given dynamic shared memory and block size data");
	return { static_cast<grid::dimension_t>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
}
#endif // CUDA_VERSION >= 10000

} // namespace detail_

#if CUDA_VERSION >= 11000

inline memory::shared::size_t max_dynamic_shared_memory_per_block(
	const kernel_t& kernel,
	grid::dimension_t blocks_on_multiprocessor,
	grid::block_dimension_t block_size_in_threads)
{
	size_t result;
	auto status = cuOccupancyAvailableDynamicSMemPerBlock(
		&result, kernel.handle(), static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
	throw_if_error_lazy(status, "Determining the available dynamic memory per block, given "
		"the number of blocks on a multiprocessor and their size");
	return static_cast<memory::shared::size_t>(result);
}
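
// Example (illustrative): how much dynamic shared memory each block could still
// use if we want two 256-thread blocks resident per multiprocessor:
//
//     auto available = cuda::kernel::occupancy::max_dynamic_shared_memory_per_block(
//         kernel, 2, 256);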
#endif // CUDA_VERSION >= 11000

inline grid::dimension_t max_active_blocks_per_multiprocessor(
	const kernel_t& kernel,
	grid::block_dimension_t block_size_in_threads,
	memory::shared::size_t dynamic_shared_memory_per_block,
	bool disable_caching_override = false);

} // namespace occupancy

namespace detail_ {

inline ::std::string identify(const kernel_t& kernel)
{
	return kernel::detail_::identify(kernel.handle()) + " in " + context::detail_::identify(kernel.context());
}

} // namespace detail_

} // namespace kernel

#if CUDA_VERSION >= 10000
inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
	memory::shared::size_t dynamic_shared_memory_size,
	grid::block_dimension_t block_size_limit,
	bool disable_caching_override) const
{
	kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner { nullptr };
	return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
		handle(), device_id(), no_shared_memory_size_determiner,
		dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}

inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
	kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
	cuda::grid::block_dimension_t block_size_limit,
	bool disable_caching_override) const
{
	memory::shared::size_t no_fixed_dynamic_shared_memory_size { 0 };
	return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
		handle(), device_id(), shared_memory_size_determiner,
		no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000

inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
	grid::block_dimension_t block_size_in_threads,
	memory::shared::size_t dynamic_shared_memory_per_block,
	bool disable_caching_override) const
{
	return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
		handle(), block_size_in_threads,
		dynamic_shared_memory_per_block, disable_caching_override);
}

} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNEL_HPP_