/**
 * @file kernel.hpp
 *
 * Part of cuda-api-wrappers: thin C++-flavored wrappers for the CUDA Runtime API.
 */
#pragma once
#ifndef CUDA_API_WRAPPERS_KERNEL_HPP_
#define CUDA_API_WRAPPERS_KERNEL_HPP_

#include <cuda/api/types.hpp>
#include <cuda/api/error.hpp>
// #include <cuda/api/module.hpp>

#include <cuda_runtime.h>
#include <cuda.h>

#if CUDA_VERSION < 11000
#define CAN_GET_APRIORI_KERNEL_HANDLE 0
#define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual
#else
#define CAN_GET_APRIORI_KERNEL_HANDLE 1
#define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
#endif

namespace cuda {

class device_t;
class kernel_t;

namespace kernel {

using shared_memory_size_determiner_t = size_t (*)(int block_size);

kernel_t wrap(
    device::id_t device_id,
    context::handle_t context_id,
    kernel::handle_t f);

namespace detail_ {

inline ::std::string identify(const kernel_t& kernel);

#ifndef NDEBUG
static const char* attribute_name(int attribute_index)
{
    // Note: These correspond to the values of enum CUfunction_attribute_enum
    static const char* names[] = {
        "Maximum number of threads per block",
        "Statically-allocated shared memory size in bytes",
        "Required constant memory size in bytes",
        "Required local memory size in bytes",
        "Number of registers used by each thread",
        "PTX virtual architecture version into which the kernel code was compiled",
        "Binary architecture version for which the function was compiled",
        "Indication of whether the function was compiled with cache mode CA",
        "Maximum allowed size, in bytes, of dynamically-allocated shared memory",
        "Preferred shared memory carve-out, relative to the total shared memory"
    };
    return names[attribute_index];
}
#endif

inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
{
    kernel::attribute_value_t attribute_value;
    auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
    throw_if_error(result,
        ::std::string("Failed obtaining attribute ") +
#ifdef NDEBUG
        ::std::to_string(static_cast<::std::underlying_type<kernel::attribute_t>::type>(attribute))
#else
        attribute_name(attribute)
#endif
    );
    return attribute_value;
}

} // namespace detail_

} // namespace kernel

class kernel_t {

public: // getters
    context_t context() const noexcept;
    device_t device() const noexcept;

    device::id_t device_id() const noexcept { return device_id_; }
    context::handle_t context_handle() const noexcept { return context_handle_; }
#if ! CAN_GET_APRIORI_KERNEL_HANDLE
    kernel::handle_t handle() const
    {
#ifndef NDEBUG
        if (handle_ == nullptr) {
            throw runtime_error(status::named_t::invalid_resource_handle,
                "CUDA driver handle unavailable for kernel");
        }
#endif
        return handle_;
    }
#else
    kernel::handle_t handle() const noexcept { return handle_; }
#endif

public: // non-mutators

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const
    {
        context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
        return kernel::detail_::get_attribute_in_current_context(handle(), attribute);
    }

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    cuda::device::compute_capability_t ptx_version() const
    {
        auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
        return device::compute_capability_t::from_combined_number(raw_attribute);
    }

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    cuda::device::compute_capability_t binary_compilation_target_architecture() const
    {
        auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
        return device::compute_capability_t::from_combined_number(raw_attribute);
    }

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    grid::block_dimension_t maximum_threads_per_block() const
    {
        return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
    }
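
    // A minimal usage sketch for the attribute getters above (illustrative only;
    // how the kernel_t instance is obtained - e.g. via kernel::wrap() further
    // down this file - is assumed, not shown):
    //
    //   auto max_threads = kernel.maximum_threads_per_block();
    //   auto num_regs    = kernel.get_attribute(CU_FUNC_ATTRIBUTE_NUM_REGS);
    //   auto ptx_target  = kernel.ptx_version(); // a compute_capability_t, e.g. 7.5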

#if CUDA_VERSION >= 10000

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const;

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const;
#endif // CUDA_VERSION >= 10000

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    grid::dimension_t max_active_blocks_per_multiprocessor(
        grid::block_dimension_t block_size_in_threads,
        memory::shared::size_t dynamic_shared_memory_per_block,
        bool disable_caching_override = false) const;

public: // methods mutating the kernel-in-context, but not this reference object

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const;

    void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
    {
        auto amount_required_by_kernel_ = (kernel::attribute_value_t) amount_required_by_kernel;
        if (amount_required_by_kernel != (cuda::memory::shared::size_t) amount_required_by_kernel_) {
            throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the "
                "representation range for kernel attribute values");
        }
        // TODO: Consider a check in debug mode for the value being within range
        set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, amount_required_by_kernel_);
    }

    memory::shared::size_t get_maximum_dynamic_shared_memory_per_block() const
    {
        return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
    }
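
    // A sketch of opting in to a larger dynamic shared memory limit (illustrative;
    // the 64 KiB figure is an arbitrary example, and must not exceed what the
    // device actually supports):
    //
    //   kernel.set_maximum_dynamic_shared_memory_per_block(64 * 1024);
    //   assert(kernel.get_maximum_dynamic_shared_memory_per_block() >= 64 * 1024);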

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    void set_cache_preference(multiprocessor_cache_preference_t preference) const
    {
        context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
        auto result = cuFuncSetCacheConfig(handle(), (CUfunc_cache) preference);
        throw_if_error(result,
            "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
            "CUDA device function");
    }

    VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
    void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
    {
        context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle_);
        auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config));
        throw_if_error(result, "Failed setting the shared memory bank size");
    }
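
    // A sketch of the two configuration calls above (illustrative; the enumerator
    // names are assumptions - see the definitions of multiprocessor_cache_preference_t
    // and multiprocessor_shared_memory_bank_size_option_t in types.hpp):
    //
    //   kernel.set_cache_preference(
    //       cuda::multiprocessor_cache_preference_t::prefer_shared_memory_over_l1);
    //   kernel.set_shared_memory_bank_size(
    //       cuda::multiprocessor_shared_memory_bank_size_option_t::four_bytes_per_bank);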

protected: // ctors & dtor
    kernel_t(device::id_t device_id, context::handle_t context_handle, kernel::handle_t handle)
        : device_id_(device_id), context_handle_(context_handle), handle_(handle) { }

public: // ctors & dtor
    friend kernel_t kernel::wrap(device::id_t, context::handle_t, kernel::handle_t);

    kernel_t(const kernel_t& other) = default; // Note: be careful with subclasses
    kernel_t(kernel_t&& other) = default; // Note: be careful with subclasses

#if ! CAN_GET_APRIORI_KERNEL_HANDLE
    virtual ~kernel_t() = default;
#endif

protected: // data members
    device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can?
    context::handle_t context_handle_;
    mutable kernel::handle_t handle_;
}; // kernel_t

namespace kernel {

inline kernel_t wrap(
    device::id_t device_id,
    context::handle_t context_id,
    kernel::handle_t f)
{
    return kernel_t{ device_id, context_id, f };
}
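
// A sketch of obtaining a raw kernel handle via the CUDA Driver API and wrapping
// it (illustrative; "my_kernels.cubin" and "my_kernel" are stand-in names, and
// error checking of the driver calls is omitted for brevity):
//
//   CUmodule module;
//   cuModuleLoad(&module, "my_kernels.cubin");
//   CUfunction function;
//   cuModuleGetFunction(&function, module, "my_kernel");
//   CUcontext current_context;
//   cuCtxGetCurrent(&current_context);
//   CUdevice current_device;
//   cuCtxGetDevice(&current_device);
//   auto kernel = cuda::kernel::wrap(current_device, current_context, function);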

namespace occupancy {

namespace detail_ {

inline grid::dimension_t max_active_blocks_per_multiprocessor(
    handle_t handle,
    grid::block_dimension_t block_size_in_threads,
    memory::shared::size_t dynamic_shared_memory_per_block,
    bool disable_caching_override)
{
    int result;
    cuda::status_t status = CUDA_SUCCESS;
        // We don't need the initialization, but NVCC backed by GCC 8 warns us about it.
    auto flags = (unsigned) (disable_caching_override ?
        CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT);
        // Note: the parentheses around the conditional matter; without them,
        // the cast would apply to disable_caching_override alone.
    status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        &result, handle, (int) block_size_in_threads, dynamic_shared_memory_per_block, flags);
    throw_if_error(status,
        "Determining the maximum occupancy in blocks per multiprocessor, given the block size "
        "and the amount of dynamic memory per block");
    return result;
}

#if CUDA_VERSION >= 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size
// is ignored; if block_size_limit is 0, no limit is applied to the block size.
inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    CUfunction kernel_handle,
    cuda::device::id_t device_id,
    CUoccupancyB2DSize determine_shared_mem_by_block_size,
    cuda::memory::shared::size_t fixed_shared_mem_size,
    cuda::grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    int min_grid_size_in_blocks { 0 };
    int block_size { 0 };
        // Note: only initializing these values here because of a
        // spurious (?) compiler warning about potential uninitialized use.

    auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
        &min_grid_size_in_blocks, &block_size,
        kernel_handle,
        determine_shared_mem_by_block_size,
        fixed_shared_mem_size,
        static_cast<int>(block_size_limit),
        disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
    );

    throw_if_error(result,
        "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
        + " with maximum occupancy given dynamic shared memory and block size data");
    return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
}
#endif // CUDA_VERSION >= 10000

} // namespace detail_

#if CUDA_VERSION < 11000
inline memory::shared::size_t max_dynamic_shared_memory_per_block(
    const kernel_t &,
    grid::dimension_t,
    grid::block_dimension_t)
{
    throw cuda::runtime_error(status::not_supported,
        "cuOccupancyAvailableDynamicSMemPerBlock() requires CUDA 11.0 or later");
}
#else
inline memory::shared::size_t max_dynamic_shared_memory_per_block(
    const kernel_t &kernel,
    grid::dimension_t blocks_on_multiprocessor,
    grid::block_dimension_t block_size_in_threads)
{
    size_t result;
    auto status = cuOccupancyAvailableDynamicSMemPerBlock(
        &result, kernel.handle(), (int) blocks_on_multiprocessor, (int) block_size_in_threads);
    throw_if_error(status,
        "Determining the available dynamic memory per block, given the number of blocks "
        "on a multiprocessor and their size");
    return (memory::shared::size_t) result;
}
#endif // CUDA_VERSION < 11000
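
// A usage sketch for the function above (illustrative; assumes a CUDA 11.0+
// build and a previously-obtained kernel_t; the residency and block-size
// figures are arbitrary examples):
//
//   // How much dynamic shared memory may each block use, if we want two
//   // 256-thread blocks resident per multiprocessor?
//   auto available = cuda::kernel::occupancy::max_dynamic_shared_memory_per_block(
//       kernel, 2, 256);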

inline grid::dimension_t max_active_blocks_per_multiprocessor(
    const kernel_t &kernel,
    grid::block_dimension_t block_size_in_threads,
    memory::shared::size_t dynamic_shared_memory_per_block,
    bool disable_caching_override = false);

} // namespace occupancy

namespace detail_ {

inline ::std::string identify(const kernel_t& kernel)
{
    return kernel::detail_::identify(kernel.handle()) + " in " + context::detail_::identify(kernel.context());
}

} // namespace detail_
} // namespace kernel

#if CUDA_VERSION >= 10000
inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
    memory::shared::size_t dynamic_shared_memory_size,
    grid::block_dimension_t block_size_limit,
    bool disable_caching_override) const
{
    kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner { nullptr };
    return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
        handle(), device_id(), no_shared_memory_size_determiner,
        dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}

inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
    kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
    cuda::grid::block_dimension_t block_size_limit,
    bool disable_caching_override) const
{
    size_t no_fixed_dynamic_shared_memory_size { 0 };
    return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
        handle(), device_id(), shared_memory_size_determiner,
        no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}
#endif // CUDA_VERSION >= 10000
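
// A sketch of using the occupancy-maximizing launch parameters (illustrative;
// assumes the kernel needs no dynamic shared memory, i.e. all defaults apply):
//
//   auto params = kernel.min_grid_params_for_max_occupancy();
//     // The result aggregates the minimum grid dimension achieving maximum
//     // occupancy, followed by the corresponding block dimension - in that
//     // order, per the aggregate-initialization in the detail_ function above.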

inline grid::dimension_t kernel_t::max_active_blocks_per_multiprocessor(
    grid::block_dimension_t block_size_in_threads,
    memory::shared::size_t dynamic_shared_memory_per_block,
    bool disable_caching_override) const
{
    return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
        handle(), block_size_in_threads,
        dynamic_shared_memory_per_block, disable_caching_override);
}
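
// A residency-estimation sketch for a fixed launch configuration (illustrative;
// the 256-thread block size and 4 KiB of dynamic shared memory are arbitrary):
//
//   auto blocks_per_sm = kernel.max_active_blocks_per_multiprocessor(256, 4096);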

} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNEL_HPP_