cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
apriori_compiled.hpp
#pragma once
#ifndef CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_
#define CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_

#include "../kernel.hpp"

// The following is needed for occupancy-related calculation convenience
// and kernel-attribute-related API functions
#include <cuda_runtime.h>

#include <type_traits>

namespace cuda {

namespace kernel {

class apriori_compiled_t;

namespace apriori_compiled {

namespace detail_ {

#if CUDA_VERSION < 11000
inline handle_t get_handle(const void *, const char* = nullptr)
{
    throw cuda::runtime_error(status::not_supported,
        "Only CUDA versions 11.0 and later support obtaining CUDA driver handles "
        "for kernels compiled alongside the program source");
}
#else
inline handle_t get_handle(const void *kernel_function_ptr, const char* name = nullptr)
{
    handle_t handle;
    auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr);
    throw_if_error_lazy(status, "Failed obtaining a CUDA function handle for "
        + ((name == nullptr) ? ::std::string("a kernel function") : ::std::string("kernel function ") + name)
        + " at " + cuda::detail_::ptr_as_hex(kernel_function_ptr));
    return handle;
}
#endif
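
// Illustrative sketch (not part of the original header): how get_handle() might
// be used directly. `my_kernel` is a hypothetical __global__ function; on CUDA
// 11.0 and later the call yields the driver-API handle which the runtime
// associates with the compiled kernel:
//
//     __global__ void my_kernel(int* data);
//     ...
//     auto handle = get_handle((const void*) my_kernel, "my_kernel");
//     // `handle` may now be passed to driver-API calls taking a CUfunction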

apriori_compiled_t wrap(
    device::id_t device_id,
    context::handle_t primary_context_handle,
    kernel::handle_t handle,
    const void* ptr,
    bool hold_primary_context_refcount_unit = false);

} // namespace detail_


#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

struct attributes_t : cudaFuncAttributes {

    cuda::device::compute_capability_t ptx_version() const noexcept {
        return device::compute_capability_t::from_combined_number(ptxVersion);
    }

    cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept {
        return device::compute_capability_t::from_combined_number(binaryVersion);
    }
};

#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
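
// Illustrative sketch (an assumption, not from the original header): since
// attributes_t publicly derives from cudaFuncAttributes, a pointer to it
// converts to cudaFuncAttributes* and can be filled directly by the runtime's
// attribute query, after which the convenience getters interpret the raw
// ptxVersion/binaryVersion numbers. `my_kernel` is a hypothetical kernel:
//
//     attributes_t attrs;
//     auto status = cudaFuncGetAttributes(&attrs, (const void*) my_kernel);
//     if (status == cudaSuccess) {
//         auto ptx_target  = attrs.ptx_version();
//         auto sass_target = attrs.binary_compilation_target_architecture();
//     }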

namespace occupancy {

namespace detail_ {

#if CUDA_VERSION < 11000

template<typename UnaryFunction, class T>
static __inline__ cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
    int *minGridSize,
    int *blockSize,
    T func,
    UnaryFunction blockSizeToDynamicSMemSize,
    int blockSizeLimit = 0,
    unsigned int flags = 0)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;

    // Limits
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int funcMaxThreadsPerBlock;
    int occupancyLimit;
    int granularity;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !func) {
        return cudaErrorInvalidValue;
    }

    ///////////////////////////
    // Obtain device and function properties
    ///////////////////////////

    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &maxThreadsPerMultiProcessor,
        cudaDevAttrMaxThreadsPerMultiProcessor,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &warpSize,
        cudaDevAttrWarpSize,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &devMaxThreadsPerBlock,
        cudaDevAttrMaxThreadsPerBlock,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &multiProcessorCount,
        cudaDevAttrMultiProcessorCount,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess) {
        return status;
    }

    funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;

    ///////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    ///////////////////////////

    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    if (blockSizeLimit == 0) {
        blockSizeLimit = devMaxThreadsPerBlock;
    }

    if (devMaxThreadsPerBlock < blockSizeLimit) {
        blockSizeLimit = devMaxThreadsPerBlock;
    }

    if (funcMaxThreadsPerBlock < blockSizeLimit) {
        blockSizeLimit = funcMaxThreadsPerBlock;
    }

    blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;

    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        // This is needed for the first iteration, because
        // blockSizeLimitAligned could be greater than blockSizeLimit
        //
        if (blockSizeLimit < blockSizeToTryAligned) {
            blockSizeToTry = blockSizeLimit;
        } else {
            blockSizeToTry = blockSizeToTryAligned;
        }

        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
            &occupancyInBlocks,
            func,
            blockSizeToTry,
            dynamicSMemSize,
            flags);

        if (status != cudaSuccess) {
            return status;
        }

        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;

    return status;
}
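
// Sketch of how the search above is typically parameterized (illustrative only;
// `my_kernel` and the lambda are hypothetical). The UnaryFunction argument maps
// a candidate block size to the dynamic shared memory the kernel needs at that
// size - here, one int per thread:
//
//     auto smem_per_block = [](int block_size) -> size_t {
//         return block_size * sizeof(int);
//     };
//     int min_grid_size, block_size;
//     auto status = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
//         &min_grid_size, &block_size, (const void*) my_kernel, smem_per_block);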

#if CUDA_VERSION > 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
// if block_size_limit is 0, it is ignored.
template <typename UnaryFunction>
inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const void* kernel_function_ptr,
    cuda::device::id_t device_id,
    UnaryFunction determine_shared_mem_by_block_size,
    cuda::grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    int min_grid_size_in_blocks { 0 };
    int block_size { 0 };
    // Note: only initializing the values here because of a
    // spurious (?) compiler warning about potential uninitialized use.

    unsigned flags = disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
    auto result = (cuda::status_t) cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_<UnaryFunction, const void*>(
        &min_grid_size_in_blocks,
        &block_size,
        kernel_function_ptr,
        determine_shared_mem_by_block_size,
        (int) block_size_limit,
        flags);

    throw_if_error(result,
        "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_function_ptr, device_id)
        + " with maximum occupancy given dynamic shared memory and block size data");
    return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
}
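
// Sketch of a direct call to the helper above (illustrative; the real caller is
// an apriori_compiled_t member function further below, and `my_kernel` is
// hypothetical). A block_size_limit of 0 means "no limit", and passing `true`
// as the last argument requests cudaOccupancyDisableCachingOverride:
//
//     auto params = min_grid_params_for_max_occupancy(
//         (const void*) my_kernel, device_id,
//         [](int) -> size_t { return 0; },  // no dynamic shared memory at any block size
//         0,                                // no block size limit
//         true);                            // disable the caching override
//     // `params` carries both the suggested minimum grid size and the block size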
#endif // CUDA_VERSION > 10000

inline grid::dimension_t max_active_blocks_per_multiprocessor(
    const void* kernel_function_ptr,
    grid::block_dimension_t block_size_in_threads,
    memory::shared::size_t dynamic_shared_memory_per_block,
    bool disable_caching_override)
{
    // Assuming we don't need to set the current device here
    int result;
    cuda::status_t status = CUDA_SUCCESS;
    auto flags = (unsigned) (disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault);
    status = (cuda::status_t) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        &result, kernel_function_ptr, (int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags);
    throw_if_error(status,
        "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
    return result;
}
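
// Worked example of the quantity computed above (illustrative numbers): on a
// device allowing 2048 resident threads per multiprocessor, if a 256-thread
// block's register and shared-memory usage permit 6 resident blocks, occupancy
// is 6 * 256 = 1536 threads, i.e. 75% of the hardware limit:
//
//     auto num_blocks = max_active_blocks_per_multiprocessor(
//         (const void*) my_kernel,
//         256,     // block size, in threads
//         1024,    // dynamic shared memory per block, in bytes
//         false);  // keep the default caching behavior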

#endif // CUDA_VERSION < 11000

} // namespace detail_

} // namespace occupancy

} // namespace apriori_compiled

class apriori_compiled_t final : public kernel_t {
public: // getters
    const void *ptr() const noexcept { return ptr_; }
    const void *get() const noexcept { return ptr_; }

public: // type conversions
    explicit operator const void *() noexcept { return ptr_; }

public: // non-mutators

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

    apriori_compiled::attributes_t attributes() const;

    void set_cache_preference(multiprocessor_cache_preference_t preference) const override;

    void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const override;

    cuda::device::compute_capability_t ptx_version() const override
    {
        return attributes().ptx_version();
    }

    cuda::device::compute_capability_t binary_compilation_target_architecture() const override
    {
        return attributes().binary_compilation_target_architecture();
    }

    grid::block_dimension_t maximum_threads_per_block() const override
    {
        return attributes().maxThreadsPerBlock;
    }

    void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const override;

#if CUDA_VERSION > 10000
    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const override
    {
        auto shared_memory_size_determiner =
            [dynamic_shared_memory_size](int) -> size_t { return dynamic_shared_memory_size; };
        return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
            ptr(), device_id(),
            shared_memory_size_determiner,
            block_size_limit, disable_caching_override);
    }

    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const override
    {
        return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
            ptr(), device_id(),
            shared_memory_size_determiner,
            block_size_limit, disable_caching_override);
    }
#endif // CUDA_VERSION > 10000

    kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const override;

    grid::dimension_t max_active_blocks_per_multiprocessor(
        grid::block_dimension_t block_size_in_threads,
        memory::shared::size_t dynamic_shared_memory_per_block,
        bool disable_caching_override = false) const override
    {
        return apriori_compiled::occupancy::detail_::max_active_blocks_per_multiprocessor(
            ptr(),
            block_size_in_threads,
            dynamic_shared_memory_per_block,
            disable_caching_override);
    }
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

protected: // ctors & dtor
    apriori_compiled_t(device::id_t device_id, context::handle_t primary_context_handle,
        kernel::handle_t handle, const void *f, bool hold_pc_refcount_unit)
    : kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f)
    {
        // TODO: Consider checking whether this actually is a device function, at all and in this context
#ifndef NDEBUG
        assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer");
#endif
    }
    apriori_compiled_t(
        device::id_t device_id,
        context::handle_t primary_context_handle,
        const void *f,
        bool hold_primary_context_refcount_unit)
    : apriori_compiled_t(
        device_id,
        primary_context_handle,
        apriori_compiled::detail_::get_handle(f),
        f,
        hold_primary_context_refcount_unit)
    { }

public: // ctors & dtor
    apriori_compiled_t(const apriori_compiled_t&) = default;
    apriori_compiled_t(apriori_compiled_t&&) = default;

public: // friends
    friend apriori_compiled_t apriori_compiled::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool);

protected: // data members
    const void *const ptr_;
}; // class apriori_compiled_t

namespace apriori_compiled {

namespace detail_ {

inline apriori_compiled_t wrap(
    device::id_t device_id,
    context::handle_t primary_context_handle,
    kernel::handle_t f,
    const void * ptr,
    bool hold_primary_context_refcount_unit)
{
    return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
}

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
inline ::std::string identify(const apriori_compiled_t& kernel)
{
    return "apriori-compiled kernel " + cuda::detail_::ptr_as_hex(kernel.ptr())
        + " in " + context::detail_::identify(kernel.context());
}
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

} // namespace detail_

#if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
inline attribute_value_t get_attribute(const void* function_ptr, attribute_t attribute)
{
    auto handle = detail_::get_handle(function_ptr);
    return kernel::detail_::get_attribute_in_current_context(handle, attribute);
}

inline void set_attribute(const void* function_ptr, attribute_t attribute, attribute_value_t value)
{
    auto handle = detail_::get_handle(function_ptr);
    return kernel::detail_::set_attribute_in_current_context(handle, attribute, value);
}

inline attribute_value_t get_attribute(
    const context_t& context,
    const void* function_ptr,
    attribute_t attribute)
{
    CAW_SET_SCOPE_CONTEXT(context.handle());
    return get_attribute(function_ptr, attribute);
}

inline void set_attribute(
    const context_t& context,
    const void* function_ptr,
    attribute_t attribute,
    attribute_value_t value)
{
    CAW_SET_SCOPE_CONTEXT(context.handle());
    return set_attribute(function_ptr, attribute, value);
}
#endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE

} // namespace apriori_compiled


template<typename KernelFunctionPtr>
apriori_compiled_t get(const device_t& device, KernelFunctionPtr function_ptr);

template<typename KernelFunctionPtr>
apriori_compiled_t get(context_t context, KernelFunctionPtr function_ptr);

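// Illustrative usage of the two factory functions above (a sketch; `my_kernel`
// is a hypothetical __global__ function). Both overloads return a proxy through
// which the kernel's attributes and occupancy characteristics can be queried;
// availability of some members depends on the CAW_CAN_GET_APRIORI_KERNEL_HANDLE
// configuration:
//
//     auto device = cuda::device::get(0);
//     auto kernel = cuda::kernel::get(device, my_kernel);
//     auto max_block_size = kernel.maximum_threads_per_block();
//     auto grid_params = kernel.min_grid_params_for_max_occupancy();
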
} // namespace kernel

} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_