cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel_launch.hpp
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_LAUNCH_HPP_
#define MULTI_WRAPPER_IMPLS_LAUNCH_HPP_

#include "kernel.hpp"
#include "../types.hpp"
#include "../memory.hpp"
#include "../stream.hpp"
#include "../kernel_launch.hpp"
#include "../pointer.hpp"
#include "../device.hpp"

// The following is needed for occupancy-related calculation convenience functions
#include <cuda_runtime.h>

namespace cuda {

template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
    Kernel&&                kernel,
    const stream_t&         stream,
    launch_configuration_t  launch_configuration,
    KernelParameters&&...   parameters)
{
    static_assert(
        detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
        "All kernel parameter types must be of a trivially copy-constructible (decayed) type.");
    static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
#if CUDA_VERSION >= 12000
    static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
#else
    static constexpr const bool library_kernel = false;
#endif // CUDA_VERSION >= 12000
#ifndef NDEBUG
    // The compatibility of wrapped kernels and library kernels with the launch configuration
    // will be validated further inside, when we differentiate them from raw kernels
    detail_::validate(launch_configuration);
#endif

    // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have
    // to use tagged dispatch for the separate behavior for raw and wrapped kernels - although
    // the enqueue_launch function for each of them will basically be just a one-liner :-(
    detail_::enqueue_launch<Kernel, KernelParameters...>(
        detail_::bool_constant<wrapped_contextual_kernel>{},
        detail_::bool_constant<library_kernel>{},
        ::std::forward<Kernel>(kernel), stream, launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

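// A usage sketch (illustrative, not part of this header): launching an apriori-compiled
// __global__ function through this wrapper. The kernel name, sizes and buffers are
// hypothetical; the builder calls follow the library's launch_config_builder interface.
//
//     __global__ void vectorAdd(const float* a, const float* b, float* c, int n);
//
//     void example(const cuda::stream_t& stream, const float* a, const float* b, float* c, int n)
//     {
//         auto config = cuda::launch_config_builder()
//             .overall_size(n)    // total number of threads needed
//             .block_size(256)    // threads per block; the grid size is derived
//             .build();
//         // A raw kernel function: tag-dispatched below to enqueue_raw_kernel_launch_in_current_context
//         cuda::enqueue_launch(vectorAdd, stream, config, a, b, c, n);
//     }
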
namespace detail_ {

inline void validate_shared_mem_compatibility(
    const device_t &device,
    memory::shared::size_t shared_mem_size) noexcept(false)
{
    if (shared_mem_size == 0) { return; }
    memory::shared::size_t max_shared = device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);

    // Note: A single kernel may not be able to access this full shared memory capacity
    // without opting in to it, using kernel_t::set_maximum_dynamic_shared_memory_per_block.
    // See @ref kernel_t

    if (shared_mem_size > max_shared) {
        throw ::std::invalid_argument(
            "A dynamic shared memory size of " + ::std::to_string(shared_mem_size)
            + " bytes exceeds the device maximum of " + ::std::to_string(max_shared));
    }
}

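// For example (a sketch; `kernel` and `desired_size` are hypothetical): before launching
// with a dynamic shared memory size above the default per-block limit, one would opt in via
//
//     kernel.set_maximum_dynamic_shared_memory_per_block(desired_size);
//
// and only then enqueue a launch whose configuration requests that size.
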
inline void validate_compatibility(
    const device::id_t device_id,
    memory::shared::size_t shared_mem_size,
    bool cooperative_launch,
    optional<grid::dimensions_t> block_cluster_dimensions) noexcept(false)
{
    auto device = device::get(device_id);
    if (cooperative_launch and not device.supports_block_cooperation()) {
        throw ::std::runtime_error(device::detail_::identify(device_id)
            + " cannot launch kernels with inter-block cooperation");
    }
    validate_shared_mem_compatibility(device, shared_mem_size);
    if (block_cluster_dimensions) {
#if CUDA_VERSION >= 12000
        if (not device.supports_block_clustering()) {
            throw ::std::runtime_error(device::detail_::identify(device_id)
                + " cannot launch kernels with block clusters");
            // TODO: Uncomment this once the CUDA driver offers info on the maximum
            // cluster size...
            //
            // auto max_cluster_size = ???;
            // auto cluster_size = block_cluster_dimensions.value().volume();
            // if (cluster_size > max_cluster_size) {
            //     throw ::std::runtime_error(device::detail_::identify(device_id)
            //         + " only supports as many as " + ::std::to_string(max_cluster_size)
            //         + " blocks per block-cluster, but " + ::std::to_string(cluster_size)
            //         + " were requested");
            // }
        }
#else
        throw ::std::runtime_error("Block clusters are not supported with CUDA versions earlier than 12.0");
#endif // CUDA_VERSION >= 12000
    }

    // The CUDA driver does not offer us information with which we could check the validity
    // of attempting a programmatically-dependent launch, a programmatic completion event,
    // or the use of a "remote" memory synchronization domain; so, we assume all of that is valid.
}

template <typename Dims>
inline void validate_any_dimensions_compatibility(
    const device_t &device, Dims dims,
    Dims maxima,
    const char* kind) noexcept(false)
{
    auto device_id = device.id();
    auto check =
        [device_id, kind](grid::dimension_t dim, grid::dimension_t max, const char *axis) {
            if (max < dim) {
                throw ::std::invalid_argument(
                    ::std::string("specified ") + kind + " " + axis + "-axis dimension " + ::std::to_string(dim)
                    + " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
                    + " for " + device::detail_::identify(device_id));
            }
        };
    check(dims.x, maxima.x, "X");
    check(dims.y, maxima.y, "Y");
    check(dims.z, maxima.z, "Z");
}

inline void validate_block_dimension_compatibility(
    const device_t &device,
    grid::block_dimensions_t block_dims) noexcept(false)
{
    auto max_block_size = device.maximum_threads_per_block();
    auto volume = block_dims.volume();
    if (volume > max_block_size) {
        throw ::std::invalid_argument(
            "Specified block dimensions result in blocks of size " + ::std::to_string(volume)
            + ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
            + " for " + device::detail_::identify(device.id()));
    }
    auto maxima = grid::block_dimensions_t{
        static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)),
        static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
        static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
    };
    validate_any_dimensions_compatibility(device, block_dims, maxima, "block");
}

inline void validate_grid_dimension_compatibility(
    const device_t &device,
    grid::dimensions_t grid_dims) noexcept(false)
{
    auto maxima = grid::dimensions_t{
        static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)),
        static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
        static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
    };
    validate_any_dimensions_compatibility(device, grid_dims, maxima, "grid");
}

inline void validate_shared_mem_size_compatibility(
    const kernel_t& kernel,
    memory::shared::size_t shared_mem_size)
{
    if (shared_mem_size == 0) { return; }
    auto max_shared = kernel.get_maximum_dynamic_shared_memory_per_block();
    if (shared_mem_size > max_shared) {
        throw ::std::invalid_argument(
            "Requested dynamic shared memory size "
            + ::std::to_string(shared_mem_size) + " exceeds the kernel's maximum allowed value of "
            + ::std::to_string(max_shared));
    }
}

inline void validate_block_dimension_compatibility(
    const kernel_t& kernel,
    grid::block_dimensions_t block_dims)
{
    auto max_block_size = kernel.maximum_threads_per_block();
    auto volume = block_dims.volume();
    if (volume > max_block_size) {
        throw ::std::invalid_argument(
            "specified block dimensions result in blocks of size " + ::std::to_string(volume)
            + ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
            + " for " + kernel::detail_::identify(kernel));
    }
}

inline void validate_dynamic_shared_memory_size(
    const kernel_t& kernel,
    memory::shared::size_t dynamic_shared_memory_size)
{
    memory::shared::size_t max_dyn_shmem = kernel.get_attribute(
        kernel::attribute_t::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
    if (dynamic_shared_memory_size > max_dyn_shmem) {
        throw ::std::invalid_argument(
            "specified size of dynamic shared memory, " + ::std::to_string(dynamic_shared_memory_size)
            + " bytes, exceeds the maximum supported by " + kernel::detail_::identify(kernel)
            + ", " + ::std::to_string(max_dyn_shmem) + " bytes");
    }
}

template<typename... KernelParameters>
void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::operator()(
    const kernel::apriori_compiled_t& wrapped_kernel,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    KernelParameters&&... parameters) const
{
    using raw_kernel_t = typename kernel::detail_::raw_kernel_typegen<KernelParameters...>::type;
    auto unwrapped_kernel_function = reinterpret_cast<raw_kernel_t>(const_cast<void *>(wrapped_kernel.ptr()));
    // Notes:
    // 1. The inner cast here is because we store the pointer as const void* - as an extra
    //    precaution against anybody trying to write through it. Now, function pointers
    //    can't be written through, but are still, for some reason, not considered const.
    // 2. We rely on the caller providing us with more-or-less the correct parameters -
    //    corresponding to the compiled kernel function's. I say "more or less" because the
    //    `KernelParameters` pack may contain some references, arrays and so on - which CUDA
    //    kernels cannot accept; so we massage those a bit.

    // It is assumed the arguments have already been validated

    detail_::enqueue_raw_kernel_launch_in_current_context(
        unwrapped_kernel_function,
        stream.device_id(),
        stream.context_handle(),
        stream.handle(),
        launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

template<typename... KernelParameters>
::std::array<const void*, sizeof...(KernelParameters)>
marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
{
    return ::std::array<const void*, sizeof...(KernelParameters)> { &parameters... };
}

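// To illustrate (with hypothetical arguments): for lvalues `int n` and `float* p`,
// marshal_dynamic_kernel_arguments(n, p) returns ::std::array<const void*, 2>{ &n, &p },
// i.e. the addresses of the individual arguments - the form which the driver's
// kernel-launching API calls below expect for their array of parameter pointers.
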
// Note: The last (valid) element of marshalled_arguments must be null
inline void enqueue_kernel_launch_by_handle_in_current_context(
    kernel::handle_t kernel_function_handle,
    device::id_t device_id,
    context::handle_t context_handle,
    stream::handle_t stream_handle,
    launch_configuration_t launch_config,
    const void** marshalled_arguments)
{
    // It is assumed the arguments have already been validated

    status_t status;
    const auto& lc = launch_config; // alias for brevity
#if CUDA_VERSION >= 12000
    CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
    auto launch_attributes_span = span<CUlaunchAttribute>{
        launch_attributes, sizeof(launch_attributes)/sizeof(launch_attributes[0])
    };
    CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
    status = cuLaunchKernelEx(
        &full_launch_config,
        kernel_function_handle,
        const_cast<void**>(marshalled_arguments),
        nullptr);
#else
    if (launch_config.has_nondefault_attributes()) {
        status = cuLaunchCooperativeKernel(
            kernel_function_handle,
            lc.dimensions.grid.x,  lc.dimensions.grid.y,  lc.dimensions.grid.z,
            lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
            lc.dynamic_shared_memory_size,
            stream_handle,
            const_cast<void**>(marshalled_arguments)
        );
    }
    else {
        static constexpr const auto no_arguments_in_alternative_format = nullptr;
        // TODO: Consider passing marshalled_arguments in the alternative format
        status = cuLaunchKernel(
            kernel_function_handle,
            lc.dimensions.grid.x,  lc.dimensions.grid.y,  lc.dimensions.grid.z,
            lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
            lc.dynamic_shared_memory_size,
            stream_handle,
            const_cast<void**>(marshalled_arguments),
            no_arguments_in_alternative_format
        );
    }
#endif // CUDA_VERSION >= 12000
    throw_if_error_lazy(status,
        ::std::string("Kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
        + " on " + stream::detail_::identify(stream_handle, context_handle, device_id));
}

template<typename... KernelParameters>
struct enqueue_launch_helper<kernel_t, KernelParameters...> {

    void operator()(
        const kernel_t& wrapped_kernel,
        const stream_t& stream,
        launch_configuration_t launch_config,
        KernelParameters&&... arguments) const
    {
        // It is assumed the arguments have already been validated

#ifndef NDEBUG
        if (wrapped_kernel.context() != stream.context()) {
            throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
                + " on " + stream::detail_::identify(stream) + ": Different contexts"};
        }
        validate_compatibility(wrapped_kernel, launch_config);
#endif
        auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
        auto function_handle = wrapped_kernel.handle();
        CAW_SET_SCOPE_CONTEXT(stream.context_handle());

        enqueue_kernel_launch_by_handle_in_current_context(
            function_handle, stream.device_id(), stream.context_handle(),
            stream.handle(), launch_config, marshalled_arguments.data());
    } // operator()
};

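// A usage sketch for launching through a kernel_t wrapper (illustrative only; the exact
// module-loading calls are assumptions on our part - consult the module wrapper headers
// for the precise interface):
//
//     auto module = cuda::module::load_from_file(device, "my_kernels.cubin");
//     cuda::kernel_t kernel = module.get_kernel("my_kernel_name");
//     cuda::enqueue_launch(kernel, stream, config, arg1, arg2);
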
template<typename RawKernelFunction, typename... KernelParameters>
void enqueue_launch(
    bool_constant<false>, // Not a wrapped contextual kernel,
    bool_constant<false>, // and not a library kernel, so it must be a raw kernel function
    RawKernelFunction&& kernel_function,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    KernelParameters&&... parameters)
{
    // It is assumed the arguments have already been validated

    // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
    // and not have trouble enqueueing into a stream in another context - it balks at doing so under
    // certain conditions, so we must place ourselves in the stream's context.
    CAW_SET_SCOPE_CONTEXT(stream.context_handle());
    detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
        kernel_function, stream.device_id(), stream.context_handle(), stream.handle(), launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
    bool_constant<true>,  // a kernel wrapped in a kernel_t (sub)class
    bool_constant<false>, // not a library kernel
    Kernel&& kernel,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    KernelParameters&&... parameters)
{
    // It is assumed the arguments have already been validated - except for:
#ifndef NDEBUG
    if (kernel.context() != stream.context()) {
        throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
            + " on " + stream::detail_::identify(stream) + ": Different contexts"};
    }
    detail_::validate_compatibility(kernel, launch_configuration);
#endif // #ifndef NDEBUG

    enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
        ::std::forward<Kernel>(kernel), stream, launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

#if CUDA_VERSION >= 12000
template<typename Kernel, typename... KernelParameters>
void enqueue_launch(
    bool_constant<false>, // Not a wrapped contextual kernel,
    bool_constant<true>,  // but a library kernel
    Kernel&& kernel,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    KernelParameters&&... parameters)
{
    // The launch configuration is assumed to have been validated separately
    // from the kernel, and their compatibility will be validated further
    // inside, against the contextualized kernel

    kernel_t contextualized = cuda::contextualize(kernel, stream.context());
    enqueue_launch_helper<kernel_t, KernelParameters...>{}(
        contextualized, stream, launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}
#endif // CUDA_VERSION >= 12000

} // namespace detail_

template<typename Kernel, typename... KernelParameters>
void launch(
    Kernel&& kernel,
    launch_configuration_t launch_configuration,
    KernelParameters&&... parameters)
{
    // Argument validation will occur within the call to enqueue_launch

    auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
    auto stream = primary_context.default_stream();

    // Note: If Kernel is a kernel_t, and its associated device is different
    // from the current device, the next call will fail:

    enqueue_launch(kernel, stream, launch_configuration, ::std::forward<KernelParameters>(parameters)...);
}

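// A usage sketch for launch() (illustrative; it reuses the hypothetical vectorAdd from
// the example further above). This enqueues on the default stream of the relevant
// primary context, so no stream needs to be created explicitly:
//
//     auto config = cuda::launch_config_builder()
//         .overall_size(n)
//         .block_size(256)
//         .build();
//     cuda::launch(vectorAdd, config, a, b, c, n);
//     cuda::device::current::get().synchronize(); // launches are asynchronous
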
template <typename SpanOfConstVoidPtrLike>
inline void launch_type_erased(
    const kernel_t& kernel,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    SpanOfConstVoidPtrLike marshalled_arguments)
{
    // Note: We assume that kernel, stream and launch_configuration have already been validated.
    static_assert(
        ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
        ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
        "The element type of the marshalled arguments container type must be either void* or const void*");
#ifndef NDEBUG
    if (kernel.context() != stream.context()) {
        throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
            + " on " + stream::detail_::identify(stream) + ": Different contexts"};
    }
    detail_::validate_compatibility(kernel, launch_configuration);
    detail_::validate(launch_configuration);
    if (*(marshalled_arguments.end() - 1) != nullptr) {
        throw ::std::invalid_argument("marshalled arguments for a kernel launch must end with a nullptr element");
    }
#endif
    CAW_SET_SCOPE_CONTEXT(stream.context_handle());
    return detail_::enqueue_kernel_launch_by_handle_in_current_context(
        kernel.handle(),
        stream.device_id(),
        stream.context_handle(),
        stream.handle(),
        launch_configuration,
        static_cast<const void**>(marshalled_arguments.data()));
}

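// A usage sketch for the type-erased launch (illustrative; `kernel`, `stream` and
// `config` are hypothetical). Note the trailing nullptr element, which the
// NDEBUG-mode validation above checks for:
//
//     int n = 1000;
//     float* data = /* ... a device-side buffer ... */;
//     ::std::array<void*, 3> args = { &data, &n, nullptr };
//     cuda::launch_type_erased(kernel, stream, config, args);
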
#if CUDA_VERSION >= 12000
template <typename SpanOfConstVoidPtrLike>
void launch_type_erased(
    const library::kernel_t& kernel,
    const stream_t& stream,
    launch_configuration_t launch_configuration,
    SpanOfConstVoidPtrLike marshalled_arguments)
{
    // Argument validation will occur inside the call to launch_type_erased
    auto contextualized = contextualize(kernel, stream.context());
    launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
}
#endif // CUDA_VERSION >= 12000

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

#if defined(__CUDACC__)

// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters
// for maximum occupancy from code compiled with a host-side-only compiler! See cuda_runtime.h
// for details

#if CUDA_VERSION >= 10000
namespace detail_ {

template <typename UnaryFunction>
inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const void* ptr,
    device::id_t device_id,
    UnaryFunction block_size_to_dynamic_shared_mem_size,
    grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    int min_grid_size_in_blocks { 0 };
    int block_size { 0 };
    // Note: We only initialize these values here because of a
    // spurious (?) compiler warning about potential uninitialized use.
    auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
        &min_grid_size_in_blocks, &block_size,
        ptr,
        block_size_to_dynamic_shared_mem_size,
        static_cast<int>(block_size_limit),
        disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
    );
    throw_if_error_lazy(result,
        "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr)
        + " on device " + ::std::to_string(device_id) + ".");
    return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
}

inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const void* ptr,
    device::id_t device_id,
    memory::shared::size_t dynamic_shared_mem_size,
    grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    auto always_need_same_shared_mem_size =
        [dynamic_shared_mem_size](::size_t) { return dynamic_shared_mem_size; };
    return min_grid_params_for_max_occupancy(
        ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override);
}

} // namespace detail_

inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const kernel::apriori_compiled_t& kernel,
    memory::shared::size_t dynamic_shared_memory_size,
    grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    return detail_::min_grid_params_for_max_occupancy(
        kernel.ptr(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override);
}

template <typename UnaryFunction>
grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const kernel::apriori_compiled_t& kernel,
    UnaryFunction block_size_to_dynamic_shared_mem_size,
    grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    return detail_::min_grid_params_for_max_occupancy(
        kernel.ptr(), kernel.device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
}
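
// A usage sketch (illustrative; `kernel`, `stream` and the kernel arguments are
// hypothetical, and the launch_configuration_t construction from composite dimensions
// is an assumption - see launch_configuration.hpp for the precise interface):
//
//     auto params = cuda::min_grid_params_for_max_occupancy(
//         kernel,
//         dynamic_shared_mem_size,
//         /* block_size_limit */ 0,
//         /* disable_caching_override */ false);
//     cuda::launch_configuration_t config { params }; // grid-and-block dimensions for max occupancy
//     cuda::enqueue_launch(kernel, stream, config, arg1, arg2);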
#endif // CUDA_VERSION >= 10000

#endif // defined(__CUDACC__)
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_LAUNCH_HPP_