cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel_launch.hpp
Go to the documentation of this file.
1 
7 #pragma once
8 #ifndef MULTI_WRAPPER_IMPLS_LAUNCH_HPP_
9 #define MULTI_WRAPPER_IMPLS_LAUNCH_HPP_
10 
11 #include "kernel.hpp"
12 #include "../types.hpp"
13 #include "../memory.hpp"
14 #include "../stream.hpp"
15 #include "../kernel_launch.hpp"
16 #include "../pointer.hpp"
17 #include "../device.hpp"
18 
19 // The following is needed for occupancy-related calculation convenience functions
20 #include <cuda_runtime.h>
21 
22 namespace cuda {
23 
24 template<typename Kernel, typename... KernelParameters>
26  Kernel&& kernel,
27  const stream_t& stream,
28  launch_configuration_t launch_configuration,
29  KernelParameters&&... parameters)
30 {
31  static_assert(
32  detail_::all_true<is_valid_kernel_argument<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
33  "All kernel parameter types must fulfill the CUDA kernel argument requirements. "
34  "Refer to the documentation of 'cuda::traits::is_valid_kernel_argument' for more details."
35  );
36  static constexpr const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
37 #if CUDA_VERSION >= 12000
38  static constexpr const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
39 #else
40  static constexpr const bool library_kernel = false;
41 #endif // CUDA_VERSION >= 12000
42 #ifndef NDEBUG
43  // wrapped kernel and library kernel compatibility with the launch configuration
44  // will be validated further inside, when we differentiate them from raw kernels
45  detail_::validate(launch_configuration);
46 #endif
47 
48  // We would have liked an "if constexpr" here, but that is unsupported by C++11, so we have to
49  // use tagged dispatch for the separate behavior for raw and wrapped kernels - although the enqueue_launch
50  // function for each of them will basically be just a one-liner :-(
51  detail_::enqueue_launch<Kernel, KernelParameters...>(
52  detail_::bool_constant<wrapped_contextual_kernel>{},
53  detail_::bool_constant<library_kernel>{},
54  ::std::forward<Kernel>(kernel), stream, launch_configuration,
55  ::std::forward<KernelParameters>(parameters)...);
56 }
57 
58 namespace detail_ {
59 
60 inline void validate_shared_mem_compatibility(
61  const device_t &device,
62  memory::shared::size_t shared_mem_size) noexcept(false)
63 {
64  if (shared_mem_size == 0) { return; }
65  memory::shared::size_t max_shared = device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN);
66 
67  // Note: A single kernel may not be able to access this shared memory capacity without opting-in to
68  // it using kernel_t::set_maximum_dynamic_shared_memory_per_block. See @ref kernel_t
69 
70  if (shared_mem_size > max_shared) {
71  throw ::std::invalid_argument(
72  "A dynamic shared memory size of " + ::std::to_string(shared_mem_size)
73  + " bytes exceeds the device maximum of " + ::std::to_string(max_shared));
74  }
75 }
76 
77 inline void validate_compatibility(
78  device::id_t device_id,
79  memory::shared::size_t shared_mem_size,
80  bool cooperative_launch,
81  optional<grid::dimensions_t> block_cluster_dimensions) noexcept(false)
82 {
83  auto device = device::get(device_id);
84  if (not cooperative_launch or device.supports_block_cooperation()) {
85  throw ::std::runtime_error(device::detail_::identify(device_id)
86  + " cannot launch kernels with inter-block cooperation");
87  }
88  validate_shared_mem_compatibility(device, shared_mem_size);
89  if (block_cluster_dimensions) {
90 #if CUDA_VERSION >= 12000
91  if (not device.supports_block_clustering()) {
92  throw ::std::runtime_error(device::detail_::identify(device_id)
93  + " cannot launch kernels with inter-block cooperation");
94  // TODO: Uncomment this once the CUDA driver offers info on the maximum
95  // cluster size...
96  //
97  // auto max_cluster_size = ???;
98  // auto cluster_size = block_cluster_dimensions.value().volume();
99  // if (cluster_size > max_cluster_size) {
100  // throw ::std::runtime_error(device::detail_::identify(device_id)
101  // + " only supports as many as " + ::std::to_string(max_cluster_size)
102  // + "blocks per block-cluster, but " + ::std::to_string(cluster_size));
103  }
104 #else
105  throw ::std::runtime_error("Block clusters are not supported with CUDA versions earlier than 12.0");
106 #endif // CUDA_VERSION >= 12000
107  }
108 
109  // The CUDA driver does not offer us information with which we could check the validity
110  // of trying a programmatically dependent launch, or a programmatic completion event,
111  // or the use of a "remote" memory synchronization domain. So, assuming that's all valid
112 }
113 
114 template <typename Dims>
115 inline void validate_any_dimensions_compatibility(
116  const device_t &device, Dims dims,
117  Dims maxima,
118  const char* kind) noexcept(false)
119 {
120  auto device_id = device.id();
121  auto check =
122  [device_id, kind](grid::dimension_t dim, grid::dimension_t max, const char *axis) {
123  if (max < dim) {
124  throw ::std::invalid_argument(
125  ::std::string("specified ") + kind + " " + axis + "-axis dimension " + ::std::to_string(dim)
126  + " exceeds the maximum supported " + axis + " dimension of " + ::std::to_string(max)
127  + " for " + device::detail_::identify(device_id));
128  }
129  };
130  check(dims.x, maxima.x, "X");
131  check(dims.y, maxima.y, "Y");
132  check(dims.z, maxima.z, "Z");
133 }
134 
135 inline void validate_block_dimension_compatibility(
136  const device_t &device,
137  grid::block_dimensions_t block_dims) noexcept(false)
138 {
139  auto max_block_size = device.maximum_threads_per_block();
140  auto volume = block_dims.volume();
141  if (volume > max_block_size) {
142  throw ::std::invalid_argument(
143  "Specified block dimensions result in blocks of size " + ::std::to_string(volume)
144  + ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
145  + " for " + device::detail_::identify(device.id()));
146  }
147  auto maxima = grid::block_dimensions_t{
148  static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X)),
149  static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
150  static_cast<grid::block_dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
151  };
152  validate_any_dimensions_compatibility(device, block_dims, maxima, "block");
153 }
154 
155 inline void validate_grid_dimension_compatibility(
156  const device_t &device,
157  grid::block_dimensions_t block_dims) noexcept(false)
158 {
159  auto maxima = grid::dimensions_t{
160  static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)),
161  static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
162  static_cast<grid::dimension_t>(device.get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
163  };
164  validate_any_dimensions_compatibility(device, block_dims, maxima, "grid");
165 }
166 
167 
168 inline void validate_shared_mem_size_compatibility(
169  const kernel_t& kernel_ptr,
170  memory::shared::size_t shared_mem_size)
171 {
172  if (shared_mem_size == 0) { return; }
173  auto max_shared = kernel_ptr.get_maximum_dynamic_shared_memory_per_block();
174  if (shared_mem_size > max_shared) {
175  throw ::std::invalid_argument(
176  "Requested dynamic shared memory size "
177  + ::std::to_string(shared_mem_size) + " exceeds kernel's maximum allowed value of "
178  + ::std::to_string(max_shared));
179  }
180 }
181 
182 inline void validate_block_dimension_compatibility(
183  const kernel_t& kernel,
184  grid::block_dimensions_t block_dims)
185 {
186  auto max_block_size = kernel.maximum_threads_per_block();
187  auto volume = block_dims.volume();
188  if (volume > max_block_size) {
189  throw ::std::invalid_argument(
190  "specified block dimensions result in blocks of size " + ::std::to_string(volume)
191  + ", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
192  + " for " + kernel::detail_::identify(kernel));
193  }
194 }
195 
196 inline void validate_dyanmic_shared_memory_size(
197  const kernel_t& kernel,
198  memory::shared::size_t dynamic_shared_memory_size)
199 {
200  memory::shared::size_t max_dyn_shmem = kernel.get_attribute(
201  kernel::attribute_t::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
202  if (dynamic_shared_memory_size > max_dyn_shmem) {
203  throw ::std::invalid_argument(
204  "specified size of dynamic shared memory, " + ::std::to_string(dynamic_shared_memory_size)
205  + "bytes, exceeds the maximum supported by " + kernel::detail_::identify(kernel)
206  + ", " + ::std::to_string(max_dyn_shmem) + " bytes");
207  }
208 }
209 
210 
211 template<typename... KernelParameters>
212 void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::operator()(
213  const kernel::apriori_compiled_t& wrapped_kernel,
214  const stream_t & stream,
215  launch_configuration_t launch_configuration,
216  KernelParameters &&... parameters) const
217 {
218  using raw_kernel_t = typename kernel::detail_::raw_kernel_typegen<KernelParameters ...>::type;
219  auto unwrapped_kernel_function = reinterpret_cast<raw_kernel_t>(const_cast<void *>(wrapped_kernel.ptr()));
220  // Notes:
221  // 1. The inner cast here is because we store the pointer as const void* - as an extra
222  // precaution against anybody trying to write through it. Now, function pointers
223  // can't get written through, but are still for some reason not considered const.
224  // 2. We rely on the caller providing us with more-or-less the correct parameters -
225  // corresponding to the compiled kernel function's. I say "more or less" because the
226  // `KernelParameter` pack may contain some references, arrays and so on - which CUDA
227  // kernels cannot accept; so we massage those a bit.
228 
229  // It is assumed arguments were already been validated
230 
231  detail_::enqueue_raw_kernel_launch_in_current_context(
232  unwrapped_kernel_function,
233  stream.device_id(),
234  stream.context_handle(),
235  stream.handle(),
236  launch_configuration,
237  ::std::forward<KernelParameters>(parameters)...);
238 }
239 
// Collects the addresses of the given kernel arguments into a fixed-size
// array, in the pointer-per-argument form the CUDA driver's launch API
// expects. The pointers refer to the caller's argument objects, so the
// returned array must not outlive them.
template<typename... KernelParameters>
::std::array<const void*, sizeof...(KernelParameters)>
marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
{
	using argument_pointers_t = ::std::array<const void*, sizeof...(KernelParameters)>;
	return argument_pointers_t{{ &parameters... }};
}
246 
247 // Note: The last (valid) element of marshalled_arguments must be null
248 inline void enqueue_kernel_launch_by_handle_in_current_context(
249  kernel::handle_t kernel_function_handle,
250  device::id_t device_id,
251  context::handle_t context_handle,
252  stream::handle_t stream_handle,
253  launch_configuration_t launch_config,
254  const void** marshalled_arguments)
255 {
256  // It is assumed arguments were already been validated
257 
258  status_t status;
259  const auto&lc = launch_config; // alias for brevity
260 #if CUDA_VERSION >= 12000
261  CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
262  auto launch_attributes_span = span<CUlaunchAttribute>{
263  launch_attributes, sizeof(launch_attributes)/sizeof(launch_attributes[0])
264  };
265  CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
266  status = cuLaunchKernelEx(
267  &full_launch_config,
268  kernel_function_handle,
269  const_cast<void**>(marshalled_arguments),
270  nullptr);
271 #else
272  if (launch_config.has_nondefault_attributes())
273  status = cuLaunchCooperativeKernel(
274  kernel_function_handle,
275  lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
276  lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
277  lc.dynamic_shared_memory_size,
278  stream_handle,
279  const_cast<void**>(marshalled_arguments)
280  );
281  else {
282  static constexpr const auto no_arguments_in_alternative_format = nullptr;
283  // TODO: Consider passing marshalled_arguments in the alternative format
284  status = cuLaunchKernel(
285  kernel_function_handle,
286  lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
287  lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
288  lc.dynamic_shared_memory_size,
289  stream_handle,
290  const_cast<void**>(marshalled_arguments),
291  no_arguments_in_alternative_format
292  );
293  }
294 #endif // CUDA_VERSION >= 12000
295  throw_if_error_lazy(status,
296  ::std::string(" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
297  + " on " + stream::detail_::identify(stream_handle, context_handle, device_id));
298 }
299 
300 
301 template<typename... KernelParameters>
302 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
303 
304  void operator()(
305  const kernel_t& wrapped_kernel,
306  const stream_t& stream,
307  launch_configuration_t launch_config,
308  KernelParameters&&... arguments) const
309  {
310  // It is assumed arguments were already been validated
311 
312 #ifndef NDEBUG
313  if (wrapped_kernel.context() != stream.context()) {
314  throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
315  + " on " + stream::detail_::identify(stream) + ": Different contexts"};
316  }
317  validate_compatibility(wrapped_kernel, launch_config);
318 #endif
319  auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
320  auto function_handle = wrapped_kernel.handle();
321  CAW_SET_SCOPE_CONTEXT(stream.context_handle());
322 
323  enqueue_kernel_launch_by_handle_in_current_context(
324  function_handle, stream.device_id(), stream.context_handle(),
325  stream.handle(), launch_config, marshalled_arguments.data());
326  } // operator()
327 };
328 
329 template<typename RawKernelFunction, typename... KernelParameters>
330 void enqueue_launch(
331  bool_constant<false>, // Not a wrapped contextual kernel,
332  bool_constant<false>, // and not a library kernel, so it must be a raw kernel function
333  RawKernelFunction&& kernel_function,
334  const stream_t& stream,
335  launch_configuration_t launch_configuration,
336  KernelParameters&&... parameters)
337 {
338  // It is assumed arguments were already been validated
339 
340  // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
341  // and not have trouble enqueueing into a stream in another context - it balks at doing so under
342  // certain conditions, so we must place ourselves in the stream's context.
343  CAW_SET_SCOPE_CONTEXT(stream.context_handle());
344  detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
345  kernel_function, stream.device_id(), stream.context_handle(), stream.handle(), launch_configuration,
346  ::std::forward<KernelParameters>(parameters)...);
347 }
348 
349 template<typename Kernel, typename... KernelParameters>
350 void enqueue_launch(
351  bool_constant<true>, // a kernel wrapped in a kernel_t (sub)class
352  bool_constant<false>, // Not a library kernel
353  Kernel&& kernel,
354  const stream_t& stream,
355  launch_configuration_t launch_configuration,
356  KernelParameters&&... parameters)
357 {
358  // It is assumed arguments were already been validated - except for:
359 #ifndef NDEBUG
360  if (kernel.context() != stream.context()) {
361  throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
362  + " on " + stream::detail_::identify(stream) + ": Different contexts"};
363  }
364  detail_::validate_compatibility(kernel, launch_configuration);
365 #endif // #ifndef NDEBUG
366 
367  enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
368  ::std::forward<Kernel>(kernel), stream, launch_configuration,
369  ::std::forward<KernelParameters>(parameters)...);
370 }
371 
372 #if CUDA_VERSION >= 12000
373 template<typename Kernel, typename... KernelParameters>
374 void enqueue_launch(
375  bool_constant<false>, // Not a wrapped contextual kernel,
376  bool_constant<true>, // but a library kernel
377  Kernel&& kernel,
378  const stream_t& stream,
379  launch_configuration_t launch_configuration,
380  KernelParameters&&... parameters)
381 {
382  // Launch configuration is assumed to have been validated separately
383  // from the kernel, and their compatibility will be validated further
384  // inside, against the contextualized kernel
385 
386  kernel_t contextualized = cuda::contextualize(kernel, stream.context());
387  enqueue_launch_helper<kernel_t, KernelParameters...> {}(
388  contextualized, stream, launch_configuration,
389  ::std::forward<KernelParameters>(parameters)...);
390 }
391 #endif // CUDA_VERSION >= 12000
392 
393 } // namespace detail_
394 
395 template<typename Kernel, typename... KernelParameters>
396 void launch(
397  Kernel&& kernel,
398  launch_configuration_t launch_configuration,
399  KernelParameters&&... parameters)
400 {
401  // Argument validation will occur within call to enqueue_launch
402 
403  auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
404  auto stream = primary_context.default_stream();
405 
406  // Note: If Kernel is a kernel_t, and its associated device is different
407  // than the current device, the next call will fail:
408 
409  enqueue_launch(kernel, stream, launch_configuration, ::std::forward<KernelParameters>(parameters)...);
410 }
411 
412 template <typename SpanOfConstVoidPtrLike>
413 inline void launch_type_erased(
414  const kernel_t& kernel,
415  const stream_t& stream,
416  launch_configuration_t launch_configuration,
417  SpanOfConstVoidPtrLike marshalled_arguments)
418 {
419  // Note: We assume that kernel, stream and launch_configuration have already been validated.
420  static_assert(
421  ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
422  ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
423  "The element type of the marshalled arguments container type must be either void* or const void*");
424 #ifndef NDEBUG
425  if (kernel.context() != stream.context()) {
426  throw ::std::invalid_argument{"Attempt to launch " + kernel::detail_::identify(kernel)
427  + " on " + stream::detail_::identify(stream) + ": Different contexts"};
428  }
429  detail_::validate_compatibility(kernel, launch_configuration);
430  detail_::validate(launch_configuration);
431  if (*(marshalled_arguments.end() - 1) != nullptr) {
432  throw ::std::invalid_argument("marshalled arguments for a kernel launch must end with a nullptr element");
433  }
434 #endif
435  CAW_SET_SCOPE_CONTEXT(stream.context_handle());
436  return detail_::enqueue_kernel_launch_by_handle_in_current_context(
437  kernel.handle(),
438  stream.device_id(),
439  stream.context_handle(),
440  stream.handle(),
441  launch_configuration,
442  static_cast<const void**>(marshalled_arguments.data()));
443 }
444 
445 #if CUDA_VERSION >= 12000
446 template <typename SpanOfConstVoidPtrLike>
447 void launch_type_erased(
448  const library::kernel_t& kernel,
449  const stream_t& stream,
450  launch_configuration_t launch_configuration,
451  SpanOfConstVoidPtrLike marshalled_arguments)
452 {
453  // Argument validation will occur inside the call to launch_type_erased
454  auto contextualized = contextualize(kernel, stream.context());
455  launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
456 }
457 #endif // CUDA_VERSION >= 12000
458 
459 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
460 
461 #if defined(__CUDACC__)
462 
463 // Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy
464 // from code compiled with a host-side-only compiler! See cuda_runtime.h for details
465 
466 #if CUDA_VERSION >= 10000
467 namespace detail_ {
468 
469 template <typename UnaryFunction>
470 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
471  const void * ptr,
472  device::id_t device_id,
473  UnaryFunction block_size_to_dynamic_shared_mem_size,
474  grid::block_dimension_t block_size_limit,
475  bool disable_caching_override)
476 {
477  int min_grid_size_in_blocks { 0 };
478  int block_size { 0 };
479  // Note: only initializing the values her because of a
480  // spurious (?) compiler warning about potential uninitialized use.
481  auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
482  &min_grid_size_in_blocks, &block_size,
483  ptr,
484  block_size_to_dynamic_shared_mem_size,
485  static_cast<int>(block_size_limit),
486  disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
487  );
488  throw_if_error_lazy(result,
489  "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
490  " on device " + ::std::to_string(device_id) + ".");
491  return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
492 }
493 
494 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
495  const void * ptr,
496  device::id_t device_id,
497  memory::shared::size_t dynamic_shared_mem_size,
498  grid::block_dimension_t block_size_limit,
499  bool disable_caching_override)
500 {
501  auto always_need_same_shared_mem_size =
502  [dynamic_shared_mem_size](::size_t) { return dynamic_shared_mem_size; };
503  return min_grid_params_for_max_occupancy(
504  ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override);
505 }
506 
507 } // namespace detail_
508 
509 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
510  const kernel::apriori_compiled_t& kernel,
511  memory::shared::size_t dynamic_shared_memory_size,
512  grid::block_dimension_t block_size_limit,
513  bool disable_caching_override)
514 {
515  return detail_::min_grid_params_for_max_occupancy(
516  kernel.ptr(), kernel.device().id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override);
517 }
518 
519 template <typename UnaryFunction>
520 grid::composite_dimensions_t min_grid_params_for_max_occupancy(
521  const kernel::apriori_compiled_t& kernel,
522  UnaryFunction block_size_to_dynamic_shared_mem_size,
523  grid::block_dimension_t block_size_limit,
524  bool disable_caching_override)
525 {
526  return detail_::min_grid_params_for_max_occupancy(
527  kernel.ptr(), kernel.device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
528 }
529 #endif // CUDA_VERSION >= 10000
530 
531 #endif // defined(__CUDACC__)
532 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
533 
534 } // namespace cuda
535 
536 #endif // MULTI_WRAPPER_IMPLS_LAUNCH_HPP_
537 
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:272
Proxy class for a CUDA stream.
Definition: stream.hpp:258
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:269
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
A trait characterizing those types which can be used as kernel parameters.
Definition: kernel_launch.hpp:76
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions.
Definition: types.hpp:339
device::id_t id() const noexcept
Return the proxied device&#39;s ID.
Definition: device.hpp:594
device_t device() const noexcept
Get (a proxy for) the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:28
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:396
A richer (kind-of-a-)wrapper for CUDA&#39;s dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard ker...
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:78
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id.
Definition: device.hpp:832
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver&#39;s launch...
Definition: kernel_launch.hpp:413
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:91
CUstream handle_t
The CUDA driver&#39;s raw handle for streams.
Definition: types.hpp:236
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
context_t context() const noexcept
The context in which this stream was defined.
Definition: stream.hpp:135
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device.
Definition: device.hpp:356
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:275