8 #ifndef MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ 9 #define MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ 12 #include "../types.hpp" 13 #include "../memory.hpp" 14 #include "../stream.hpp" 15 #include "../kernel_launch.hpp" 16 #include "../pointer.hpp" 17 #include "../device.hpp" 20 #include <cuda_runtime.h> 24 template<
typename Kernel,
typename... KernelParameters>
29 KernelParameters&&... parameters)
33 "All kernel parameter types must fulfill the CUDA kernel argument requirements. " 34 "Refer to the documentation of 'cuda::traits::is_valid_kernel_argument' for more details." 36 static constexpr
const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
37 #if CUDA_VERSION >= 12000 38 static constexpr
const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
40 static constexpr
const bool library_kernel =
false;
41 #endif // CUDA_VERSION >= 12000 45 detail_::validate(launch_configuration);
51 detail_::enqueue_launch<Kernel, KernelParameters...>(
52 detail_::bool_constant<wrapped_contextual_kernel>{},
53 detail_::bool_constant<library_kernel>{},
54 ::std::forward<Kernel>(kernel), stream, launch_configuration,
55 ::std::forward<KernelParameters>(parameters)...);
60 inline void validate_shared_mem_compatibility(
64 if (shared_mem_size == 0) {
return; }
70 if (shared_mem_size > max_shared) {
71 throw ::std::invalid_argument(
72 "A dynamic shared memory size of " + ::std::to_string(shared_mem_size)
73 +
" bytes exceeds the device maximum of " + ::std::to_string(max_shared));
77 inline void validate_compatibility(
80 bool cooperative_launch,
81 optional<grid::dimensions_t> block_cluster_dimensions) noexcept(
false)
85 throw ::std::runtime_error(device::detail_::identify(device_id)
86 +
" cannot launch kernels with inter-block cooperation");
88 validate_shared_mem_compatibility(device, shared_mem_size);
89 if (block_cluster_dimensions) {
90 #if CUDA_VERSION >= 12000 91 if (not device.supports_block_clustering()) {
92 throw ::std::runtime_error(device::detail_::identify(device_id)
93 +
" cannot launch kernels with inter-block cooperation");
105 throw ::std::runtime_error(
"Block clusters are not supported with CUDA versions earlier than 12.0");
106 #endif // CUDA_VERSION >= 12000 114 template <
typename Dims>
115 inline void validate_any_dimensions_compatibility(
118 const char* kind) noexcept(
false)
120 auto device_id = device.
id();
124 throw ::std::invalid_argument(
125 ::std::string(
"specified ") + kind +
" " + axis +
"-axis dimension " + ::std::to_string(dim)
126 +
" exceeds the maximum supported " + axis +
" dimension of " + ::std::to_string(max)
127 +
" for " + device::detail_::identify(device_id));
130 check(dims.x, maxima.x,
"X");
131 check(dims.y, maxima.y,
"Y");
132 check(dims.z, maxima.z,
"Z");
135 inline void validate_block_dimension_compatibility(
139 auto max_block_size = device.maximum_threads_per_block();
140 auto volume = block_dims.
volume();
141 if (volume > max_block_size) {
142 throw ::std::invalid_argument(
143 "Specified block dimensions result in blocks of size " + ::std::to_string(volume)
144 +
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
145 +
" for " + device::detail_::identify(device.
id()));
149 static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
150 static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
152 validate_any_dimensions_compatibility(device, block_dims, maxima,
"block");
155 inline void validate_grid_dimension_compatibility(
161 static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
162 static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
164 validate_any_dimensions_compatibility(device, block_dims, maxima,
"grid");
168 inline void validate_shared_mem_size_compatibility(
172 if (shared_mem_size == 0) {
return; }
173 auto max_shared = kernel_ptr.get_maximum_dynamic_shared_memory_per_block();
174 if (shared_mem_size > max_shared) {
175 throw ::std::invalid_argument(
176 "Requested dynamic shared memory size " 177 + ::std::to_string(shared_mem_size) +
" exceeds kernel's maximum allowed value of " 178 + ::std::to_string(max_shared));
182 inline void validate_block_dimension_compatibility(
187 auto volume = block_dims.
volume();
188 if (volume > max_block_size) {
189 throw ::std::invalid_argument(
190 "specified block dimensions result in blocks of size " + ::std::to_string(volume)
191 +
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
192 +
" for " + kernel::detail_::identify(kernel));
196 inline void validate_dyanmic_shared_memory_size(
201 kernel::attribute_t::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
202 if (dynamic_shared_memory_size > max_dyn_shmem) {
203 throw ::std::invalid_argument(
204 "specified size of dynamic shared memory, " + ::std::to_string(dynamic_shared_memory_size)
205 +
"bytes, exceeds the maximum supported by " + kernel::detail_::identify(kernel)
206 +
", " + ::std::to_string(max_dyn_shmem) +
" bytes");
211 template<
typename... KernelParameters>
212 void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::operator()(
216 KernelParameters &&... parameters)
const 218 using raw_kernel_t =
typename kernel::detail_::raw_kernel_typegen<KernelParameters ...>::type;
219 auto unwrapped_kernel_function =
reinterpret_cast<raw_kernel_t
>(
const_cast<void *
>(wrapped_kernel.
ptr()));
231 detail_::enqueue_raw_kernel_launch_in_current_context(
232 unwrapped_kernel_function,
236 launch_configuration,
237 ::std::forward<KernelParameters>(parameters)...);
240 template<
typename... KernelParameters>
241 ::std::array<
const void*,
sizeof...(KernelParameters)>
242 marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
244 return ::std::array<
const void*,
sizeof...(KernelParameters)> { ¶meters... };
248 inline void enqueue_kernel_launch_by_handle_in_current_context(
249 kernel::handle_t kernel_function_handle,
254 const void** marshalled_arguments)
259 const auto&lc = launch_config;
260 #if CUDA_VERSION >= 12000 261 CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
262 auto launch_attributes_span = span<CUlaunchAttribute>{
263 launch_attributes,
sizeof(launch_attributes)/
sizeof(launch_attributes[0])
265 CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
266 status = cuLaunchKernelEx(
268 kernel_function_handle,
269 const_cast<void**>(marshalled_arguments),
273 status = cuLaunchCooperativeKernel(
274 kernel_function_handle,
275 lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
276 lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
277 lc.dynamic_shared_memory_size,
279 const_cast<void**>(marshalled_arguments)
282 static constexpr
const auto no_arguments_in_alternative_format =
nullptr;
284 status = cuLaunchKernel(
285 kernel_function_handle,
286 lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
287 lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
288 lc.dynamic_shared_memory_size,
290 const_cast<void**>(marshalled_arguments),
291 no_arguments_in_alternative_format
294 #endif // CUDA_VERSION >= 12000 296 ::std::string(
" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
297 +
" on " + stream::detail_::identify(stream_handle, context_handle, device_id));
301 template<
typename... KernelParameters>
302 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
305 const kernel_t& wrapped_kernel,
308 KernelParameters&&... arguments)
const 314 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
315 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
317 validate_compatibility(wrapped_kernel, launch_config);
319 auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
320 auto function_handle = wrapped_kernel.
handle();
323 enqueue_kernel_launch_by_handle_in_current_context(
325 stream.
handle(), launch_config, marshalled_arguments.data());
329 template<
typename RawKernelFunction,
typename... KernelParameters>
331 bool_constant<false>,
332 bool_constant<false>,
333 RawKernelFunction&& kernel_function,
336 KernelParameters&&... parameters)
344 detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
346 ::std::forward<KernelParameters>(parameters)...);
349 template<
typename Kernel,
typename... KernelParameters>
352 bool_constant<false>,
356 KernelParameters&&... parameters)
361 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
362 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
364 detail_::validate_compatibility(kernel, launch_configuration);
365 #endif // #ifndef NDEBUG 367 enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
368 ::std::forward<Kernel>(kernel), stream, launch_configuration,
369 ::std::forward<KernelParameters>(parameters)...);
372 #if CUDA_VERSION >= 12000 373 template<
typename Kernel,
typename... KernelParameters>
375 bool_constant<false>,
380 KernelParameters&&... parameters)
386 kernel_t contextualized = cuda::contextualize(kernel, stream.
context());
387 enqueue_launch_helper<kernel_t, KernelParameters...> {}(
388 contextualized, stream, launch_configuration,
389 ::std::forward<KernelParameters>(parameters)...);
391 #endif // CUDA_VERSION >= 12000 395 template<
typename Kernel,
typename... KernelParameters>
399 KernelParameters&&... parameters)
403 auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
404 auto stream = primary_context.default_stream();
409 enqueue_launch(kernel, stream, launch_configuration, ::std::forward<KernelParameters>(parameters)...);
412 template <
typename SpanOfConstVo
idPtrLike>
417 SpanOfConstVoidPtrLike marshalled_arguments)
421 ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
422 ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
423 "The element type of the marshalled arguments container type must be either void* or const void*");
426 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
427 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
429 detail_::validate_compatibility(kernel, launch_configuration);
430 detail_::validate(launch_configuration);
431 if (*(marshalled_arguments.end() - 1) !=
nullptr) {
432 throw ::std::invalid_argument(
"marshalled arguments for a kernel launch must end with a nullptr element");
436 return detail_::enqueue_kernel_launch_by_handle_in_current_context(
441 launch_configuration,
442 static_cast<const void**
>(marshalled_arguments.data()));
445 #if CUDA_VERSION >= 12000 446 template <
typename SpanOfConstVo
idPtrLike>
448 const library::kernel_t& kernel,
451 SpanOfConstVoidPtrLike marshalled_arguments)
454 auto contextualized = contextualize(kernel, stream.
context());
455 launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
457 #endif // CUDA_VERSION >= 12000 459 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 461 #if defined(__CUDACC__) 466 #if CUDA_VERSION >= 10000 469 template <
typename UnaryFunction>
473 UnaryFunction block_size_to_dynamic_shared_mem_size,
475 bool disable_caching_override)
477 int min_grid_size_in_blocks { 0 };
478 int block_size { 0 };
481 auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
482 &min_grid_size_in_blocks, &block_size,
484 block_size_to_dynamic_shared_mem_size,
485 static_cast<int>(block_size_limit),
486 disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
489 "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
490 " on device " + ::std::to_string(device_id) +
".");
499 bool disable_caching_override)
501 auto always_need_same_shared_mem_size =
502 [dynamic_shared_mem_size](
::size_t) {
return dynamic_shared_mem_size; };
503 return min_grid_params_for_max_occupancy(
504 ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override);
513 bool disable_caching_override)
515 return detail_::min_grid_params_for_max_occupancy(
516 kernel.
ptr(), kernel.
device().
id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override);
519 template <
typename UnaryFunction>
522 UnaryFunction block_size_to_dynamic_shared_mem_size,
524 bool disable_caching_override)
526 return detail_::min_grid_params_for_max_occupancy(
527 kernel.
ptr(), kernel.
device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
529 #endif // CUDA_VERSION >= 10000 531 #endif // defined(__CUDACC__) 532 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 536 #endif // MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:272
Proxy class for a CUDA stream.
Definition: stream.hpp:258
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:269
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
A trait characterizing those types which can be used as kernel parameters.
Definition: kernel_launch.hpp:76
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions.
Definition: types.hpp:339
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
device_t device() const noexcept
Get (a proxy for) the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:28
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:396
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard ker...
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:78
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id.
Definition: device.hpp:832
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:413
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:91
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
context_t context() const noexcept
The context in which this stream was defined.
Definition: stream.hpp:135
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device.
Definition: device.hpp:356
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:275