8 #ifndef MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ 9 #define MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ 12 #include "../types.hpp" 13 #include "../memory.hpp" 14 #include "../stream.hpp" 15 #include "../kernel_launch.hpp" 16 #include "../pointer.hpp" 17 #include "../device.hpp" 20 #include <cuda_runtime.h> 24 template<
typename Kernel,
typename... KernelParameters>
29 KernelParameters&&... parameters)
32 detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
33 "All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
34 static constexpr
const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
35 #if CUDA_VERSION >= 12000 36 static constexpr
const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
38 static constexpr
const bool library_kernel =
false;
39 #endif // CUDA_VERSION >= 12000 43 detail_::validate(launch_configuration);
49 detail_::enqueue_launch<Kernel, KernelParameters...>(
50 detail_::bool_constant<wrapped_contextual_kernel>{},
51 detail_::bool_constant<library_kernel>{},
52 ::std::forward<Kernel>(kernel), stream, launch_configuration,
53 ::std::forward<KernelParameters>(parameters)...);
58 inline void validate_shared_mem_compatibility(
62 if (shared_mem_size == 0) {
return; }
68 if (shared_mem_size > max_shared) {
69 throw ::std::invalid_argument(
70 "A dynamic shared memory size of " + ::std::to_string(shared_mem_size)
71 +
" bytes exceeds the device maximum of " + ::std::to_string(max_shared));
75 inline void validate_compatibility(
78 bool cooperative_launch,
79 optional<grid::dimensions_t> block_cluster_dimensions) noexcept(
false)
83 throw ::std::runtime_error(device::detail_::identify(device_id)
84 +
" cannot launch kernels with inter-block cooperation");
86 validate_shared_mem_compatibility(device, shared_mem_size);
87 if (block_cluster_dimensions) {
88 #if CUDA_VERSION >= 12000 89 if (not device.supports_block_clustering()) {
90 throw ::std::runtime_error(device::detail_::identify(device_id)
91 +
" cannot launch kernels with inter-block cooperation");
103 throw ::std::runtime_error(
"Block clusters are not supported with CUDA versions earlier than 12.0");
104 #endif // CUDA_VERSION >= 12000 112 template <
typename Dims>
113 inline void validate_any_dimensions_compatibility(
116 const char* kind) noexcept(
false)
118 auto device_id = device.
id();
122 throw ::std::invalid_argument(
123 ::std::string(
"specified ") + kind +
" " + axis +
"-axis dimension " + ::std::to_string(dim)
124 +
" exceeds the maximum supported " + axis +
" dimension of " + ::std::to_string(max)
125 +
" for " + device::detail_::identify(device_id));
128 check(dims.x, maxima.x,
"X");
129 check(dims.y, maxima.y,
"Y");
130 check(dims.z, maxima.z,
"Z");
133 inline void validate_block_dimension_compatibility(
137 auto max_block_size = device.maximum_threads_per_block();
138 auto volume = block_dims.
volume();
139 if (volume > max_block_size) {
140 throw ::std::invalid_argument(
141 "Specified block dimensions result in blocks of size " + ::std::to_string(volume)
142 +
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
143 +
" for " + device::detail_::identify(device.
id()));
147 static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
148 static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
150 validate_any_dimensions_compatibility(device, block_dims, maxima,
"block");
153 inline void validate_grid_dimension_compatibility(
159 static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
160 static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
162 validate_any_dimensions_compatibility(device, block_dims, maxima,
"grid");
166 inline void validate_shared_mem_size_compatibility(
170 if (shared_mem_size == 0) {
return; }
171 auto max_shared = kernel_ptr.get_maximum_dynamic_shared_memory_per_block();
172 if (shared_mem_size > max_shared) {
173 throw ::std::invalid_argument(
174 "Requested dynamic shared memory size " 175 + ::std::to_string(shared_mem_size) +
" exceeds kernel's maximum allowed value of " 176 + ::std::to_string(max_shared));
180 inline void validate_block_dimension_compatibility(
185 auto volume = block_dims.
volume();
186 if (volume > max_block_size) {
187 throw ::std::invalid_argument(
188 "specified block dimensions result in blocks of size " + ::std::to_string(volume)
189 +
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
190 +
" for " + kernel::detail_::identify(kernel));
194 inline void validate_dyanmic_shared_memory_size(
199 kernel::attribute_t::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
200 if (dynamic_shared_memory_size > max_dyn_shmem) {
201 throw ::std::invalid_argument(
202 "specified size of dynamic shared memory, " + ::std::to_string(dynamic_shared_memory_size)
203 +
"bytes, exceeds the maximum supported by " + kernel::detail_::identify(kernel)
204 +
", " + ::std::to_string(max_dyn_shmem) +
" bytes");
209 template<
typename... KernelParameters>
210 void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::operator()(
214 KernelParameters &&... parameters)
const 216 using raw_kernel_t =
typename kernel::detail_::raw_kernel_typegen<KernelParameters ...>::type;
217 auto unwrapped_kernel_function =
reinterpret_cast<raw_kernel_t
>(
const_cast<void *
>(wrapped_kernel.
ptr()));
229 detail_::enqueue_raw_kernel_launch_in_current_context(
230 unwrapped_kernel_function,
234 launch_configuration,
235 ::std::forward<KernelParameters>(parameters)...);
238 template<
typename... KernelParameters>
239 ::std::array<
const void*,
sizeof...(KernelParameters)>
240 marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
242 return ::std::array<
const void*,
sizeof...(KernelParameters)> { ¶meters... };
246 inline void enqueue_kernel_launch_by_handle_in_current_context(
247 kernel::handle_t kernel_function_handle,
252 const void** marshalled_arguments)
257 const auto&lc = launch_config;
258 #if CUDA_VERSION >= 12000 259 CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
260 auto launch_attributes_span = span<CUlaunchAttribute>{
261 launch_attributes,
sizeof(launch_attributes)/
sizeof(launch_attributes[0])
263 CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
264 status = cuLaunchKernelEx(
266 kernel_function_handle,
267 const_cast<void**>(marshalled_arguments),
271 status = cuLaunchCooperativeKernel(
272 kernel_function_handle,
273 lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
274 lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
275 lc.dynamic_shared_memory_size,
277 const_cast<void**>(marshalled_arguments)
280 static constexpr
const auto no_arguments_in_alternative_format =
nullptr;
282 status = cuLaunchKernel(
283 kernel_function_handle,
284 lc.dimensions.grid.x, lc.dimensions.grid.y, lc.dimensions.grid.z,
285 lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
286 lc.dynamic_shared_memory_size,
288 const_cast<void**>(marshalled_arguments),
289 no_arguments_in_alternative_format
292 #endif // CUDA_VERSION >= 12000 294 ::std::string(
" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
295 +
" on " + stream::detail_::identify(stream_handle, context_handle, device_id));
299 template<
typename... KernelParameters>
300 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
303 const kernel_t& wrapped_kernel,
306 KernelParameters&&... arguments)
const 312 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
313 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
315 validate_compatibility(wrapped_kernel, launch_config);
317 auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
318 auto function_handle = wrapped_kernel.
handle();
321 enqueue_kernel_launch_by_handle_in_current_context(
323 stream.
handle(), launch_config, marshalled_arguments.data());
327 template<
typename RawKernelFunction,
typename... KernelParameters>
329 bool_constant<false>,
330 bool_constant<false>,
331 RawKernelFunction&& kernel_function,
334 KernelParameters&&... parameters)
342 detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
344 ::std::forward<KernelParameters>(parameters)...);
347 template<
typename Kernel,
typename... KernelParameters>
350 bool_constant<false>,
354 KernelParameters&&... parameters)
359 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
360 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
362 detail_::validate_compatibility(kernel, launch_configuration);
363 #endif // #ifndef NDEBUG 365 enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
366 ::std::forward<Kernel>(kernel), stream, launch_configuration,
367 ::std::forward<KernelParameters>(parameters)...);
370 #if CUDA_VERSION >= 12000 371 template<
typename Kernel,
typename... KernelParameters>
373 bool_constant<false>,
378 KernelParameters&&... parameters)
384 kernel_t contextualized = cuda::contextualize(kernel, stream.
context());
385 enqueue_launch_helper<kernel_t, KernelParameters...> {}(
386 contextualized, stream, launch_configuration,
387 ::std::forward<KernelParameters>(parameters)...);
389 #endif // CUDA_VERSION >= 12000 393 template<
typename Kernel,
typename... KernelParameters>
397 KernelParameters&&... parameters)
401 auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
402 auto stream = primary_context.default_stream();
407 enqueue_launch(kernel, stream, launch_configuration, ::std::forward<KernelParameters>(parameters)...);
410 template <
typename SpanOfConstVo
idPtrLike>
415 SpanOfConstVoidPtrLike marshalled_arguments)
419 ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
420 ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
421 "The element type of the marshalled arguments container type must be either void* or const void*");
424 throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
425 +
" on " + stream::detail_::identify(stream) +
": Different contexts"};
427 detail_::validate_compatibility(kernel, launch_configuration);
428 detail_::validate(launch_configuration);
429 if (*(marshalled_arguments.end() - 1) !=
nullptr) {
430 throw ::std::invalid_argument(
"marshalled arguments for a kernel launch must end with a nullptr element");
434 return detail_::enqueue_kernel_launch_by_handle_in_current_context(
439 launch_configuration,
440 static_cast<const void**
>(marshalled_arguments.data()));
443 #if CUDA_VERSION >= 12000 444 template <
typename SpanOfConstVo
idPtrLike>
446 const library::kernel_t& kernel,
449 SpanOfConstVoidPtrLike marshalled_arguments)
452 auto contextualized = contextualize(kernel, stream.
context());
453 launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
455 #endif // CUDA_VERSION >= 12000 457 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 459 #if defined(__CUDACC__) 464 #if CUDA_VERSION >= 10000 467 template <
typename UnaryFunction>
471 UnaryFunction block_size_to_dynamic_shared_mem_size,
473 bool disable_caching_override)
475 int min_grid_size_in_blocks { 0 };
476 int block_size { 0 };
479 auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
480 &min_grid_size_in_blocks, &block_size,
482 block_size_to_dynamic_shared_mem_size,
483 static_cast<int>(block_size_limit),
484 disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
487 "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
488 " on device " + ::std::to_string(device_id) +
".");
497 bool disable_caching_override)
499 auto always_need_same_shared_mem_size =
500 [dynamic_shared_mem_size](
::size_t) {
return dynamic_shared_mem_size; };
501 return min_grid_params_for_max_occupancy(
502 ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override);
511 bool disable_caching_override)
513 return detail_::min_grid_params_for_max_occupancy(
514 kernel.
ptr(), kernel.
device().
id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override);
517 template <
typename UnaryFunction>
520 UnaryFunction block_size_to_dynamic_shared_mem_size,
522 bool disable_caching_override)
524 return detail_::min_grid_params_for_max_occupancy(
525 kernel.
ptr(), kernel.
device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
527 #endif // CUDA_VERSION >= 10000 529 #endif // defined(__CUDACC__) 530 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 534 #endif // MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:260
Proxy class for a CUDA stream.
Definition: stream.hpp:246
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:299
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:257
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions.
Definition: types.hpp:342
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
device_t device() const noexcept
Get (a proxy for) the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:28
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:325
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard ker...
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id.
Definition: device.hpp:837
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:411
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:94
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
context_t context() const noexcept
The context in which this stream was defined.
Definition: stream.hpp:135
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device.
Definition: device.hpp:356
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:263