8 #ifndef MULTI_WRAPPER_IMPLS_LAUNCH_HPP_     9 #define MULTI_WRAPPER_IMPLS_LAUNCH_HPP_    12 #include "../types.hpp"    13 #include "../memory.hpp"    14 #include "../stream.hpp"    15 #include "../kernel_launch.hpp"    16 #include "../pointer.hpp"    17 #include "../device.hpp"    20 #include <cuda_runtime.h>    24 template<
typename Kernel, 
typename... KernelParameters>
    29     KernelParameters&&...   parameters)
    32         detail_::all_true<::std::is_trivially_copy_constructible<detail_::kernel_parameter_decay_t<KernelParameters>>::value...>::value,
    33         "All kernel parameter types must be of a trivially copy-constructible (decayed) type." );
    34     static constexpr 
const bool wrapped_contextual_kernel = ::std::is_base_of<kernel_t, typename ::std::decay<Kernel>::type>::value;
    35 #if CUDA_VERSION >= 12000    36     static constexpr 
const bool library_kernel = cuda::detail_::is_library_kernel<Kernel>::value;
    38     static constexpr 
const bool library_kernel = 
false;
    39 #endif // CUDA_VERSION >= 12000    43     detail_::validate(launch_configuration);
    49     detail_::enqueue_launch<Kernel, KernelParameters...>(
    50         detail_::bool_constant<wrapped_contextual_kernel>{},
    51         detail_::bool_constant<library_kernel>{},
    52         ::std::forward<Kernel>(kernel), stream, launch_configuration,
    53         ::std::forward<KernelParameters>(parameters)...);
    58 inline void validate_shared_mem_compatibility(
    62     if (shared_mem_size == 0) { 
return; }
    68     if (shared_mem_size > max_shared) {
    69         throw ::std::invalid_argument(
    70             "A dynamic shared memory size of " + ::std::to_string(shared_mem_size)
    71             + 
" bytes exceeds the device maximum of " + ::std::to_string(max_shared));
    75 inline void validate_compatibility(
    78     bool                          cooperative_launch,
    79     optional<grid::dimensions_t>  block_cluster_dimensions) noexcept(
false)
    83         throw ::std::runtime_error(device::detail_::identify(device_id)
    84             + 
" cannot launch kernels with inter-block cooperation");
    86     validate_shared_mem_compatibility(device, shared_mem_size);
    87     if (block_cluster_dimensions) {
    88 #if CUDA_VERSION >= 12000    89         if (not device.supports_block_clustering()) {
    90             throw ::std::runtime_error(device::detail_::identify(device_id)
    91                 + 
" cannot launch kernels with inter-block cooperation");
   103         throw ::std::runtime_error(
"Block clusters are not supported with CUDA versions earlier than 12.0");
   104 #endif // CUDA_VERSION >= 12000   112 template <
typename Dims>
   113 inline void validate_any_dimensions_compatibility(
   116     const char* kind) noexcept(
false)
   118     auto device_id = device.
id();
   122                 throw ::std::invalid_argument(
   123                     ::std::string(
"specified ") + kind + 
" " + axis + 
"-axis dimension " + ::std::to_string(dim)
   124                     + 
" exceeds the maximum supported " + axis + 
" dimension of " + ::std::to_string(max)
   125                     + 
" for " + device::detail_::identify(device_id));
   128     check(dims.x, maxima.x, 
"X");
   129     check(dims.y, maxima.y, 
"Y");
   130     check(dims.z, maxima.z, 
"Z");
   133 inline void validate_block_dimension_compatibility(
   137     auto max_block_size = device.maximum_threads_per_block();
   138     auto volume = block_dims.
volume();
   139     if (volume > max_block_size) {
   140         throw ::std::invalid_argument(
   141             "Specified block dimensions result in blocks of size " + ::std::to_string(volume)
   142             + 
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
   143             + 
" for " + device::detail_::identify(device.
id()));
   147         static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y)),
   148         static_cast<grid::block_dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z))
   150     validate_any_dimensions_compatibility(device, block_dims, maxima, 
"block");
// Validates that requested grid dimensions do not exceed the device's maxima on
// each axis, by delegating to validate_any_dimensions_compatibility with
// kind = "grid".
// NOTE(review): extraction elided this function's parameter list, opening brace,
// the X-axis entry of `maxima`, and the closing lines; only fragments remain.
// NOTE(review): the value passed below is named `block_dims` although it holds
// *grid* dimensions — presumably carried over from
// validate_block_dimension_compatibility; consider renaming to `grid_dims` in
// the real source (a local parameter rename, safe for callers).
   153 inline void validate_grid_dimension_compatibility(
   159         static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y)),
   160         static_cast<grid::dimension_t>(device.
get_attribute(CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z))
   162     validate_any_dimensions_compatibility(device, block_dims, maxima, 
"grid");
   166 inline void validate_shared_mem_size_compatibility(
   170     if (shared_mem_size == 0) { 
return; }
   171     auto max_shared = kernel_ptr.get_maximum_dynamic_shared_memory_per_block();
   172     if (shared_mem_size > max_shared) {
   173         throw ::std::invalid_argument(
   174             "Requested dynamic shared memory size "   175             + ::std::to_string(shared_mem_size) + 
" exceeds kernel's maximum allowed value of "   176             + ::std::to_string(max_shared));
   180 inline void validate_block_dimension_compatibility(
   185     auto volume = block_dims.
volume();
   186     if (volume > max_block_size) {
   187         throw ::std::invalid_argument(
   188             "specified block dimensions result in blocks of size " + ::std::to_string(volume)
   189             + 
", exceeding the maximum possible block size of " + ::std::to_string(max_block_size)
   190             + 
" for " + kernel::detail_::identify(kernel));
   194 inline void validate_dyanmic_shared_memory_size(
   199         kernel::attribute_t::CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
   200     if (dynamic_shared_memory_size > max_dyn_shmem) {
   201         throw ::std::invalid_argument(
   202             "specified size of dynamic shared memory, " + ::std::to_string(dynamic_shared_memory_size)
   203             + 
"bytes, exceeds the maximum supported by  " + kernel::detail_::identify(kernel)
   204             + 
", " + ::std::to_string(max_dyn_shmem) + 
" bytes");
   209 template<
typename... KernelParameters>
   210 void enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...>::operator()(
   214     KernelParameters &&...            parameters)
 const   216     using raw_kernel_t = 
typename kernel::detail_::raw_kernel_typegen<KernelParameters ...>::type;
   217     auto unwrapped_kernel_function = 
reinterpret_cast<raw_kernel_t
>(
const_cast<void *
>(wrapped_kernel.
ptr()));
   229     detail_::enqueue_raw_kernel_launch_in_current_context(
   230         unwrapped_kernel_function,
   234         launch_configuration,
   235         ::std::forward<KernelParameters>(parameters)...);
   238 template<
typename... KernelParameters>
   239 ::std::array<
const void*, 
sizeof...(KernelParameters)>
   240 marshal_dynamic_kernel_arguments(KernelParameters&&... parameters)
   242     return ::std::array<
const void*, 
sizeof...(KernelParameters)> { ¶meters... };
   246 inline void enqueue_kernel_launch_by_handle_in_current_context(
   247     kernel::handle_t        kernel_function_handle,
   252     const void**            marshalled_arguments)
   257     const auto&lc = launch_config; 
   258 #if CUDA_VERSION >= 12000   259     CUlaunchAttribute launch_attributes[detail_::maximum_possible_kernel_launch_attributes+1];
   260     auto launch_attributes_span = span<CUlaunchAttribute>{
   261         launch_attributes, 
sizeof(launch_attributes)/
sizeof(launch_attributes[0])
   263     CUlaunchConfig full_launch_config = detail_::marshal(lc, stream_handle, launch_attributes_span);
   264     status = cuLaunchKernelEx(
   266         kernel_function_handle,
   267         const_cast<void**>(marshalled_arguments),
   271         status = cuLaunchCooperativeKernel(
   272             kernel_function_handle,
   273             lc.dimensions.grid.x,  lc.dimensions.grid.y,  lc.dimensions.grid.z,
   274             lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
   275             lc.dynamic_shared_memory_size,
   277             const_cast<void**>(marshalled_arguments)
   280         static constexpr 
const auto no_arguments_in_alternative_format = 
nullptr;
   282         status = cuLaunchKernel(
   283             kernel_function_handle,
   284             lc.dimensions.grid.x,  lc.dimensions.grid.y,  lc.dimensions.grid.z,
   285             lc.dimensions.block.x, lc.dimensions.block.y, lc.dimensions.block.z,
   286             lc.dynamic_shared_memory_size,
   288             const_cast<void**>(marshalled_arguments),
   289             no_arguments_in_alternative_format
   292 #endif // CUDA_VERSION >= 12000   294         ::std::string(
" kernel launch failed for ") + kernel::detail_::identify(kernel_function_handle)
   295         + 
" on " + stream::detail_::identify(stream_handle, context_handle, device_id));
   299 template<
typename... KernelParameters>
   300 struct enqueue_launch_helper<kernel_t, KernelParameters...> {
   303     const kernel_t&                       wrapped_kernel,
   306     KernelParameters&&...                 arguments)
 const   312             throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(wrapped_kernel)
   313                 + 
" on " + stream::detail_::identify(stream) + 
": Different contexts"};
   315         validate_compatibility(wrapped_kernel, launch_config);
   317         auto marshalled_arguments { marshal_dynamic_kernel_arguments(::std::forward<KernelParameters>(arguments)...) };
   318         auto function_handle = wrapped_kernel.
handle();
   321         enqueue_kernel_launch_by_handle_in_current_context(
   323             stream.
handle(), launch_config, marshalled_arguments.data());
   327 template<
typename RawKernelFunction, 
typename... KernelParameters>
   329     bool_constant<false>, 
   330     bool_constant<false>, 
   331     RawKernelFunction&&       kernel_function,
   334     KernelParameters&&...     parameters)
   342     detail_::enqueue_raw_kernel_launch_in_current_context<RawKernelFunction, KernelParameters...>(
   344         ::std::forward<KernelParameters>(parameters)...);
   347 template<
typename Kernel, 
typename... KernelParameters>
   350     bool_constant<false>, 
   354     KernelParameters&&...   parameters)
   359         throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
   360             + 
" on " + stream::detail_::identify(stream) + 
": Different contexts"};
   362     detail_::validate_compatibility(kernel, launch_configuration);
   363 #endif // #ifndef NDEBUG   365     enqueue_launch_helper<typename ::std::decay<Kernel>::type, KernelParameters...>{}(
   366         ::std::forward<Kernel>(kernel), stream, launch_configuration,
   367         ::std::forward<KernelParameters>(parameters)...);
   370 #if CUDA_VERSION >= 12000   371 template<
typename Kernel, 
typename... KernelParameters>
   373     bool_constant<false>, 
   378     KernelParameters&&...   parameters)
   384     kernel_t contextualized = cuda::contextualize(kernel, stream.
context());
   385     enqueue_launch_helper<kernel_t, KernelParameters...> {}(
   386         contextualized, stream, launch_configuration,
   387         ::std::forward<KernelParameters>(parameters)...);
   389 #endif // CUDA_VERSION >= 12000   393 template<
typename Kernel, 
typename... KernelParameters>
   397     KernelParameters&&...   parameters)
   401     auto primary_context = detail_::get_implicit_primary_context(::std::forward<Kernel>(kernel));
   402     auto stream = primary_context.default_stream();
   407     enqueue_launch(kernel, stream, launch_configuration, ::std::forward<KernelParameters>(parameters)...);
   410 template <
typename SpanOfConstVo
idPtrLike>
   415     SpanOfConstVoidPtrLike  marshalled_arguments)
   419         ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, void*>::value or
   420         ::std::is_same<typename SpanOfConstVoidPtrLike::value_type, const void*>::value,
   421         "The element type of the marshalled arguments container type must be either void* or const void*");
   424         throw ::std::invalid_argument{
"Attempt to launch " + kernel::detail_::identify(kernel)
   425             + 
" on " + stream::detail_::identify(stream) + 
": Different contexts"};
   427     detail_::validate_compatibility(kernel, launch_configuration);
   428     detail_::validate(launch_configuration);
   429     if (*(marshalled_arguments.end() - 1) != 
nullptr) {
   430         throw ::std::invalid_argument(
"marshalled arguments for a kernel launch must end with a nullptr element");
   434     return detail_::enqueue_kernel_launch_by_handle_in_current_context(
   439         launch_configuration,
   440         static_cast<const void**
>(marshalled_arguments.data()));
   443 #if CUDA_VERSION >= 12000   444 template <
typename SpanOfConstVo
idPtrLike>
   446     const library::kernel_t&  kernel,
   449     SpanOfConstVoidPtrLike    marshalled_arguments)
   452     auto contextualized = contextualize(kernel, stream.
context());
   453     launch_type_erased(contextualized, stream, launch_configuration, marshalled_arguments);
   455 #endif // CUDA_VERSION >= 12000   457 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   459 #if defined(__CUDACC__)   464 #if CUDA_VERSION >= 10000   467 template <
typename UnaryFunction>
   471     UnaryFunction            block_size_to_dynamic_shared_mem_size,
   473     bool                     disable_caching_override)
   475     int min_grid_size_in_blocks { 0 };
   476     int block_size { 0 };
   479     auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
   480         &min_grid_size_in_blocks, &block_size,
   482         block_size_to_dynamic_shared_mem_size,
   483         static_cast<int>(block_size_limit),
   484         disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
   487         "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr) +
   488             " on device " + ::std::to_string(device_id) + 
".");
   497     bool                     disable_caching_override)
   499     auto always_need_same_shared_mem_size =
   500         [dynamic_shared_mem_size](
::size_t) { 
return dynamic_shared_mem_size; };
   501     return min_grid_params_for_max_occupancy(
   502         ptr, device_id, always_need_same_shared_mem_size, block_size_limit, disable_caching_override);
   511     bool                              disable_caching_override)
   513     return detail_::min_grid_params_for_max_occupancy(
   514         kernel.
ptr(), kernel.
device().
id(), dynamic_shared_memory_size, block_size_limit, disable_caching_override);
   517 template <
typename UnaryFunction>
   520     UnaryFunction                     block_size_to_dynamic_shared_mem_size,
   522     bool                              disable_caching_override)
   524     return detail_::min_grid_params_for_max_occupancy(
   525         kernel.
ptr(), kernel.
device_id(), block_size_to_dynamic_shared_mem_size, block_size_limit, disable_caching_override);
   527 #endif // CUDA_VERSION >= 10000   529 #endif // defined(__CUDACC__)   530 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   534 #endif // MULTI_WRAPPER_IMPLS_LAUNCH_HPP_ context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined. 
Definition: stream.hpp:260
 
Proxy class for a CUDA stream. 
Definition: stream.hpp:246
 
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions. 
Definition: types.hpp:299
 
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel. 
Definition: kernel.hpp:181
 
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps. 
Definition: stream.hpp:257
 
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
 
Definitions and functionality wrapping CUDA APIs. 
Definition: array.hpp:22
 
The full set of possible configuration parameters for launching a kernel on a GPU. 
Definition: launch_configuration.hpp:69
 
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}. 
Definition: types.hpp:878
 
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions. 
Definition: types.hpp:312
 
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API. 
Definition: types.hpp:850
 
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions. 
Definition: types.hpp:342
 
device::id_t id() const noexcept
Return the proxied device's ID. 
Definition: device.hpp:594
 
device_t device() const noexcept
Get (a proxy for) the device for (a context of) which this kernel is defined. 
Definition: kernel.hpp:28
 
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
 
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394
 
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:325
 
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
 
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device. 
Definition: kernel_launch.hpp:25
 
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard ker...
 
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
 
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id. 
Definition: device.hpp:837
 
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
 
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
 
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
 
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435
 
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:411
 
size_t dimension_t
An individual dimension extent for an array. 
Definition: types.hpp:94
 
CUstream handle_t
The CUDA driver's raw handle for streams. 
Definition: types.hpp:239
 
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
 
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
 
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined. 
Definition: kernel.hpp:169
 
context_t context() const noexcept
The context in which this stream was defined. 
Definition: stream.hpp:135
 
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined. 
Definition: kernel.hpp:22
 
Wrapper class for a CUDA device. 
Definition: device.hpp:135
 
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device. 
Definition: device.hpp:356
 
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
 
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined. 
Definition: stream.hpp:263