eyalroz/cuda-api-wrappers/kernel__launch_8hpp_source.html

 #pragma once
 #ifndef CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
 #define CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_

 #include "launch_configuration.hpp"
 #include "kernel.hpp"
 #include "kernels/apriori_compiled.hpp"
 #if CUDA_VERSION >= 12000
 #include "kernels/in_library.hpp"
 #endif


 #if CUDA_VERSION >= 9000
 // The following is necessary for cudaLaunchCooperativeKernel
 #include <cuda_runtime.h>
 #endif // CUDA_VERSION >= 9000

 #include <memory>
 #include <type_traits>
 #include <utility>

 namespace cuda {

 class stream_t;

 /// Named-constructor idiom for grid::dimensions_t: returns the value 1
 /// (implicitly converted to grid::dimensions_t), for use when a launch grid
 /// should be made up of a single block.
 constexpr grid::dimensions_t single_block()
 {
     return 1;
 }
 /// Named-constructor idiom for block dimensions: returns the value 1
 /// (implicitly converted to grid::block_dimensions_t), for use when each
 /// block of a launch grid should be made up of a single thread.
 constexpr grid::block_dimensions_t single_thread_per_block()
 {
     return 1;
 }

 namespace detail_ {

 /// A decay-like type transformation for kernel parameter types.
 ///
 /// The reference (if any) is stripped from P; then:
 ///  - an array type becomes a pointer to its element type;
 ///  - a function type becomes a pointer to that function type;
 ///  - any other type is left as it is. Note that, unlike with ::std::decay,
 ///    cv-qualifiers are not removed.
 template<typename P>
 struct kernel_parameter_decay {
 private:
     // P, less any reference
     using referent_type = typename ::std::remove_reference<P>::type;

     // What we yield when the referent is an array
     using element_ptr_type = typename ::std::remove_extent<referent_type>::type*;

     // What we yield when the referent is not an array
     using non_array_result_type = typename ::std::conditional<
         ::std::is_function<referent_type>::value,
         typename ::std::add_pointer<referent_type>::type,
         referent_type
     >::type;

 public:
     using type = typename ::std::conditional<
         ::std::is_array<referent_type>::value,
         element_ptr_type,
         non_array_result_type
     >::type;
 };

 /// Convenience alias for the result of kernel_parameter_decay<P>.
 template<typename P>
 using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;

 /// Type trait: holds a true value if and only if Fun is a pointer type
 /// whose pointee is a function type.
 template<typename Fun>
 struct is_function_ptr :
     bool_constant<
         ::std::is_pointer<Fun>::value &&
         ::std::is_function<typename ::std::remove_pointer<Fun>::type>::value
     >
 { };

 /// End of the recursion: no arguments are left, so nothing gets written.
 inline void collect_argument_addresses(void**) { }

 /// Writes the addresses of the passed arguments, in order, into consecutive
 /// elements of an array of type-erased pointers.
 ///
 /// @param collected_addresses points to the array element which is to
 ///     receive the address of @p arg ; the addresses of the remaining
 ///     arguments go into the elements following it. The caller must provide
 ///     at least as many elements as there are arguments.
 /// @param arg the argument whose address is stored first
 /// @param args the rest of the arguments, handled recursively
 ///
 /// @note constness of the arguments is cast away in the stored pointers.
 template <typename Arg, typename... Args>
 inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Args&&... args)
 {
     // Using ::std::addressof() rather than the & operator, so that we get the
     // actual address even for a type which overloads unary operator&
     collected_addresses[0] = const_cast<void*>(static_cast<const void*>(::std::addressof(arg)));
     collect_argument_addresses(collected_addresses + 1, ::std::forward<Args>(args)...);
 }

 // For partial template specialization on WrappedKernel...
 //
 // Primary template: a function object type whose call operator takes a
 // kernel, the stream to use, a launch configuration and the kernel's
 // parameters. Only the declaration of operator() appears here - its
 // definition is not part of this section of the file. A partial
 // specialization for kernel::apriori_compiled_t is declared further below.
 template<typename Kernel, typename... KernelParameters>
 struct enqueue_launch_helper {
     void operator()(
         Kernel&&                kernel_function,
         const stream_t &        stream,
         launch_configuration_t  launch_configuration,
         KernelParameters &&...  parameters) const;
 };

 // The following three overloads (declarations only) differ just in their two
 // leading, unnamed, bool_constant tag parameters - <false, false>,
 // <true, false> and <false, true> - so that one of them can be chosen at
 // compile time through tag dispatch. The rest of the parameters are the same
 // for all three: the kernel, the stream, the launch configuration and the
 // parameters to pass to the kernel.
 //
 // NOTE(review): What each of the tags stands for is not visible in this
 // section of the file. Presumably the first tag tells whether the kernel is
 // a wrapper object (rather than a raw function), and the second - whether it
 // is a library kernel; confirm against the definitions of these functions.

 // Overload for tags <false, false>
 template<typename Kernel, typename... KernelParameters>
 void enqueue_launch(
     bool_constant<false>,
     bool_constant<false>,
     Kernel&&                kernel_function,
     const stream_t&         stream,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters);

 // Overload for tags <true, false>
 template<typename Kernel, typename... KernelParameters>
 void enqueue_launch(
     bool_constant<true>,
     bool_constant<false>,
     Kernel&&                kernel,
     const stream_t&         stream,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters);

 // Overload for tags <false, true>
 template<typename Kernel, typename... KernelParameters>
 void enqueue_launch(
     bool_constant<false>,
     bool_constant<true>,
     Kernel&&                kernel,
     const stream_t&         stream,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters);

 // Declaration only; the definition is not part of this section of the file.
 //
 // Takes raw handles - of a kernel, a context and a stream - along with a
 // device ID, a launch configuration, and the kernel arguments in marshalled
 // form, i.e. as an array of pointers to the individual arguments (this is
 // what enqueue_raw_kernel_launch_in_current_context() passes when it calls
 // this function, for launches with non-default attributes under
 // CUDA 11.0 or later).
 //
 // NOTE(review): going by the name, the context with the given handle is
 // expected to already be the current one - confirm against the definition.
 inline void enqueue_kernel_launch_by_handle_in_current_context(
     kernel::handle_t        kernel_function_handle,
     device::id_t            device_id,
     context::handle_t       context_handle,
     stream::handle_t        stream_handle,
     launch_configuration_t  launch_config,
     const void**            marshalled_arguments);

 /**
  * Enqueues a launch of a "raw" kernel - a plain function, or a pointer to
  * one - on a stream specified by its raw handle.
  *
  * If the launch configuration has no non-default attributes, the kernel is
  * launched using the triple-chevron syntax, after which any outstanding
  * error is checked for. Otherwise, what happens depends on the CUDA version:
  *
  *  - Before 9.0: A cuda::runtime_error with status::not_supported is thrown.
  *  - 9.0 up to (not including) 11.0: The addresses of the arguments are
  *    collected into an array and the kernel is launched with
  *    cudaLaunchCooperativeKernel(); @p device_id and @p context_handle
  *    are not used.
  *  - 11.0 and later: The addresses of the arguments are collected into an
  *    array, the kernel's handle is obtained, and the launch is delegated to
  *    enqueue_kernel_launch_by_handle_in_current_context().
  *
  * @note When this code is not compiled by NVCC (i.e. __CUDACC__ is not
  * defined), the function is only declared here, without a body.
  *
  * @param kernel_function the function, or pointer to function, to launch
  * @param device_id passed on with the by-handle launch (CUDA 11.0 and later)
  * @param context_handle passed on with the by-handle launch (CUDA 11.0 and later)
  * @param stream_handle raw handle of the stream on which to enqueue the launch
  * @param launch_configuration grid and block dimensions, amount of dynamic
  *     shared memory, and possibly additional launch attributes
  * @param parameters the arguments to pass to the kernel
  *
  * @throws an exception, via cuda::outstanding_error::ensure_none() or
  *     throw_if_error_lazy(), if the launch has failed; and a
  *     cuda::runtime_error if non-default launch attributes were requested
  *     with a CUDA version earlier than 9.0
  */
 template<typename KernelFunction, typename... KernelParameters>
 void enqueue_raw_kernel_launch_in_current_context(
     KernelFunction&&        kernel_function,
     device::id_t            device_id,
     context::handle_t       context_handle,
     stream::handle_t        stream_handle,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters)
 #ifndef __CUDACC__
 // If we're not in CUDA's NVCC, this can't run properly anyway, so either we throw some
 // compilation error, or we just do nothing. For now it's option 2.
 ;
 #else
 {
     using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
     static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
         "Only a bona fide function can be launched as a CUDA kernel");
     if (not launch_configuration.has_nondefault_attributes()) {
         // regular plain vanilla launch
         kernel_function <<<
             launch_configuration.dimensions.grid,
             launch_configuration.dimensions.block,
             launch_configuration.dynamic_shared_memory_size,
             stream_handle
         >>>(::std::forward<KernelParameters>(parameters)...);
         // The launch statement does not itself return a status, so we need
         // to check for an outstanding error
         cuda::outstanding_error::ensure_none("Kernel launch failed");
     }
     else {
 #if CUDA_VERSION < 9000
         // These two are only used with later CUDA versions
         (void) device_id;
         (void) context_handle;
         // Note: the two adjacent string literals get concatenated, so the
         // first one must end with a space
         throw cuda::runtime_error(status::not_supported,
             "Only CUDA versions 9.0 and later support launching kernels with additional "
             "arguments, e.g block cooperation");
 #else
         // The following hack is due to C++ not supporting arrays of length 0 -
         // but such an array being necessary for collect_argument_addresses with
         // multiple parameters. Other workarounds are possible, but would be
         // more cumbersome, except perhaps with C++17 or later.
         static constexpr const auto non_zero_num_params =
             sizeof...(KernelParameters) == 0 ? 1 : sizeof...(KernelParameters);
         void* argument_ptrs[non_zero_num_params];
         // fill the argument array with our parameters. Yes, the use
         // of the two terms is confusing here and depends on how you
         // look at things.
         detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
 #if CUDA_VERSION >= 11000
         kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function);
         enqueue_kernel_launch_by_handle_in_current_context(
             kernel_function_handle,
             device_id,
             context_handle,
             stream_handle,
             launch_configuration,
             const_cast<const void**>(argument_ptrs));

 #else // CUDA_VERSION is at least 9000 but under 11000
         (void) device_id;
         (void) context_handle;
         auto status = cudaLaunchCooperativeKernel(
             (const void *) kernel_function,
             (dim3)(uint3)launch_configuration.dimensions.grid,
             (dim3)(uint3)launch_configuration.dimensions.block,
             &argument_ptrs[0],
             (size_t)launch_configuration.dynamic_shared_memory_size,
             cudaStream_t(stream_handle));
         throw_if_error_lazy(status, "Kernel launch failed");
 #endif // CUDA_VERSION >= 11000
 #endif // CUDA_VERSION >= 9000
     }
 }
 #endif

 } // namespace detail_


 namespace kernel {

 namespace detail_ {

 // The helper code here is intended for re-imbuing kernel-related classes with the types
 // of the kernel parameters. This is necessary since kernel wrappers may be type-erased
 // (which makes it much easier to work with them and avoids a bunch of code duplication).
 //
 // Note: The type-unerased kernel must be a non-const function pointer. Why? Not sure:
 // Even though function pointers can't be written through, for some reason they are
 // expected not to be const.


 template<typename... KernelParameters>
 struct raw_kernel_typegen {
     // You should be careful to only instantiate this class with nice simple types we can pass to CUDA kernels.
 //  static_assert(
 //      all_true<
 //          ::std::is_same<
 //              KernelParameters,
 //              ::cuda::detail_::kernel_parameter_decay_t<KernelParameters>>::value...
 //          >::value,
 //      "All kernel parameter types must be decay-invariant" );
     using type = void(*)(cuda::detail_::kernel_parameter_decay_t<KernelParameters>...);
 };

 } // namespace detail_

 /// Retrieves the function pointer held by a wrapper of an apriori-compiled
 /// kernel, re-imbued with the kernel's parameter types (somewhat similar to
 /// what ::std::any_cast does).
 ///
 /// @tparam KernelParameters the types of the kernel's parameters; these are
 ///     not checked against the actual kernel in any way
 /// @param kernel the wrapper from which to take the type-erased pointer
 /// @return the wrapper's pointer, cast to the function pointer type
 ///     generated by detail_::raw_kernel_typegen for @p KernelParameters
 template<typename... KernelParameters>
 typename detail_::raw_kernel_typegen<KernelParameters...>::type
 unwrap(const kernel::apriori_compiled_t& kernel)
 {
     using typed_kernel_ptr_t = typename detail_::raw_kernel_typegen<KernelParameters...>::type;
     // The wrapper gives us a pointer-to-const, while we need a non-const one
     auto type_erased_kernel_ptr = const_cast<void *>(kernel.ptr());
     return reinterpret_cast<typed_kernel_ptr_t>(type_erased_kernel_ptr);
 }

 } // namespace kernel

 namespace detail_ {

 // Partial specialization of enqueue_launch_helper for kernels wrapped in a
 // kernel::apriori_compiled_t. Unlike with the primary template, the kernel
 // is taken here by const lvalue reference rather than by forwarding
 // reference. Only the declaration of operator() appears here - its
 // definition is not part of this section of the file.
 template<typename... KernelParameters>
 struct enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...> {
     void operator()(
         const kernel::apriori_compiled_t&  wrapped_kernel,
         const stream_t &                  stream,
         launch_configuration_t            launch_configuration,
         KernelParameters &&...            parameters) const;
 };

 } // namespace detail_


 /// Enqueues a kernel on a stream (=queue) on the current CUDA device.
 ///
 /// Declaration only; the definition is not part of this section of the file.
 ///
 /// @param kernel the kernel to launch
 /// @param stream the stream on which to enqueue the launch
 /// @param launch_configuration the configuration parameters for the launch
 /// @param parameters the arguments with which to launch the kernel
 template<typename Kernel, typename... KernelParameters>
 void enqueue_launch(
     Kernel&&                kernel,
     const stream_t&         stream,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters);

 /// Variant of enqueue_launch for use with the default stream in the
 /// current context - which is why it takes no stream parameter.
 ///
 /// Declaration only; the definition is not part of this section of the file.
 ///
 /// @param kernel the kernel to launch
 /// @param launch_configuration the configuration parameters for the launch
 /// @param parameters the arguments with which to launch the kernel
 template<typename Kernel, typename... KernelParameters>
 void launch(
     Kernel&&                kernel,
     launch_configuration_t  launch_configuration,
     KernelParameters&&...   parameters);

 /// Launches a kernel with its arguments already marshalled, rather than
 /// passed as individually-typed function parameters.
 ///
 /// Declaration only; the definition is not part of this section of the file.
 ///
 /// @param kernel a (type-erased) wrapper of the kernel to launch
 /// @param stream the stream on which to enqueue the launch
 /// @param launch_configuration the configuration parameters for the launch
 /// @param marshalled_arguments the marshalled kernel arguments.
 ///     NOTE(review): presumably a span-like container of `const void*`
 ///     elements, each pointing to one argument - confirm the exact
 ///     requirements against the definition.
 template <typename SpanOfConstVoidPtrLike>
 void launch_type_erased(
     const kernel_t&         kernel,
     const stream_t&         stream,
     launch_configuration_t  launch_configuration,
     SpanOfConstVoidPtrLike  marshalled_arguments);

 #if CUDA_VERSION >= 12000
 /// Overload of launch_type_erased for a library::kernel_t; it is only
 /// declared when CUDA_VERSION is at least 12000.
 ///
 /// Declaration only; the definition is not part of this section of the file.
 ///
 /// @param kernel the library kernel to launch
 /// @param stream the stream on which to enqueue the launch
 /// @param launch_configuration the configuration parameters for the launch
 /// @param marshalled_arguments the marshalled kernel arguments.
 ///     NOTE(review): presumably a span-like container of `const void*`
 ///     elements, each pointing to one argument - confirm the exact
 ///     requirements against the definition.
 template <typename SpanOfConstVoidPtrLike>
 void launch_type_erased(
     const library::kernel_t&  kernel,
     const stream_t&           stream,
     launch_configuration_t    launch_configuration,
     SpanOfConstVoidPtrLike    marshalled_arguments);
 #endif // CUDA_VERSION >= 12000

 } // namespace cuda

 #endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
cuda::stream_t
Proxy class for a CUDA stream.
Definition: stream.hpp:246

cuda::launch_configuration_t::has_nondefault_attributes
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156

cuda
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22

cuda::launch_configuration_t
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69

cuda::context::handle_t
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878

cuda::single_block
constexpr grid::dimensions_t single_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid with a sing...
Definition: kernel_launch.hpp:64

cuda::launch_configuration_t::dimensions
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71

cuda::device::id_t
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850

cuda::outstanding_error::ensure_none
void ensure_none(const ::std::string &message) noexcept(false)
Does nothing (unless throwing an exception)
Definition: error.hpp:438

cuda::launch
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394

cuda::grid::dimensions_t
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:325

cuda::enqueue_launch
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25

cuda::single_thread_per_block
constexpr grid::block_dimensions_t single_thread_per_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid whose block...
Definition: kernel_launch.hpp:70

cuda::runtime_error
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271

cuda::kernel::apriori_compiled_t
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310

throw_if_error_lazy
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316

cuda::launch_type_erased
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:411

cuda::launch_configuration_t::dynamic_shared_memory_size
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77

launch_configuration.hpp
Contains the class cuda::launch_configuration_t and some supporting code.

cuda::stream::handle_t
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239

kernel.hpp
Contains a base wrapper class for CUDA kernels - both statically and dynamically compiled; and some r...

cuda::kernel::apriori_compiled_t::ptr
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319

cuda::kernel_t
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159

cuda::kernel::unwrap
detail_::raw_kernel_typegen< KernelParameters... >::type unwrap(const kernel::apriori_compiled_t &kernel)
A function similar to ::std::any_cast for retrieving the function pointer wrapped by a cuda::kernel::...
Definition: kernel_launch.hpp:269

in_library.hpp
The cuda::library::kernel_t class and related code.

apriori_compiled.hpp
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...