cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel_launch.hpp
Go to the documentation of this file.
1 
34 #pragma once
35 #ifndef CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
36 #define CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
37 
38 #include "launch_configuration.hpp"
39 #include "kernel.hpp"
41 #if CUDA_VERSION >= 12000
42 #include "kernels/in_library.hpp"
43 #endif
44 
45 
46 #if CUDA_VERSION >= 9000
47 // The following is necessary for cudaLaunchCooperativeKernel
48 #include <cuda_runtime.h>
49 #endif // CUDA_VERSION >= 9000
50 
51 #include <type_traits>
52 #include <utility>
53 
54 namespace cuda {
55 
57 class stream_t;
59 
64 constexpr grid::dimensions_t single_block() { return 1; }
71 
72 namespace detail_ {
73 
82 template<typename P>
83 struct kernel_parameter_decay {
84 private:
85  using U = typename ::std::remove_reference<P>::type;
86 public:
87  using type = typename ::std::conditional<
88  ::std::is_array<U>::value,
89  typename ::std::remove_extent<U>::type*,
90  typename ::std::conditional<
91  ::std::is_function<U>::value,
92  typename ::std::add_pointer<U>::type,
93  U
94  >::type
95  >::type;
96 };
97 
98 template<typename P>
99 using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;
100 
101 template<typename Fun>
102 struct is_function_ptr: bool_constant<
103  ::std::is_pointer<Fun>::value and ::std::is_function<typename ::std::remove_pointer<Fun>::type>::value> { };
104 
105 inline void collect_argument_addresses(void**) { }
106 
107 template <typename Arg, typename... Args>
108 inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Args&&... args)
109 {
110  collected_addresses[0] = const_cast<void*>(static_cast<const void*>(&arg));
111  collect_argument_addresses(collected_addresses + 1, ::std::forward<Args>(args)...);
112 }
113 
114 // For partial template specialization on WrappedKernel...
115 template<typename Kernel, typename... KernelParameters>
116 struct enqueue_launch_helper {
117  void operator()(
118  Kernel&& kernel_function,
119  const stream_t & stream,
120  launch_configuration_t launch_configuration,
121  KernelParameters &&... parameters) const;
122 };
123 
124 template<typename Kernel, typename... KernelParameters>
125 void enqueue_launch(
126  bool_constant<false>,
127  bool_constant<false>,
128  Kernel&& kernel_function,
129  const stream_t& stream,
130  launch_configuration_t launch_configuration,
131  KernelParameters&&... parameters);
132 
133 template<typename Kernel, typename... KernelParameters>
134 void enqueue_launch(
135  bool_constant<true>,
136  bool_constant<false>,
137  Kernel&& kernel,
138  const stream_t& stream,
139  launch_configuration_t launch_configuration,
140  KernelParameters&&... parameters);
141 
142 template<typename Kernel, typename... KernelParameters>
143 void enqueue_launch(
144  bool_constant<false>,
145  bool_constant<true>,
146  Kernel&& kernel,
147  const stream_t& stream,
148  launch_configuration_t launch_configuration,
149  KernelParameters&&... parameters);
150 
151 inline void enqueue_kernel_launch_by_handle_in_current_context(
152  kernel::handle_t kernel_function_handle,
153  device::id_t device_id,
154  context::handle_t context_handle,
155  stream::handle_t stream_handle,
156  launch_configuration_t launch_config,
157  const void** marshalled_arguments);
158 
159 template<typename KernelFunction, typename... KernelParameters>
160 void enqueue_raw_kernel_launch_in_current_context(
161  KernelFunction&& kernel_function,
162  device::id_t device_id,
163  context::handle_t context_handle,
164  stream::handle_t stream_handle,
165  launch_configuration_t launch_configuration,
166  KernelParameters&&... parameters)
167 #ifndef __CUDACC__
168 // If we're not in CUDA's NVCC, this can't run properly anyway, so either we throw some
169 // compilation error, or we just do nothing. For now it's option 2.
170 ;
171 #else
172 {
173  using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
174  static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
175  "Only a bona fide function can be launched as a CUDA kernel");
176  if (not launch_configuration.has_nondefault_attributes()) {
177  // regular plain vanilla launch
178  kernel_function <<<
179  launch_configuration.dimensions.grid,
180  launch_configuration.dimensions.block,
181  launch_configuration.dynamic_shared_memory_size,
182  stream_handle
183  >>>(::std::forward<KernelParameters>(parameters)...);
184  cuda::outstanding_error::ensure_none("Kernel launch failed");
185  }
186  else {
187 #if CUDA_VERSION < 9000
188  throw cuda::runtime_error(status::not_supported,
189  "Only CUDA versions 9.0 and later support launching kernels with additional"
190  "arguments, e.g block cooperation");
191 #else
192  // The following hack is due to C++ not supporting arrays of length 0 -
193  // but such an array being necessary for collect_argument_addresses with
194  // multiple parameters. Other workarounds are possible, but would be
195  // more cumbersome, except perhaps with C++17 or later.
196  static constexpr const auto non_zero_num_params =
197  sizeof...(KernelParameters) == 0 ? 1 : sizeof...(KernelParameters);
198  void* argument_ptrs[non_zero_num_params];
199  // fill the argument array with our parameters. Yes, the use
200  // of the two terms is confusing here and depends on how you
201  // look at things.
202  detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
203 #if CUDA_VERSION >= 11000
204  kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function);
205  enqueue_kernel_launch_by_handle_in_current_context(
206  kernel_function_handle,
207  device_id,
208  context_handle,
209  stream_handle,
210  launch_configuration,
211  const_cast<const void**>(argument_ptrs));
212 
213 #else // CUDA_VERSION is at least 9000 but under 11000
214  (void) device_id;
215  (void) context_handle;
216  auto status = cudaLaunchCooperativeKernel(
217  (const void *) kernel_function,
218  (dim3)(uint3)launch_configuration.dimensions.grid,
219  (dim3)(uint3)launch_configuration.dimensions.block,
220  &argument_ptrs[0],
221  (size_t)launch_configuration.dynamic_shared_memory_size,
222  cudaStream_t(stream_handle));
223  throw_if_error_lazy(status, "Kernel launch failed");
224 #endif // CUDA_VERSION >= 11000
225 #endif // CUDA_VERSION >= 9000
226  }
227 }
228 #endif
229 
230 } // namespace detail_
231 
232 
233 namespace kernel {
234 
235 namespace detail_ {
236 
237 // The helper code here is intended for re-imbuing kernel-related classes with the types
238 // of the kernel parameters. This is necessary since kernel wrappers may be type-erased
239 // (which makes it much easier to work with them and avoids a bunch of code duplication).
240 //
241 // Note: The type-unerased kernel must be a non-const function pointer. Why? Not sure.
242 // even though function pointers can't get written through, for some reason they are
243 // expected not to be const.
244 
245 
246 template<typename... KernelParameters>
247 struct raw_kernel_typegen {
248  // You should be careful to only instantiate this class with nice simple types we can pass to CUDA kernels.
249 // static_assert(
250 // all_true<
251 // ::std::is_same<
252 // KernelParameters,
253 // ::cuda::detail_::kernel_parameter_decay_t<KernelParameters>>::value...
254 // >::value,
255 // "All kernel parameter types must be decay-invariant" );
256  using type = void(*)(cuda::detail_::kernel_parameter_decay_t<KernelParameters>...);
257 };
258 
259 } // namespace detail_
260 
267 template<typename... KernelParameters>
268 typename detail_::raw_kernel_typegen<KernelParameters...>::type
270 {
271  using raw_kernel_t = typename detail_::raw_kernel_typegen<KernelParameters ...>::type;
272  return reinterpret_cast<raw_kernel_t>(const_cast<void *>(kernel.ptr()));
273 }
274 
275 } // namespace kernel
276 
277 namespace detail_ {
278 
279 template<typename... KernelParameters>
280 struct enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...> {
281  void operator()(
282  const kernel::apriori_compiled_t& wrapped_kernel,
283  const stream_t & stream,
284  launch_configuration_t launch_configuration,
285  KernelParameters &&... parameters) const;
286 };
287 
288 } // namespace detail_
289 
290 
319 template<typename Kernel, typename... KernelParameters>
320 void enqueue_launch(
321  Kernel&& kernel,
322  const stream_t& stream,
323  launch_configuration_t launch_configuration,
324  KernelParameters&&... parameters);
325 
331 template<typename Kernel, typename... KernelParameters>
332 void launch(
333  Kernel&& kernel,
334  launch_configuration_t launch_configuration,
335  KernelParameters&&... parameters);
336 
357 template <typename SpanOfConstVoidPtrLike>
359 void launch_type_erased(
360  const kernel_t& kernel,
361  const stream_t& stream,
362  launch_configuration_t launch_configuration,
363  SpanOfConstVoidPtrLike marshalled_arguments);
364 
365 #if CUDA_VERSION >= 12000
366 template <typename SpanOfConstVoidPtrLike>
367 void launch_type_erased(
368  const library::kernel_t& kernel,
369  const stream_t& stream,
370  launch_configuration_t launch_configuration,
371  SpanOfConstVoidPtrLike marshalled_arguments);
373 #endif // CUDA_VERSION >= 12000
374 
375 } // namespace cuda
376 
377 #endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
Proxy class for a CUDA stream.
Definition: stream.hpp:246
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
constexpr grid::dimensions_t single_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid with a sing...
Definition: kernel_launch.hpp:64
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
void ensure_none(const ::std::string &message) noexcept(false)
Does nothing (unless throwing an exception)
Definition: error.hpp:438
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394
A richer (kind-of-a-)wrapper for CUDA&#39;s dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:325
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
constexpr grid::block_dimensions_t single_thread_per_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid whose block...
Definition: kernel_launch.hpp:70
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver&#39;s launch...
Definition: kernel_launch.hpp:411
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77
Contains the class cuda::launch_configuration_t and some supporting code.
CUstream handle_t
The CUDA driver&#39;s raw handle for streams.
Definition: types.hpp:239
Contains a base wrapper class for CUDA kernels - both statically and dynamically compiled; and some r...
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
detail_::raw_kernel_typegen< KernelParameters... >::type unwrap(const kernel::apriori_compiled_t &kernel)
A function similar to ::std::any_cast for retrieving the function pointer wrapped by a cuda::kernel::...
Definition: kernel_launch.hpp:269
The cuda::library::kernel_t class and related code.
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...