cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel_launch.hpp
Go to the documentation of this file.
1 
34 #pragma once
35 #ifndef CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
36 #define CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
37 
38 #include "launch_configuration.hpp"
39 #include "kernel.hpp"
41 #if CUDA_VERSION >= 12000
42 #include "kernels/in_library.hpp"
43 #endif
44 
45 
46 #if CUDA_VERSION >= 9000
47 // The following is necessary for cudaLaunchCooperativeKernel
48 #include <cuda_runtime.h>
49 #endif // CUDA_VERSION >= 9000
50 
51 #include <type_traits>
52 #include <utility>
53 
54 namespace cuda {
55 
75 template <typename T>
76 struct is_valid_kernel_argument : ::std::is_trivially_copyable<T> { };
77 
81 template <typename T>
83 
85 class stream_t;
87 
92 constexpr grid::dimensions_t single_block() { return 1; }
99 
100 namespace detail_ {
101 
110 template<typename P>
111 struct kernel_parameter_decay {
112 private:
113  using U = typename ::std::remove_reference<P>::type;
114 public:
115  using type = typename ::std::conditional<
116  ::std::is_array<U>::value,
117  typename ::std::remove_extent<U>::type*,
118  typename ::std::conditional<
119  ::std::is_function<U>::value,
120  typename ::std::add_pointer<U>::type,
121  U
122  >::type
123  >::type;
124 };
125 
126 template<typename P>
127 using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;
128 
129 template<typename Fun>
130 struct is_function_ptr: bool_constant<
131  ::std::is_pointer<Fun>::value and ::std::is_function<typename ::std::remove_pointer<Fun>::type>::value> { };
132 
133 inline void collect_argument_addresses(void**) { }
134 
135 template <typename Arg, typename... Args>
136 inline void collect_argument_addresses(void** collected_addresses, Arg&& arg, Args&&... args)
137 {
138  collected_addresses[0] = const_cast<void*>(static_cast<const void*>(&arg));
139  collect_argument_addresses(collected_addresses + 1, ::std::forward<Args>(args)...);
140 }
141 
142 template<typename Kernel, typename... KernelParameters>
143 struct enqueue_launch_helper {
144  void operator()(
145  Kernel&& kernel_function,
146  const stream_t & stream,
147  launch_configuration_t launch_configuration,
148  KernelParameters &&... parameters) const;
149 };
150 
151 template<typename Kernel, typename... KernelParameters>
152 void enqueue_launch(
153  bool_constant<false>,
154  bool_constant<false>,
155  Kernel&& kernel_function,
156  const stream_t& stream,
157  launch_configuration_t launch_configuration,
158  KernelParameters&&... parameters);
159 
160 template<typename Kernel, typename... KernelParameters>
161 void enqueue_launch(
162  bool_constant<true>,
163  bool_constant<false>,
164  Kernel&& kernel,
165  const stream_t& stream,
166  launch_configuration_t launch_configuration,
167  KernelParameters&&... parameters);
168 
169 template<typename Kernel, typename... KernelParameters>
170 void enqueue_launch(
171  bool_constant<false>,
172  bool_constant<true>,
173  Kernel&& kernel,
174  const stream_t& stream,
175  launch_configuration_t launch_configuration,
176  KernelParameters&&... parameters);
177 
178 inline void enqueue_kernel_launch_by_handle_in_current_context(
179  kernel::handle_t kernel_function_handle,
180  device::id_t device_id,
181  context::handle_t context_handle,
182  stream::handle_t stream_handle,
183  launch_configuration_t launch_config,
184  const void** marshalled_arguments);
185 
186 template<typename KernelFunction, typename... KernelParameters>
187 void enqueue_raw_kernel_launch_in_current_context(
188  KernelFunction&& kernel_function,
189  device::id_t device_id,
190  context::handle_t context_handle,
191  stream::handle_t stream_handle,
192  launch_configuration_t launch_configuration,
193  KernelParameters&&... parameters)
194 #ifndef __CUDACC__
195 // If we're not in CUDA's NVCC, this can't run properly anyway, so either we throw some
196 // compilation error, or we just do nothing. For now it's option 2.
197 ;
198 #else
199 {
200  using decayed_kf_type = typename ::std::decay<KernelFunction>::type;
201  static_assert(::std::is_function<decayed_kf_type>::value or is_function_ptr<decayed_kf_type>::value,
202  "Only a bona fide function can be launched as a CUDA kernel");
203  if (not launch_configuration.has_nondefault_attributes()) {
204  // regular plain vanilla launch
205  kernel_function <<<
206  launch_configuration.dimensions.grid,
207  launch_configuration.dimensions.block,
208  launch_configuration.dynamic_shared_memory_size,
209  stream_handle
210  >>>(::std::forward<KernelParameters>(parameters)...);
211  cuda::outstanding_error::ensure_none("Kernel launch failed");
212  }
213  else {
214 #if CUDA_VERSION < 9000
215  throw cuda::runtime_error(status::not_supported,
216  "Only CUDA versions 9.0 and later support launching kernels with additional"
217  "arguments, e.g block cooperation");
218 #else
219  // The following hack is due to C++ not supporting arrays of length 0 -
220  // but such an array being necessary for collect_argument_addresses with
221  // multiple parameters. Other workarounds are possible, but would be
222  // more cumbersome, except perhaps with C++17 or later.
223  static constexpr const auto non_zero_num_params =
224  sizeof...(KernelParameters) == 0 ? 1 : sizeof...(KernelParameters);
225  void* argument_ptrs[non_zero_num_params];
226  // fill the argument array with our parameters. Yes, the use
227  // of the two terms is confusing here and depends on how you
228  // look at things.
229  detail_::collect_argument_addresses(argument_ptrs, ::std::forward<KernelParameters>(parameters)...);
230 #if CUDA_VERSION >= 11000
231  kernel::handle_t kernel_function_handle = kernel::apriori_compiled::detail_::get_handle( (const void*) kernel_function);
232  enqueue_kernel_launch_by_handle_in_current_context(
233  kernel_function_handle,
234  device_id,
235  context_handle,
236  stream_handle,
237  launch_configuration,
238  const_cast<const void**>(argument_ptrs));
239 
240 #else // CUDA_VERSION is at least 9000 but under 11000
241  (void) device_id;
242  (void) context_handle;
243  auto status = cudaLaunchCooperativeKernel(
244  (const void *) kernel_function,
245  (dim3)(uint3)launch_configuration.dimensions.grid,
246  (dim3)(uint3)launch_configuration.dimensions.block,
247  &argument_ptrs[0],
248  (size_t)launch_configuration.dynamic_shared_memory_size,
249  cudaStream_t(stream_handle));
250  throw_if_error_lazy(status, "Kernel launch failed");
251 #endif // CUDA_VERSION >= 11000
252 #endif // CUDA_VERSION >= 9000
253  }
254 }
255 #endif
256 
257 } // namespace detail_
258 
259 
260 namespace kernel {
261 
262 namespace detail_ {
263 
264 // The helper code here is intended for re-imbuing kernel-related classes with the types
265 // of the kernel parameters. This is necessary since kernel wrappers may be type-erased
266 // (which makes it much easier to work with them and avoids a bunch of code duplication).
267 //
268 // Note: The type-unerased kernel must be a non-const function pointer. Why? Not sure.
269 // even though function pointers can't get written through, for some reason they are
270 // expected not to be const.
271 
272 
273 template<typename... KernelParameters>
274 struct raw_kernel_typegen {
275  // You should be careful to only instantiate this class with nice simple types we can pass to CUDA kernels.
276 // static_assert(
277 // all_true<
278 // ::std::is_same<
279 // KernelParameters,
280 // ::cuda::detail_::kernel_parameter_decay_t<KernelParameters>>::value...
281 // >::value,
282 // "All kernel parameter types must be decay-invariant" );
283  using type = void(*)(cuda::detail_::kernel_parameter_decay_t<KernelParameters>...);
284 };
285 
286 } // namespace detail_
287 
294 template<typename... KernelParameters>
295 typename detail_::raw_kernel_typegen<KernelParameters...>::type
297 {
298  using raw_kernel_t = typename detail_::raw_kernel_typegen<KernelParameters ...>::type;
299  return reinterpret_cast<raw_kernel_t>(const_cast<void *>(kernel.ptr()));
300 }
301 
302 } // namespace kernel
303 
304 namespace detail_ {
305 
306 template<typename... KernelParameters>
307 struct enqueue_launch_helper<kernel::apriori_compiled_t, KernelParameters...> {
308  void operator()(
309  const kernel::apriori_compiled_t& wrapped_kernel,
310  const stream_t & stream,
311  launch_configuration_t launch_configuration,
312  KernelParameters &&... parameters) const;
313 };
314 
315 } // namespace detail_
316 
317 
346 template<typename Kernel, typename... KernelParameters>
347 void enqueue_launch(
348  Kernel&& kernel,
349  const stream_t& stream,
350  launch_configuration_t launch_configuration,
351  KernelParameters&&... parameters);
352 
358 template<typename Kernel, typename... KernelParameters>
359 void launch(
360  Kernel&& kernel,
361  launch_configuration_t launch_configuration,
362  KernelParameters&&... parameters);
363 
384 template <typename SpanOfConstVoidPtrLike>
386 void launch_type_erased(
387  const kernel_t& kernel,
388  const stream_t& stream,
389  launch_configuration_t launch_configuration,
390  SpanOfConstVoidPtrLike marshalled_arguments);
391 
392 #if CUDA_VERSION >= 12000
393 template <typename SpanOfConstVoidPtrLike>
394 void launch_type_erased(
395  const library::kernel_t& kernel,
396  const stream_t& stream,
397  launch_configuration_t launch_configuration,
398  SpanOfConstVoidPtrLike marshalled_arguments);
400 #endif // CUDA_VERSION >= 12000
401 
402 } // namespace cuda
403 
404 #endif // CUDA_API_WRAPPERS_KERNEL_LAUNCH_CUH_
Proxy class for a CUDA stream.
Definition: stream.hpp:258
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different than the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
A trait characterizing those types which can be used as kernel parameters.
Definition: kernel_launch.hpp:76
constexpr grid::dimensions_t single_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid with a sing...
Definition: kernel_launch.hpp:92
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
void ensure_none(const ::std::string &message) noexcept(false)
Does nothing (unless throwing an exception)
Definition: error.hpp:449
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:396
A richer (kind-of-a-)wrapper for CUDA&#39;s dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
constexpr grid::block_dimensions_t single_thread_per_block()
A named constructor idiom for grid::dimensions_t, which, when used, will result in a grid whose block...
Definition: kernel_launch.hpp:98
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
typename is_valid_kernel_argument< T >::type is_valid_kernel_argument_t
A convenience type using the is_valid_kernel_argument trait struct.
Definition: kernel_launch.hpp:82
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver&#39;s launch...
Definition: kernel_launch.hpp:413
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77
Contains the class cuda::launch_configuration_t and some supporting code.
CUstream handle_t
The CUDA driver&#39;s raw handle for streams.
Definition: types.hpp:236
Contains a base wrapper class for CUDA kernels - both statically and dynamically compiled; and some r...
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
detail_::raw_kernel_typegen< KernelParameters... >::type unwrap(const kernel::apriori_compiled_t &kernel)
A function similar to ::std::any_cast for retrieving the function pointer wrapped by a cuda::kernel::...
Definition: kernel_launch.hpp:296
The cuda::library::kernel_t class and related code.
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...