cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
launch_configuration.hpp
#pragma once
#ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_
#define CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_

#include "constants.hpp"
#include "types.hpp"

#include <type_traits>
#include <utility>
namespace cuda {

class device_t;
class event_t;
class kernel_t;

namespace detail_ {
inline void validate_block_dimensions(grid::block_dimensions_t block_dims)
{
	if (block_dims.volume() == 0) {
		throw ::std::invalid_argument("Zero-volume block dimensions provided");
	}
}

inline void validate_grid_dimensions(grid::dimensions_t grid_dims)
{
	if (grid_dims.volume() == 0) {
		throw ::std::invalid_argument("Zero-volume grid-of-blocks dimensions provided");
	}
}
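
// A minimal sketch of the checks above (illustrative, not part of the original
// header; `degenerate` is a hypothetical variable name):
//
//   grid::block_dimensions_t degenerate { 0, 1, 1 };
//   assert(degenerate.volume() == 0);
//   validate_block_dimensions(degenerate); // throws ::std::invalid_argument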

// Note: The reason for the verbose name is the identity of the block and grid dimension types
void validate_block_dimension_compatibility(const device_t &device, grid::block_dimensions_t block_dims);
void validate_block_dimension_compatibility(const kernel_t &kernel, grid::block_dimensions_t block_dims);

void validate_compatibility(const kernel_t &kernel, memory::shared::size_t shared_mem_size);
void validate_compatibility(const device_t &device, memory::shared::size_t shared_mem_size);

} // namespace detail_

#if CUDA_VERSION >= 12000
/// Possible policies for scheduling the blocks of a launched kernel's
/// block clusters onto the GPU's multiprocessors
enum class cluster_scheduling_policy_t {
	default_     = CU_CLUSTER_SCHEDULING_POLICY_DEFAULT,
	spread       = CU_CLUSTER_SCHEDULING_POLICY_SPREAD,
	load_balance = CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING
};
#endif

/**
 * The full set of possible configuration parameters for launching a kernel on a GPU
 */
struct launch_configuration_t {
	/// Dimensions of the launch grid in blocks, and of the individual blocks in the grid
	grid::composite_dimensions_t dimensions { grid::dimensions_t{ 0u, 0u, 0u }, grid::block_dimensions_t{ 0u, 0u, 0u } };

	/// The number of bytes each grid block may use, in addition to the
	/// statically-allocated shared memory data
	memory::shared::size_t dynamic_shared_memory_size { 0u };

	/// When true, CUDA's "cooperative launch" mechanism will be used, enabling
	/// more flexible device-wide synchronization
	bool block_cooperation { false };

#if CUDA_VERSION >= 12000
	/// When true, CUDA's "programmatic dependent launch" mechanism may allow
	/// a dependent kernel, launched subsequently on the same stream, to begin
	/// executing before this kernel's grid has fully completed
	bool programmatically_dependent_launch { false };

	/// Settings for triggering an event at the point at which all of the
	/// launched grid's blocks have (at least) started executing
	struct programmatic_completion_t {
		event_t* event { nullptr };
		// unsigned flags; WHAT ABOUT THE FLAGS?
		bool trigger_event_at_block_start { true };
#if __cplusplus >= 202002L
		constexpr bool operator==(const programmatic_completion_t&) const noexcept = default;
#endif
	} programmatic_completion;

	/// Whether the kernel is to be launched in a "remote" memory
	/// synchronization domain, rather than the default one
	bool in_remote_memory_synchronization_domain { false };

	/// Settings for grouping the launched grid's blocks into clusters
	/// (supported on devices of compute capability 9.0 and later)
	struct clustering_t {
		grid::dimensions_t cluster_dimensions { 1, 1, 1 };
		cluster_scheduling_policy_t scheduling_policy { cluster_scheduling_policy_t::default_ };
#if __cplusplus >= 202002L
		constexpr bool operator==(const clustering_t &) const noexcept = default;
#endif
	} clustering;
#endif // CUDA_VERSION >= 12000

#if __cplusplus >= 202002L
	constexpr bool operator==(const launch_configuration_t&) const noexcept = default;
#endif

public: // non-mutators

	/// Determine whether the configuration includes launch attributes
	/// different than the default values
	bool has_nondefault_attributes() const
	{
		if (block_cooperation) { return true; }
#if CUDA_VERSION >= 12000
		return programmatically_dependent_launch or programmatic_completion.event
			or in_remote_memory_synchronization_domain
			or clustering.cluster_dimensions != grid::dimensions_t::point();
#else
		return false;
#endif
	}

	// In C++11 (unlike C++14 and later), an inline initializer for a struct's
	// field makes the struct a non-aggregate, so we must spell out the
	// constructors ourselves; and we must initialize the shared memory size
	// to 0, as otherwise, people might be tempted to initialize a launch
	// configuration with { num_blocks, num_threads } - and get an
	// uninitialized shared memory size which they did not expect. So, we keep
	// the inline initializers above regardless of the language standard
	// version, and just "pay the price" of spelling things out:
	launch_configuration_t() = delete;
	constexpr launch_configuration_t(const launch_configuration_t&) = default;
	constexpr launch_configuration_t(launch_configuration_t&&) = default;

	/// Construct a configuration from the grid-and-block dimensions and,
	/// optionally, the amount of dynamic shared memory per block
	constexpr launch_configuration_t(
		grid::composite_dimensions_t grid_and_block_dimensions,
		memory::shared::size_t dynamic_shared_mem = 0u
	) :
		dimensions{grid_and_block_dimensions},
		dynamic_shared_memory_size{dynamic_shared_mem}
	{ }

	constexpr launch_configuration_t(
		grid::dimensions_t grid_dims,
		grid::block_dimensions_t block_dims,
		memory::shared::size_t dynamic_shared_mem = 0u
	) : launch_configuration_t( {grid_dims, block_dims}, dynamic_shared_mem) { }

	// A "convenience" delegating ctor to avoid narrowing-conversion warnings
	constexpr launch_configuration_t(
		int grid_dims,
		int block_dims,
		memory::shared::size_t dynamic_shared_mem = 0u
	) : launch_configuration_t(
		grid::dimensions_t(grid_dims),
		grid::block_dimensions_t(block_dims),
		dynamic_shared_mem)
	{ }

	CPP14_CONSTEXPR launch_configuration_t& operator=(const launch_configuration_t& other) = default;
	CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) = default;
};
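
// A usage sketch (illustrative, not part of the original header): the
// constructors above admit all of the following spellings; with only
// dimensions and shared memory set, has_nondefault_attributes() is false.
//
//   cuda::launch_configuration_t config_1 { 128, 256 }; // 128 blocks of 256 threads each
//   cuda::launch_configuration_t config_2 {
//       cuda::grid::dimensions_t{ 64, 2, 1 },       // grid dimensions, in blocks
//       cuda::grid::block_dimensions_t{ 32, 8, 1 }, // block dimensions, in threads
//       1024u };                                    // 1 KiB of dynamic shared memory per block
//   assert(not config_2.has_nondefault_attributes());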

#if __cplusplus < 202002L
constexpr bool operator==(const launch_configuration_t& lhs, const launch_configuration_t& rhs) noexcept
{
	return
		lhs.dimensions == rhs.dimensions
		and lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size
		and lhs.block_cooperation == rhs.block_cooperation
#if CUDA_VERSION >= 12000
		and lhs.programmatically_dependent_launch == rhs.programmatically_dependent_launch
		and lhs.programmatic_completion.event == rhs.programmatic_completion.event
		and lhs.programmatic_completion.trigger_event_at_block_start == rhs.programmatic_completion.trigger_event_at_block_start
		and lhs.in_remote_memory_synchronization_domain == rhs.in_remote_memory_synchronization_domain
		and lhs.clustering.cluster_dimensions == rhs.clustering.cluster_dimensions
		and lhs.clustering.scheduling_policy == rhs.clustering.scheduling_policy
#endif // CUDA_VERSION >= 12000
		;
}

constexpr bool operator!=(const launch_configuration_t& lhs, const launch_configuration_t& rhs) noexcept
{
	return not (lhs == rhs);
}
#endif
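
// A sketch of opting into the CUDA 12 launch attributes (illustrative;
// block clustering requires a device of compute capability 9.0 or later):
//
//   cuda::launch_configuration_t config { 128, 256 };
//   config.clustering.cluster_dimensions = { 2, 1, 1 }; // pair up adjacent blocks
//   config.clustering.scheduling_policy = cuda::cluster_scheduling_policy_t::spread;
//   assert(config.has_nondefault_attributes());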

namespace detail_ {

// Note: This will not check anything related to the device or the kernel
// with which the launch configuration is to be used
inline void validate(const launch_configuration_t& launch_config) noexcept(false)
{
	validate_block_dimensions(launch_config.dimensions.block);
	validate_grid_dimensions(launch_config.dimensions.grid);
}

inline void validate_compatibility(
	const device_t& device,
	launch_configuration_t launch_config) noexcept(false)
{
	validate(launch_config);
	validate_block_dimension_compatibility(device, launch_config.dimensions.block);
	// Uncomment if we actually get such checks
	// validate_grid_dimension_compatibility(device, launch_config.dimensions.grid);
}

void validate_compatibility(
	const kernel_t& kernel,
	launch_configuration_t launch_config) noexcept(false);

using launch_attribute_index_t = unsigned int;

// Ensure this matches the number of attribute insertions in marshal()
constexpr launch_attribute_index_t maximum_possible_kernel_launch_attributes = 7;

#if CUDA_VERSION >= 12000
// Note: The attribute_storage must have a capacity of at least
// maximum_possible_kernel_launch_attributes + 1
CUlaunchConfig marshal(
	const launch_configuration_t& config,
	const stream::handle_t stream_handle,
	span<CUlaunchAttribute> attribute_storage) noexcept(true);
#endif // CUDA_VERSION >= 12000
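
// A sketch of the intended call pattern for marshal() (illustrative only;
// `config` and `stream_handle` are assumed to exist in scope):
//
//   ::std::array<CUlaunchAttribute, maximum_possible_kernel_launch_attributes + 1> attributes;
//   CUlaunchConfig raw_config = marshal(config, stream_handle,
//       span<CUlaunchAttribute>{ attributes.data(), attributes.size() });
//   // raw_config is then suitable for passing to the driver's cuLaunchKernelEx()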

} // namespace detail_

} // namespace cuda

#endif // CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_
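
// A minimal end-to-end sketch (illustrative; `scale_by` and `buffer` are
// hypothetical, and the stream API calls follow the library's own examples):
//
//   __global__ void scale_by(float* data, float factor);
//
//   auto device = cuda::device::current::get();
//   auto stream = device.create_stream(cuda::stream::async);
//   auto config = cuda::launch_configuration_t{ 128, 256 };
//   stream.enqueue.kernel_launch(scale_by, config, buffer, 2.0f);
//   stream.synchronize();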