cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
launch_config_builder.hpp
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
11 #define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
12 
13 #include "launch_configuration.hpp"
14 #include "kernel.hpp"
15 #include "device.hpp"
16 #include "types.hpp"
17 
18 namespace cuda {
19 
20 namespace grid {
21 
22 namespace detail_ {
23 
24 inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
25 {
26  dimension_t quotient = static_cast<dimension_t>(dividend / divisor);
27  // It is up to the caller to ensure we don't overflow the dimension_t type
28  return (divisor * quotient == dividend) ? quotient : quotient + 1;
29 }
30 
31 inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
32 {
33  return {
34  div_rounding_up(overall_dims.x, block_dims.x),
35  div_rounding_up(overall_dims.y, block_dims.y),
36  div_rounding_up(overall_dims.z, block_dims.z)
37  };
38 }
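// Example: covering an overall extent of 1000 threads along one axis with
// blocks of 256 threads each gives div_rounding_up(1000, 256) == 4 blocks;
// the last block extends 24 threads past the overall extent, which the
// kernel is expected to handle (e.g. via a bounds check).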
39 
40 // Note: We're not implementing a grid-to-block rounding up here, since - currently -
41 // block_dimensions_t is the same as grid_dimensions_t.
42 
43 } // namespace detail_
44 
45 } // namespace grid
46 
47 #ifndef NDEBUG
48 
49 namespace detail_ {
50 
51 static void validate_all_dimensions_compatibility(
52  grid::block_dimensions_t block,
53  grid::dimensions_t grid,
54  grid::overall_dimensions_t overall)
55 {
56  if (grid * block != overall) {
57  throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
58  }
59 }
60 
61 } // namespace detail_
62 
63 #endif // NDEBUG
64 
 65 class launch_config_builder_t {
 66 protected:
67  memory::shared::size_t get_dynamic_shared_memory_size(grid::block_dimensions_t block_dims) const
68  {
69  return static_cast<memory::shared::size_t>((dynamic_shared_memory_size_determiner_ == nullptr) ?
70  dynamic_shared_memory_size_ :
71  dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
72  // Q: Why the need for type conversion?
73  // A: MSVC is being a bit finicky here for some reason
74  }
75 
76 #ifndef NDEBUG
77  grid::composite_dimensions_t get_unvalidated_composite_dimensions() const noexcept(false)
78 #else
79  grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
80 #endif
 81  {
 82  grid::composite_dimensions_t result;
83  if (saturate_with_active_blocks_) {
84 #if CUDA_VERSION >= 10000
85  if (use_min_params_for_max_occupancy_) {
86  throw ::std::logic_error(
87  "Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
88  "the grid with fixed-size blocks.");
89  }
90 #endif
91  if (not (kernel_)) {
92  throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
93  }
94  if (not (dimensions_.block)) {
95  throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
96  }
97  if (dimensions_.grid or dimensions_.overall) {
98  throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
99  }
100 
101  result.block = dimensions_.block.value();
102  auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
103  auto num_block_threads = static_cast<grid::block_dimension_t>(dimensions_.block.value().volume());
104  auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
105  auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
106  result.grid = blocks_per_multiprocessor * num_multiprocessors;
107  return result;
108  }
109 #if CUDA_VERSION >= 10000
110  if (use_min_params_for_max_occupancy_) {
111  if (not (kernel_)) {
 112  throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
113  }
114  if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
 115  throw ::std::logic_error("Conflicting specifications: Block, grid or overall dimensions specified, but requested the minimum grid parameters for maximum occupancy");
116  }
117  auto composite_dims = dynamic_shared_memory_size_determiner_ ?
118  kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
119  kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
120  result.block = composite_dims.block;
121  result.grid = composite_dims.grid;
122  return result;
123  }
124 #endif
125  if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
126  result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
127  result.block = dimensions_.block.value();
128  return result;
129  }
130  if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
131  result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
132  result.grid = dimensions_.grid.value();
133  return result;
134  }
135 
136  if (dimensions_.grid and dimensions_.block) {
137  if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
138  throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
139  }
140  result.block = dimensions_.block.value();
141  result.grid = dimensions_.grid.value();
142  return result;
143  }
144 
145  if (not dimensions_.block and not dimensions_.grid) {
146  throw ::std::logic_error(
147  "Neither block nor grid dimensions have been specified");
148  } else if (not dimensions_.block and not dimensions_.overall) {
149  throw ::std::logic_error(
150  "Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified "
151  "in terms of blocks, not threads, with no block dimensions specified");
 152  } else { // it must be the case that only the block dimensions have been specified (no grid and no overall dimensions)
153  throw ::std::logic_error(
154  "Only block dimensions have been specified - cannot resolve launch grid dimensions");
155  }
156  }
157 
158 #ifndef NDEBUG
159  grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
160  {
161  auto result = get_unvalidated_composite_dimensions();
162  validate_composite_dimensions(result);
163  return result;
164  }
165 #endif
166 
167 public:
168  launch_configuration_t build() const
169  {
170  auto result = launch_configuration_t{ get_composite_dimensions() };
171  result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
172  result.block_cooperation = thread_block_cooperation;
173  // TODO: More fields!
174  return result;
175  }
176 
177 protected:
178 
179  struct {
180  optional<grid::block_dimensions_t > block;
181  optional<grid::dimensions_t > block_cluster;
182  optional<grid::dimensions_t > grid;
183  optional<grid::overall_dimensions_t> overall;
184  } dimensions_;
185 
186  bool thread_block_cooperation { false };
187 
188  // Note: We could have used a variant between these two;
189  // but the semantic is that if the determiner is not null, we use it;
 190  // and if you want to force a concrete a priori value, then you nullify
191  // the determiner
192  kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
193  memory::shared::size_t dynamic_shared_memory_size_ { 0 };
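// Illustrative note (an editorial sketch, not part of the original file): the
// builder accepts either a fixed size in bytes, e.g.
//
//     builder.dynamic_shared_memory_size(1024);
//
// or - assuming kernel::shared_memory_size_determiner_t is a plain function
// pointer taking a block size, as its invocation in
// get_dynamic_shared_memory_size() above suggests - a capture-less callable:
//
//     builder.dynamic_shared_memory(
//         [](int block_size) { return block_size * sizeof(float); });
//
// Setting a fixed size nullifies any previously-set determiner; a non-null
// determiner otherwise takes precedence over the fixed value.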
194 
195  const kernel_t* kernel_ { nullptr };
196  optional<device::id_t> device_;
197  bool saturate_with_active_blocks_ { false };
198 #if CUDA_VERSION >= 10000
199  bool use_min_params_for_max_occupancy_ { false };
200 #endif
201 
202  static cuda::device_t device(optional<device::id_t> maybe_id)
203  {
204  return cuda::device::get(maybe_id.value());
205  }
206 
207  cuda::device_t device() const { return device(device_.value()); }
208 
 209  launch_config_builder_t& operator=(launch_configuration_t config)
 210  {
211 #ifndef NDEBUG
212  detail_::validate(config);
213  if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
214  if (device_) { detail_::validate_compatibility(device(), config); }
215 #endif
216  thread_block_cooperation = config.block_cooperation;
217  dynamic_shared_memory_size_ = config.dynamic_shared_memory_size;
218  dimensions(config.dimensions);
219  return *this;
220  }
221 
222 #ifndef NDEBUG
223  static void validate_compatibility(
224  const kernel_t* kernel_ptr,
225  memory::shared::size_t shared_mem_size)
226  {
227  if (kernel_ptr == nullptr) { return; }
228  detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
229  }
230 
231  static void validate_compatibility(
232  optional<device::id_t> maybe_device_id,
233  memory::shared::size_t shared_mem_size)
234  {
235  if (not maybe_device_id) { return; }
236  detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
237  }
238 
239  void validate_dynamic_shared_memory_size(memory::shared::size_t size)
240  {
241  validate_compatibility(kernel_, size);
242  validate_compatibility(device_, size);
243  }
244 
245  static void validate_block_dimension_compatibility(
246  const kernel_t* kernel_ptr,
247  grid::block_dimensions_t block_dims)
248  {
249  if (kernel_ptr == nullptr) { return; }
250  return detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
251  }
252 
253  static void validate_block_dimension_compatibility(
254  optional<device::id_t> maybe_device_id,
255  grid::block_dimensions_t block_dims)
256  {
257  if (not maybe_device_id) { return; }
258  detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
259  }
260 
261  void validate_block_dimensions(grid::block_dimensions_t block_dims) const
262  {
263  detail_::validate_block_dimensions(block_dims);
264  if (dimensions_.grid and dimensions_.overall) {
265  detail_::validate_all_dimensions_compatibility(
266  block_dims, dimensions_.grid.value(), dimensions_.overall.value());
267  }
268  // TODO: Check divisibility
269  validate_block_dimension_compatibility(kernel_, block_dims);
270  validate_block_dimension_compatibility(device_, block_dims);
271  }
272 
273 
274  static void validate_grid_dimension_compatibility(
275  optional<device::id_t> maybe_device_id,
 276  grid::dimensions_t grid_dims)
 277  {
 278  if (not maybe_device_id) { return; }
 279  detail_::validate_grid_dimension_compatibility(device(maybe_device_id), grid_dims);
280  }
281 
282  void validate_grid_dimensions(grid::dimensions_t grid_dims) const
283  {
284  detail_::validate_grid_dimensions(grid_dims);
285  if (dimensions_.block and dimensions_.overall) {
286  detail_::validate_all_dimensions_compatibility(
287  dimensions_.block.value(), grid_dims, dimensions_.overall.value());
288  }
289  // TODO: Check divisibility
290  }
291 
292 #if CUDA_VERSION >= 12000
293  void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
294  {
 295  if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
296  throw ::std::runtime_error("The requested block cluster dimensions do not "
297  "divide the grid dimensions (in blocks)");
298  }
299  }
300 #endif // CUDA_VERSION >= 12000
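// Illustrative note: assuming grid::dimensions_t::divides() checks component-wise
// divisibility, a cluster of {2, 2, 1} blocks is acceptable for a grid of
// {8, 4, 2} blocks, while a cluster of {3, 1, 1} would be rejected, since 3
// does not divide 8.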
301 
302  void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
303  {
304  if (dimensions_.block and dimensions_.grid) {
305  if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
306  throw ::std::invalid_argument(
307  "specified overall dimensions conflict with the already-specified "
308  "block and grid dimensions");
309  }
310  }
311  }
312 
313  void validate_kernel(const kernel_t* kernel_ptr) const
314  {
315  if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
316  auto block_dims = dimensions_.block ?
317  dimensions_.block.value() :
318  get_composite_dimensions().block;
319  validate_block_dimension_compatibility(kernel_ptr, block_dims);
320  }
321  validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
322  }
323 
324  void validate_device(device::id_t device_id) const
325  {
326  if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
327  auto block_dims = dimensions_.block ?
328  dimensions_.block.value() :
329  get_composite_dimensions().block;
330  validate_block_dimension_compatibility(device_id, block_dims);
331  }
332  detail_::validate_compatibility(
333  device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
334  }
335 
336  void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
337  {
338  validate_block_dimension_compatibility(kernel_, composite_dims.block);
339  validate_block_dimension_compatibility(device_, composite_dims.block);
340 
341  // Is there anything to validate regarding the grid dims?
342  validate_grid_dimension_compatibility(device_, composite_dims.grid);
343  }
344 #endif // ifndef NDEBUG
345 
346 public:
347  launch_config_builder_t& dimensions(grid::composite_dimensions_t composite_dims)
348  {
349 #ifndef NDEBUG
350  validate_composite_dimensions(composite_dims);
351 #endif
352  dimensions_.overall = nullopt;
353  dimensions_.grid = composite_dims.grid;
354  dimensions_.block = composite_dims.block;
355  return *this;
356  }
357 
358  launch_config_builder_t& block_dimensions(grid::block_dimensions_t dims)
359  {
360 #ifndef NDEBUG
361  validate_block_dimensions(dims);
362 #endif
363  dimensions_.block = dims;
364  if (dimensions_.grid) {
365  dimensions_.overall = nullopt;
366  }
367  return *this;
368 
369  }
370 
 371  launch_config_builder_t& block_dimensions(
 372  grid::block_dimension_t x,
 373  grid::block_dimension_t y = 1,
 374  grid::block_dimension_t z = 1)
 375  {
376  return block_dimensions(grid::block_dimensions_t{x, y, z});
377  }
378 
379  launch_config_builder_t& block_size(grid::block_dimension_t size) { return block_dimensions(size, 1, 1); }
380 
381  launch_config_builder_t& use_maximum_linear_block()
382  {
383  grid::block_dimension_t max_size;
384  if (kernel_) {
385  max_size = kernel_->maximum_threads_per_block();
386  }
387  else if (device_) {
388  max_size = device().maximum_threads_per_block();
389  }
390  else {
391  throw ::std::logic_error("Request to use the maximum-size linear block, with no device or kernel specified");
392  }
393  auto block_dims = grid::block_dimensions_t { max_size, 1, 1 };
394 
395  if (dimensions_.grid and dimensions_.overall) {
396  dimensions_.overall = nullopt;
397  }
398  dimensions_.block = block_dims;
399  return *this;
400  }
401 
402 #if CUDA_VERSION >= 12000
403  launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
404  {
405 #ifndef NDEBUG
406  validate_cluster_dimensions(cluster_dims);
407 #endif
408  dimensions_.block_cluster = cluster_dims;
409  return *this;
410  }
411 #endif
412 
413  launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
414  {
415 #ifndef NDEBUG
416  validate_grid_dimensions(dims);
417 #endif
418  if (dimensions_.block) {
419  dimensions_.overall = nullopt;
420  }
421  dimensions_.grid = dims;
422  saturate_with_active_blocks_ = false;
423  return *this;
424  }
425 
 426  launch_config_builder_t& grid_dimensions(
 427  grid::dimension_t x,
428  grid::dimension_t y = 1,
429  grid::dimension_t z = 1)
430  {
431  return grid_dimensions(grid::dimensions_t{x, y, z});
432  }
433 
434  launch_config_builder_t& grid_size(grid::dimension_t size) {return grid_dimensions(size, 1, 1); }
435  launch_config_builder_t& num_blocks(grid::dimension_t size) {return grid_size(size); }
436 
437  launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims)
438  {
439 #ifndef NDEBUG
440  validate_overall_dimensions(dims);
441 #endif
442  dimensions_.overall = dims;
443  saturate_with_active_blocks_ = false;
444  return *this;
445  }
 446  launch_config_builder_t& overall_dimensions(
 447  grid::overall_dimension_t x,
 448  grid::overall_dimension_t y = 1,
 449  grid::overall_dimension_t z = 1)
 450  {
451  return overall_dimensions(grid::overall_dimensions_t{x, y, z});
452  }
453 
454  launch_config_builder_t& overall_size(grid::overall_dimension_t size) { return overall_dimensions(size, 1, 1); }
455 
456  launch_config_builder_t& block_cooperation(bool cooperation)
457  {
458  thread_block_cooperation = cooperation;
459  return *this;
460  }
461 
462  launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }
463  launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }
464 
465  launch_config_builder_t& dynamic_shared_memory_size(
466  kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
467  {
468  dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
469  return *this;
470  }
471 
472  launch_config_builder_t& no_dynamic_shared_memory()
473  {
474  return dynamic_shared_memory_size(memory::shared::size_t(0));
475  }
476 
477  launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
478  {
479 #ifndef NDEBUG
480  validate_dynamic_shared_memory_size(size);
481 #endif
482  dynamic_shared_memory_size_ = size;
483  dynamic_shared_memory_size_determiner_ = nullptr;
484  return *this;
485  }
486 
487  launch_config_builder_t& dynamic_shared_memory(memory::shared::size_t size)
488  {
489  return dynamic_shared_memory_size(size);
490  }
491 
492  launch_config_builder_t& dynamic_shared_memory(
493  kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
494  {
495  return dynamic_shared_memory_size(shared_mem_size_determiner);
496  }
497 
498  launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
499  {
 500  if (device_ and wrapped_kernel_ptr->device_id() != device_) {
501  throw ::std::invalid_argument("Launch config builder already associated with "
502  + device::detail_::identify(*device_) + " and cannot further be associated "
503  "with " +kernel::detail_::identify(*wrapped_kernel_ptr));
504  }
505 #ifndef NDEBUG
506  validate_kernel(wrapped_kernel_ptr);
507 #endif
508  kernel_ = wrapped_kernel_ptr;
509  return *this;
510  }
511 
512  launch_config_builder_t& device(const device::id_t device_id)
513  {
514  if (kernel_ and kernel_->device_id() != device_id) {
515  throw ::std::invalid_argument("Launch config builder already associated with "
516  + kernel::detail_::identify(*kernel_) + " and cannot further be associated "
517  "another device: " + device::detail_::identify(device_id));
518  }
519  device_ = device_id;
520  return *this;
521  }
522 
523  launch_config_builder_t& device(const device_t& device)
524  {
525  return this->device(device.id());
526  }
527 
528  launch_config_builder_t& kernel_independent()
529  {
530  kernel_ = nullptr;
531  return *this;
532  }
533  launch_config_builder_t& no_kernel()
534  {
535  kernel_ = nullptr;
536  return *this;
537  }
538 
 546  launch_config_builder_t& saturate_with_active_blocks()
 547  {
548  if (not (kernel_)) {
549  throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
550  }
551  if (not (dimensions_.block)) {
552  throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
553  }
554  dimensions_.grid = nullopt;
555  dimensions_.overall = nullopt;
556 #if CUDA_VERSION >= 10000
557  use_min_params_for_max_occupancy_ = false;
558 #endif
559  saturate_with_active_blocks_ = true;
560  return *this;
561  }
562 
563  launch_config_builder_t& min_params_for_max_occupancy()
564  {
565  if (not (kernel_)) {
 566  throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
567  }
568  dimensions_.block = nullopt;
569  dimensions_.grid = nullopt;
570  dimensions_.overall = nullopt;
571 #if CUDA_VERSION >= 10000
572  use_min_params_for_max_occupancy_ = true;
573 #endif
574  saturate_with_active_blocks_ = false;
575  return *this;
576  }
577 }; // launch_config_builder_t
578 
579 inline launch_config_builder_t launch_config_builder() { return {}; }
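// Illustrative usage sketch (an editorial addition, not part of the original
// file); `my_kernel` is assumed to be a previously-obtained cuda::kernel_t
// for which a launch is being configured:
//
//     auto config = cuda::launch_config_builder()
//         .kernel(&my_kernel)
//         .overall_size(1000000)
//         .use_maximum_linear_block()
//         .no_dynamic_shared_memory()
//         .build();
//
// Here the builder picks the largest 1D block the kernel supports, derives
// the grid dimensions by rounding 1000000 up to a whole number of such
// blocks, and (in debug builds) validates the result against the kernel.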
580 
581 } // namespace cuda
582 
583 #endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_