cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
launch_config_builder.hpp
#pragma once
#ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
#define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_

// This definition in types.hpp is usually sufficient, but let's be on the safe side
#ifdef _MSC_VER
// See @url https://stackoverflow.com/q/4913922/1593077
#define NOMINMAX
#endif

#include "launch_configuration.hpp"
#include "kernel_launch.hpp"
#include "device.hpp"
#include "types.hpp"

#include <limits>
#include <string>

namespace cuda {

namespace detail_ {

void validate_shared_mem_size_compatibility(const kernel_t& kernel, memory::shared::size_t shared_mem_size) noexcept(false);
void validate_shared_mem_compatibility(const device_t& device, memory::shared::size_t shared_mem_size) noexcept(false);
void validate_grid_dimension_compatibility(const device_t& device, grid::block_dimensions_t block_dims) noexcept(false);
void validate_compatibility(const kernel_t& kernel, launch_configuration_t launch_config) noexcept(false);
void validate_compatibility(const device::id_t, memory::shared::size_t, bool, optional<grid::dimensions_t>) noexcept(false);

} // namespace detail_


namespace grid {

namespace detail_ {

inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
{
	dimension_t quotient = static_cast<dimension_t>(dividend / divisor);
	// It is up to the caller to ensure we don't overflow the dimension_t type
	return (divisor * quotient == dividend) ? quotient : quotient + 1;
}

inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
{
	return {
		div_rounding_up(overall_dims.x, block_dims.x),
		div_rounding_up(overall_dims.y, block_dims.y),
		div_rounding_up(overall_dims.z, block_dims.z)
	};
}
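
// A worked example of this rounding-up division (the values are arbitrary):
// covering an overall 1000 x 300 x 1 threads with blocks of 256 x 16 x 1
// threads requires div_rounding_up(1000, 256) = 4 by div_rounding_up(300, 16) = 19
// by div_rounding_up(1, 1) = 1, i.e. a grid of dimensions_t{ 4, 19, 1 } blocks.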

// Note: We're not implementing a grid-to-block rounding up here, since - currently -
// block_dimensions_t is the same as grid_dimensions_t.

} // namespace detail_

} // namespace grid

#ifndef NDEBUG

namespace detail_ {

static void validate_all_dimensions_compatibility(
	grid::block_dimensions_t block,
	grid::dimensions_t grid,
	grid::overall_dimensions_t overall)
{
	if (grid * block != overall) {
		throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
	}
}

} // namespace detail_

#endif // NDEBUG

/// A convenience class for gradually constructing a launch_configuration_t
/// instance, as per the "builder" design pattern
class launch_config_builder_t {
protected:
	memory::shared::size_t get_dynamic_shared_memory_size(grid::block_dimensions_t block_dims) const
	{
		return static_cast<memory::shared::size_t>((dynamic_shared_memory_size_determiner_ == nullptr) ?
			dynamic_shared_memory_size_ :
			dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
		// Q: Why the need for type conversion?
		// A: MSVC is being a bit finicky here for some reason
	}

	grid::composite_dimensions_t get_unvalidated_composite_dimensions() const noexcept(false)
	{
		grid::composite_dimensions_t result;
		if (saturate_with_active_blocks_) {
#if CUDA_VERSION >= 10000
			if (use_min_params_for_max_occupancy_) {
				throw ::std::logic_error(
					"Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
					"the grid with fixed-size blocks.");
			}
#endif
			if (not (kernel_)) {
				throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
			}
			if (not (dimensions_.block)) {
				throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
			}
			if (dimensions_.grid or dimensions_.overall) {
				throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
			}

			result.block = dimensions_.block.value();
			auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
			auto num_block_threads = static_cast<grid::block_dimension_t>(dimensions_.block.value().volume());
			auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
			auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
			result.grid = blocks_per_multiprocessor * num_multiprocessors;
			return result;
		}
#if CUDA_VERSION >= 10000
		if (use_min_params_for_max_occupancy_) {
			if (not (kernel_)) {
				throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
			}
			if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
				throw ::std::logic_error("Conflicting specifications: Block, grid or overall dimensions specified, but requested the minimum grid parameters for maximum occupancy");
			}
			auto composite_dims = dynamic_shared_memory_size_determiner_ ?
				kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
				kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
			result.block = composite_dims.block;
			result.grid = composite_dims.grid;
			return result;
		}
#endif
		if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
			result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
			result.block = dimensions_.block.value();
			return result;
		}
		if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
			result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
			result.grid = dimensions_.grid.value();
			return result;
		}

		if (dimensions_.grid and dimensions_.block) {
			if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
				throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
			}
			result.block = dimensions_.block.value();
			result.grid = dimensions_.grid.value();
			return result;
		}

		if (not dimensions_.block and not dimensions_.grid) {
			throw ::std::logic_error(
				"Neither block nor grid dimensions have been specified");
		} else if (not dimensions_.block and not dimensions_.overall) {
			throw ::std::logic_error(
				"Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified "
				"in terms of blocks, not threads, with no block dimensions specified");
		} else { // it must be the case that only the block dimensions were specified
			throw ::std::logic_error(
				"Only block dimensions have been specified - cannot resolve launch grid dimensions");
		}
	}

	grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
	{
		auto result = get_unvalidated_composite_dimensions();
#ifndef NDEBUG
		validate_composite_dimensions(result);
#endif
		return result;
	}

public:
	/// Use the information specified to the builder (and defaults for what has
	/// not been specified) to finalize the construction of a kernel launch
	/// configuration
	launch_configuration_t build() const
	{
		auto result = launch_configuration_t{ get_composite_dimensions() };
		result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
		result.block_cooperation = thread_block_cooperation;
		// TODO: More fields!
		return result;
	}

protected:

	struct {
		optional<grid::block_dimensions_t  > block;
		optional<grid::dimensions_t        > block_cluster;
		optional<grid::dimensions_t        > grid;
		optional<grid::overall_dimensions_t> overall;
	} dimensions_;

	bool thread_block_cooperation { false };

	// Note: We could have used a variant of these two; but the semantics are:
	// if the determiner is non-null, we use it; and to force a concrete
	// a-priori value, one nullifies the determiner
	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
	memory::shared::size_t dynamic_shared_memory_size_ { 0 };
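
	// A hypothetical example of such a determiner (not part of this file): a
	// kernel needing one float of dynamic shared memory per thread could use
	//
	//     size_t CUDA_CB one_float_per_thread(int block_size)
	//     {
	//         return block_size * sizeof(float);
	//     }
	//
	// and pass &one_float_per_thread to dynamic_shared_memory_size() below.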

	const kernel_t* kernel_ { nullptr };
	optional<device::id_t> device_;
	bool saturate_with_active_blocks_ { false };
#if CUDA_VERSION >= 10000
	bool use_min_params_for_max_occupancy_ { false };
#endif

	static cuda::device_t device(optional<device::id_t> maybe_id)
	{
		return cuda::device::get(maybe_id.value());
	}

	cuda::device_t device() const { return device(device_.value()); }

	launch_config_builder_t& configure_for(launch_configuration_t config)
	{
#ifndef NDEBUG
		detail_::validate(config);
		if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
		if (device_) { detail_::validate_compatibility(device(), config); }
#endif
		thread_block_cooperation = config.block_cooperation;
		dynamic_shared_memory_size_ = config.dynamic_shared_memory_size;
		dimensions(config.dimensions);
		return *this;
	}

#ifndef NDEBUG
	static void validate_compatibility(
		const kernel_t* kernel_ptr,
		memory::shared::size_t shared_mem_size)
	{
		if (kernel_ptr == nullptr) { return; }
		detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
	}

	static void validate_compatibility(
		optional<device::id_t> maybe_device_id,
		memory::shared::size_t shared_mem_size)
	{
		if (not maybe_device_id) { return; }
		detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
	}

	void validate_dynamic_shared_memory_size(memory::shared::size_t size)
	{
		validate_compatibility(kernel_, size);
		validate_compatibility(device_, size);
	}

	static void validate_block_dimension_compatibility(
		const kernel_t* kernel_ptr,
		grid::block_dimensions_t block_dims)
	{
		if (kernel_ptr == nullptr) { return; }
		return detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
	}

	static void validate_block_dimension_compatibility(
		optional<device::id_t> maybe_device_id,
		grid::block_dimensions_t block_dims)
	{
		if (not maybe_device_id) { return; }
		detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
	}

	void validate_block_dimensions(grid::block_dimensions_t block_dims) const
	{
		detail_::validate_block_dimensions(block_dims);
		if (dimensions_.grid and dimensions_.overall) {
			detail_::validate_all_dimensions_compatibility(
				block_dims, dimensions_.grid.value(), dimensions_.overall.value());
		}
		// TODO: Check divisibility
		validate_block_dimension_compatibility(kernel_, block_dims);
		validate_block_dimension_compatibility(device_, block_dims);
	}

	static void validate_grid_dimension_compatibility(
		optional<device::id_t> maybe_device_id,
		grid::dimensions_t grid_dims)
	{
		if (not maybe_device_id) { return; }
		detail_::validate_grid_dimension_compatibility(device(maybe_device_id), grid_dims);
	}

	void validate_grid_dimensions(grid::dimensions_t grid_dims) const
	{
		detail_::validate_grid_dimensions(grid_dims);
		if (dimensions_.block and dimensions_.overall) {
			detail_::validate_all_dimensions_compatibility(
				dimensions_.block.value(), grid_dims, dimensions_.overall.value());
		}
		// TODO: Check divisibility
	}

#if CUDA_VERSION >= 12000
	void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
	{
		// Complain only when the cluster dimensions fail to divide the grid dimensions
		if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
			throw ::std::runtime_error("The requested block cluster dimensions do not "
				"divide the grid dimensions (in blocks)");
		}
	}
#endif // CUDA_VERSION >= 12000

	void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
	{
		if (dimensions_.block and dimensions_.grid) {
			if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
				throw ::std::invalid_argument(
					"specified overall dimensions conflict with the already-specified "
					"block and grid dimensions");
			}
		}
	}

	void validate_kernel(const kernel_t* kernel_ptr) const
	{
		if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
			auto block_dims = dimensions_.block ?
				dimensions_.block.value() :
				get_composite_dimensions().block;
			validate_block_dimension_compatibility(kernel_ptr, block_dims);
		}
		validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
	}

	void validate_device(device::id_t device_id) const
	{
		if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
			auto block_dims = dimensions_.block ?
				dimensions_.block.value() :
				get_composite_dimensions().block;
			validate_block_dimension_compatibility(device_id, block_dims);
		}
		detail_::validate_compatibility(
			device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
	}

	void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
	{
		validate_block_dimension_compatibility(kernel_, composite_dims.block);
		validate_block_dimension_compatibility(device_, composite_dims.block);

		// Is there anything to validate regarding the grid dims?
		validate_grid_dimension_compatibility(device_, composite_dims.grid);
	}
#endif // ifndef NDEBUG

public:
	launch_config_builder_t& dimensions(grid::composite_dimensions_t composite_dims)
	{
#ifndef NDEBUG
		validate_composite_dimensions(composite_dims);
#endif
		dimensions_.overall = nullopt;
		dimensions_.grid = composite_dims.grid;
		dimensions_.block = composite_dims.block;
		return *this;
	}

	launch_config_builder_t& block_dimensions(grid::block_dimensions_t dims)
	{
#ifndef NDEBUG
		validate_block_dimensions(dims);
#endif
		dimensions_.block = dims;
		if (dimensions_.grid) {
			dimensions_.overall = nullopt;
		}
		return *this;
	}

	/// Set the dimensions for each block in the intended kernel launch grid
	launch_config_builder_t& block_dimensions(
		grid::block_dimension_t x,
		grid::block_dimension_t y = 1,
		grid::block_dimension_t z = 1)
	{
		return block_dimensions(grid::block_dimensions_t{x, y, z});
	}

	/// Set the block in the intended kernel launch grid to be uni-dimensional,
	/// with a specified size
	launch_config_builder_t& block_size(size_t size)
	{
		static constexpr const auto max_representable_block_dim = ::std::numeric_limits<grid::block_dimension_t>::max();
		if (size > static_cast<size_t>(max_representable_block_dim)) {
			throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
				+ " exceeds " + ::std::to_string(max_representable_block_dim)
				+ ", the maximum representable size of a block");
			// Note this is a super-lenient check, since in practice, device properties
			// limit block sizes at much lower values; but NVIDIA doesn't "let us know that" via
			// any global definitions.
		}
		if (kernel_) {
			auto max_threads_per_block = kernel_->maximum_threads_per_block();
			if (size > max_threads_per_block) {
				throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
					+ " exceeds " + ::std::to_string(max_threads_per_block)
					+ ", the maximum number of threads per block supported by "
					+ kernel::detail_::identify(*kernel_));
			}
		}
		if (device_) {
			auto max_threads_per_block = device().maximum_threads_per_block();
			if (size > max_threads_per_block) {
				throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
					+ " exceeds " + ::std::to_string(max_threads_per_block)
					+ ", the maximum number of threads per block supported by "
					+ device::detail_::identify(device_.value()));
			}
		}
		return block_dimensions(static_cast<grid::block_dimension_t>(size), 1, 1);
	}

	/// Set the intended kernel launch grid to have 1D blocks, of the maximum
	/// length possible given the information specified to the builder
	launch_config_builder_t& use_maximum_linear_block()
	{
		grid::block_dimension_t max_size;
		if (kernel_) {
			max_size = kernel_->maximum_threads_per_block();
		}
		else if (device_) {
			max_size = device().maximum_threads_per_block();
		}
		else {
			throw ::std::logic_error("Request to use the maximum-size linear block, with no device or kernel specified");
		}
		auto block_dims = grid::block_dimensions_t { max_size, 1, 1 };

		if (dimensions_.grid and dimensions_.overall) {
			dimensions_.overall = nullopt;
		}
		dimensions_.block = block_dims;
		return *this;
	}

#if CUDA_VERSION >= 12000
	launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
	{
#ifndef NDEBUG
		validate_cluster_dimensions(cluster_dims);
#endif
		dimensions_.block_cluster = cluster_dims;
		return *this;
	}
#endif

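	// A hypothetical usage sketch for block clusters (these require compute
	// capability 9.0 or later; the values here are arbitrary, not a recommendation):
	//
	//     auto config = cuda::launch_config_builder()
	//         .block_dimensions(128, 1, 1)
	//         .grid_dimensions(64, 1, 1)
	//         .cluster_blocks(cuda::grid::dimensions_t{2, 1, 1})
	//         .build();
	//
	// Each cluster then spans 2 of the grid's 64 blocks; note that the cluster
	// dimensions must divide the grid dimensions.
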
	/// Set the dimensions of the grid for the intended kernel launch, in
	/// terms of blocks
	launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
	{
#ifndef NDEBUG
		validate_grid_dimensions(dims);
#endif
		if (dimensions_.block) {
			dimensions_.overall = nullopt;
		}
		dimensions_.grid = dims;
		saturate_with_active_blocks_ = false;
		return *this;
	}

	launch_config_builder_t& grid_dimensions(
		grid::dimension_t x,
		grid::dimension_t y = 1,
		grid::dimension_t z = 1)
	{
		return grid_dimensions(grid::dimensions_t{x, y, z});
	}

	/// Set the grid for the intended launch to be one-dimensional, with a
	/// specified number of blocks
	launch_config_builder_t& grid_size(size_t size)
	{
#ifndef NDEBUG
		if (size > static_cast<size_t>(::std::numeric_limits<int>::max())) {
			throw ::std::invalid_argument("Specified (1-dimensional) grid size " + ::std::to_string(size)
				+ " in blocks exceeds " + ::std::to_string(::std::numeric_limits<int>::max())
				+ ", the maximum supported number of blocks");
		}
#endif
		return grid_dimensions(static_cast<grid::dimension_t>(size), 1, 1);
	}

	/// A synonym for @ref grid_size
	launch_config_builder_t& num_blocks(size_t size) { return grid_size(size); }

	/// Set the overall number of threads, in each dimension, of all blocks in
	/// the grid of the intended kernel launch
	launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims)
	{
#ifndef NDEBUG
		validate_overall_dimensions(dims);
#endif
		dimensions_.overall = dims;
		saturate_with_active_blocks_ = false;
		return *this;
	}

	launch_config_builder_t& overall_dimensions(
		grid::overall_dimension_t x,
		grid::overall_dimension_t y = 1,
		grid::overall_dimension_t z = 1)
	{
		return overall_dimensions(grid::overall_dimensions_t{x, y, z});
	}

	/// Set the intended launch grid to be linear, with a specified overall
	/// number of threads over all (1D) blocks
	launch_config_builder_t& overall_size(size_t size)
	{
		static_assert(::std::is_same<grid::overall_dimension_t, size_t>::value, "Unexpected type difference");
		return overall_dimensions(size, 1, 1);
	}

	/// Set whether or not the blocks of the launch grid may synchronize with
	/// each other ("cooperative launch")
	launch_config_builder_t& block_cooperation(bool cooperation)
	{
		thread_block_cooperation = cooperation;
		return *this;
	}

	/// Let the kernel's thread blocks synchronize with each other
	launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }

	/// Prevent the kernel's thread blocks from synchronizing with each other,
	/// guaranteeing each block will work entirely independently
	launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }

	launch_config_builder_t& dynamic_shared_memory_size(
		kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
	{
		dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
		return *this;
	}

	/// Indicate that the intended launch should not allocate any shared memory
	/// for the kernel to use beyond its static allocation
	launch_config_builder_t& no_dynamic_shared_memory()
	{
		return dynamic_shared_memory_size(memory::shared::size_t(0));
	}

	/// Indicate that the intended launch should allocate a certain amount of
	/// shared memory for the kernel to use beyond its static allocation
	launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
	{
#ifndef NDEBUG
		validate_dynamic_shared_memory_size(size);
#endif
		dynamic_shared_memory_size_ = size;
		dynamic_shared_memory_size_determiner_ = nullptr;
		return *this;
	}

	/// A synonym for @ref dynamic_shared_memory_size(memory::shared::size_t)
	launch_config_builder_t& dynamic_shared_memory(memory::shared::size_t size)
	{
		return dynamic_shared_memory_size(size);
	}

	/// Indicate that the intended launch should allocate additional shared
	/// memory for the kernel to use beyond its static allocation, in an
	/// amount depending on the block size
	launch_config_builder_t& dynamic_shared_memory(
		kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
	{
		return dynamic_shared_memory_size(shared_mem_size_determiner);
	}

	/// Indicate that the specified wrapped kernel will be the one launched
	/// with the configuration to be produced
	launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
	{
		if (device_ and wrapped_kernel_ptr->device_id() != device_.value()) {
			throw ::std::invalid_argument("Launch config builder already associated with "
				+ device::detail_::identify(*device_) + " and cannot further be associated "
				"with " + kernel::detail_::identify(*wrapped_kernel_ptr));
		}
#ifndef NDEBUG
		validate_kernel(wrapped_kernel_ptr);
#endif
		kernel_ = wrapped_kernel_ptr;
		return *this;
	}

	/// Indicate that the intended kernel launch will take place on (a stream
	/// of) the device with the specified id
	launch_config_builder_t& device(const device::id_t device_id)
	{
		if (kernel_ and kernel_->device_id() != device_id) {
			throw ::std::invalid_argument("Launch config builder already associated with "
				+ kernel::detail_::identify(*kernel_) + " and cannot further be associated with "
				"another device: " + device::detail_::identify(device_id));
		}
		device_ = device_id;
		return *this;
	}

	launch_config_builder_t& device(const device_t& device)
	{
		return this->device(device.id());
	}

	/// Clear the association with a specific kernel (which may have been set
	/// using the @ref kernel method)
	launch_config_builder_t& kernel_independent()
	{
		kernel_ = nullptr;
		return *this;
	}

	/// A synonym for @ref kernel_independent
	launch_config_builder_t& no_kernel()
	{
		kernel_ = nullptr;
		return *this;
	}

	/// Use information about the kernel, the already-set block size, and the
	/// device to size the grid with as many blocks as can be simultaneously
	/// active on the device for this kernel
	launch_config_builder_t& saturate_with_active_blocks()
	{
		if (not (kernel_)) {
			throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
		}
		if (not (dimensions_.block)) {
			throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
		}
		dimensions_.grid = nullopt;
		dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
		use_min_params_for_max_occupancy_ = false;
#endif
		saturate_with_active_blocks_ = true;
		return *this;
	}

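	// A hypothetical usage sketch: with a kernel and 256-thread blocks set,
	// the builder sizes the grid to keep every multiprocessor as busy as this
	// kernel allows (my_kernel is a cuda::kernel_t, for illustration only):
	//
	//     auto config = cuda::launch_config_builder()
	//         .kernel(&my_kernel)
	//         .block_size(256)
	//         .saturate_with_active_blocks()
	//         .build();
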
	/// Use information about the kernel and the device to define a minimal
	/// launch grid which should be able to achieve maximum occupancy of the
	/// device's multiprocessors
	launch_config_builder_t& min_params_for_max_occupancy()
	{
		if (not (kernel_)) {
			throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
		}
		dimensions_.block = nullopt;
		dimensions_.grid = nullopt;
		dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
		use_min_params_for_max_occupancy_ = true;
#endif
		saturate_with_active_blocks_ = false;
		return *this;
	}
}; // launch_config_builder_t

/// A slightly shorter-named construction idiom for launch_config_builder_t
inline launch_config_builder_t launch_config_builder() { return {}; }

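// A usage sketch of the builder (hypothetical values, for illustration only):
//
//     size_t n = 1'000'000;  // hypothetical overall number of threads
//     auto config = cuda::launch_config_builder()
//         .device(cuda::device::current::get())
//         .overall_size(n)
//         .block_size(256)
//         .no_dynamic_shared_memory()
//         .build();
//
// The builder resolves the grid size as div_rounding_up(n, 256) blocks.
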
} // namespace cuda

#endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_