cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
launch_config_builder.hpp
Go to the documentation of this file.
1 
10 #pragma once
11 #ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
12 #define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
13 
14 // This definition in types.hpp is usually sufficient, but let's be on the safe side
15 #ifdef _MSC_VER
16 // See @url https://stackoverflow.com/q/4913922/1593077
17 #define NOMINMAX
18 #endif
19 
20 #include "launch_configuration.hpp"
21 #include "kernel_launch.hpp"
22 #include "device.hpp"
23 #include "types.hpp"
24 
25 #include <limits>
26 #include <string>
27 
28 namespace cuda {
29 
30 namespace detail_ {
31 
// Standalone validators for launch-configuration components; each throws
// (hence the noexcept(false)) on an incompatible or invalid combination.

// Checks that a kernel can accommodate the requested dynamic shared memory size
void validate_shared_mem_size_compatibility(const kernel_t& kernel_ptr, memory::shared::size_t shared_mem_size) noexcept(false);
// Checks that a device can accommodate the requested dynamic shared memory size
void validate_shared_mem_compatibility(const device_t &device, memory::shared::size_t shared_mem_size) noexcept(false);
// Checks that a device supports grids with the given dimensions (in blocks)
void validate_grid_dimension_compatibility(const device_t &device, grid::block_dimensions_t block_dims) noexcept(false);
// Checks a complete launch configuration against a kernel's limits
void validate_compatibility(const kernel_t& kernel, launch_configuration_t launch_config) noexcept(false);
// Checks shared memory size, block cooperation and (optional) cluster dimensions
// against a device's capabilities
void validate_compatibility(device::id_t, memory::shared::size_t, bool, optional<grid::dimensions_t>) noexcept(false);
37 
38 } // namespace detail_
39 
40 
41 namespace grid {
42 
43 namespace detail_ {
44 
45 inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
46 {
47  dimension_t quotient = static_cast<dimension_t>(dividend / divisor);
48  // It is up to the caller to ensure we don't overflow the dimension_t type
49  return (divisor * quotient == dividend) ? quotient : quotient + 1;
50 }
51 
52 inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
53 {
54  return {
55  div_rounding_up(overall_dims.x, block_dims.x),
56  div_rounding_up(overall_dims.y, block_dims.y),
57  div_rounding_up(overall_dims.z, block_dims.z)
58  };
59 }
60 
61 // Note: We're not implementing a grid-to-block rounding up here, since - currently -
62 // block_dimensions_t is the same as grid_dimensions_t.
63 
64 } // namespace detail_
65 
66 } // namespace grid
67 
68 #ifndef NDEBUG
69 
70 namespace detail_ {
71 
72 static void validate_all_dimensions_compatibility(
73  grid::block_dimensions_t block,
74  grid::dimensions_t grid,
75  grid::overall_dimensions_t overall)
76 {
77  if (grid * block != overall) {
78  throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
79  }
80 }
81 
82 } // namespace detail_
83 
84 #endif // NDEBUG
85 
100 protected:
104  memory::shared::size_t get_dynamic_shared_memory_size(grid::block_dimensions_t block_dims) const
105  {
106  return static_cast<memory::shared::size_t>((dynamic_shared_memory_size_determiner_ == nullptr) ?
107  dynamic_shared_memory_size_ :
108  dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
109  // Q: Why the need for type conversion?
110  // A: MSVC is being a bit finicky here for some reason
111  }
112 
113  grid::composite_dimensions_t get_unvalidated_composite_dimensions() const noexcept(false)
114  {
116  if (saturate_with_active_blocks_) {
117 #if CUDA_VERSION >= 10000
118  if (use_min_params_for_max_occupancy_) {
119  throw ::std::logic_error(
120  "Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
121  "the grid with fixed-size blocks.");
122  }
123 #endif
124  if (not (kernel_)) {
125  throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
126  }
127  if (not (dimensions_.block)) {
128  throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
129  }
130  if (dimensions_.grid or dimensions_.overall) {
131  throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
132  }
133 
134  result.block = dimensions_.block.value();
135  auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
136  auto num_block_threads = static_cast<grid::block_dimension_t>(dimensions_.block.value().volume());
137  auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
138  auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
139  result.grid = blocks_per_multiprocessor * num_multiprocessors;
140  return result;
141  }
142 #if CUDA_VERSION >= 10000
143  if (use_min_params_for_max_occupancy_) {
144  if (not (kernel_)) {
145  throw ::std::logic_error("A kernel must be set to determine the minimum grid parameter sfor m");
146  }
147  if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
148  throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
149  }
150  auto composite_dims = dynamic_shared_memory_size_determiner_ ?
151  kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
152  kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
153  result.block = composite_dims.block;
154  result.grid = composite_dims.grid;
155  return result;
156  }
157 #endif
158  if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
159  result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
160  result.block = dimensions_.block.value();
161  return result;
162  }
163  if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
164  result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
165  result.grid = dimensions_.grid.value();
166  return result;
167  }
168 
169  if (dimensions_.grid and dimensions_.block) {
170  if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
171  throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
172  }
173  result.block = dimensions_.block.value();
174  result.grid = dimensions_.grid.value();
175  return result;
176  }
177 
178  if (not dimensions_.block and not dimensions_.grid) {
179  throw ::std::logic_error(
180  "Neither block nor grid dimensions have been specified");
181  } else if (not dimensions_.block and not dimensions_.overall) {
182  throw ::std::logic_error(
183  "Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified "
184  "in terms of blocks, not threads, with no block dimensions specified");
185  } else { // it must be the case that (not dimensions_.block and not dimensions_.overall)
186  throw ::std::logic_error(
187  "Only block dimensions have been specified - cannot resolve launch grid dimensions");
188  }
189  }
190 
193  grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
194  {
195  auto result = get_unvalidated_composite_dimensions();
196 #ifndef NDEBUG
197  validate_composite_dimensions(result);
198 #endif
199  return result;
200  }
201 
202 public:
208  {
209  auto result = launch_configuration_t{ get_composite_dimensions() };
210  result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
211  result.block_cooperation = thread_block_cooperation;
212  // TODO: More fields!
213  detail_::validate(result);
214  if (device_id_) {
215  auto device = device::get(*device_id_);
216  detail_::validate_compatibility(device, result);
217  }
218  if (kernel_) {
219  detail_::validate_compatibility(*kernel_, result);
220  }
221  return result;
222  }
223 
224 protected:
225 
	// Partial specifications of the launch grid's dimensions accumulated so
	// far; any member may still be unset.
	struct {
		// dimensions of each block, in threads
		optional<grid::block_dimensions_t > block;
		// dimensions of a cluster of blocks, in blocks - set via cluster_blocks() (CUDA 12+)
		optional<grid::dimensions_t > block_cluster;
		// dimensions of the grid, in blocks
		optional<grid::dimensions_t > grid;
		// dimensions of the grid overall, in threads
		optional<grid::overall_dimensions_t> overall;
	} dimensions_;

	// Whether the built configuration will request a cooperative launch
	// (i.e. whether blocks may synchronize with each other)
	bool thread_block_cooperation { false };

	// Note: We could have used a variant between these two;
	// but the semantic is that if the determiner is not null, we use it;
	// and if you want to force a concrete apriori value, then you nullify
	// the determiner
	kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ { nullptr };
	memory::shared::size_t dynamic_shared_memory_size_ { 0 };

	// Non-owning; the kernel (if any) the built configuration is intended for
	const kernel_t* kernel_ { nullptr };
	// The device (if any) the built configuration is intended for
	optional<device::id_t> device_id_;
	// When set, grid dimensions are derived at build time by fully occupying
	// all of the device's multiprocessors with the already-set block size
	bool saturate_with_active_blocks_ { false };
#if CUDA_VERSION >= 10000
	// When set, both block and grid dimensions are derived at build time for
	// maximum occupancy with a minimal grid
	bool use_min_params_for_max_occupancy_ { false };
#endif
248 
249  static cuda::device_t device(optional<device::id_t> maybe_id)
250  {
251  return cuda::device::get(maybe_id.value());
252  }
253 
254  cuda::device_t device() const { return device(device_id_.value()); }
255 
	// Adopt all settings of an existing, complete launch configuration -
	// validating it (in debug builds) against any associated kernel/device.
	launch_config_builder_t& configure_for(launch_configuration_t config)
	{
#ifndef NDEBUG
		detail_::validate(config);
		if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
		if (device_id_) { detail_::validate_compatibility(device(), config); }
#endif
		thread_block_cooperation = config.block_cooperation;
		dynamic_shared_memory_size_ = config.dynamic_shared_memory_size;
		dimensions(config.dimensions);
		return *this;
	}
268 
269 #ifndef NDEBUG
270  static void validate_compatibility(
271  const kernel_t* kernel_ptr,
272  memory::shared::size_t shared_mem_size)
273  {
274  if (kernel_ptr == nullptr) { return; }
275  detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
276  }
277 
278  static void validate_compatibility(
279  optional<device::id_t> maybe_device_id,
280  memory::shared::size_t shared_mem_size)
281  {
282  if (not maybe_device_id) { return; }
283  detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
284  }
285 
	// Validate a prospective dynamic shared memory size against whichever of
	// the kernel and the device have been associated (either may be unset).
	void validate_dynamic_shared_memory_size(memory::shared::size_t size)
	{
		validate_compatibility(kernel_, size);
		validate_compatibility(device_id_, size);
	}
291 
292  static void validate_block_dimension_compatibility(
293  const kernel_t* kernel_ptr,
294  grid::block_dimensions_t block_dims)
295  {
296  if (kernel_ptr == nullptr) { return; }
297  return detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
298  }
299 
300  static void validate_block_dimension_compatibility(
301  optional<device::id_t> maybe_device_id,
302  grid::block_dimensions_t block_dims)
303  {
304  if (not maybe_device_id) { return; }
305  detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
306  }
307 
	// Validate prospective block dimensions - intrinsically, for consistency
	// with already-set grid/overall dimensions, and against any associated
	// kernel and device.
	void validate_block_dimensions(grid::block_dimensions_t block_dims) const
	{
		detail_::validate_block_dimensions(block_dims);
		if (dimensions_.grid and dimensions_.overall) {
			detail_::validate_all_dimensions_compatibility(
				block_dims, dimensions_.grid.value(), dimensions_.overall.value());
		}
		// TODO: Check divisibility
		validate_block_dimension_compatibility(kernel_, block_dims);
		validate_block_dimension_compatibility(device_id_, block_dims);
	}
319 
320 
321  static void validate_grid_dimension_compatibility(
322  optional<device::id_t> maybe_device_id,
323  grid::block_dimensions_t block_dims)
324  {
325  if (not maybe_device_id) { return; }
326  detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
327  }
328 
	// Validate prospective grid dimensions (in blocks) - intrinsically, and
	// for consistency with already-set block and overall dimensions.
	void validate_grid_dimensions(grid::dimensions_t grid_dims) const
	{
		detail_::validate_grid_dimensions(grid_dims);
		if (dimensions_.block and dimensions_.overall) {
			detail_::validate_all_dimensions_compatibility(
				dimensions_.block.value(), grid_dims, dimensions_.overall.value());
		}
		// TODO: Check divisibility
	}
338 
339 #if CUDA_VERSION >= 12000
340  void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
341  {
342  if (dimensions_.grid and grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
343  throw ::std::runtime_error("The requested block cluster dimensions do not "
344  "divide the grid dimensions (in blocks)");
345  }
346  }
347 #endif // CUDA_VERSION >= 12000
348 
349  void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
350  {
351  if (dimensions_.block and dimensions_.grid) {
352  if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
353  throw ::std::invalid_argument(
354  "specified overall dimensions conflict with the already-specified "
355  "block and grid dimensions");
356  }
357  }
358  }
359 
	// Validate this builder's current settings against a prospective kernel:
	// the block dimensions (explicit, or derivable from grid + overall) and
	// the dynamic shared memory size.
	void validate_kernel(const kernel_t* kernel_ptr) const
	{
		if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
			// Use the explicit block dimensions if set; otherwise derive them
			auto block_dims = dimensions_.block ?
				dimensions_.block.value() :
				get_composite_dimensions().block;
			validate_block_dimension_compatibility(kernel_ptr, block_dims);
		}
		validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
	}
370 
	// Validate this builder's current settings against a prospective device:
	// block dimensions (explicit or derivable), dynamic shared memory size,
	// block cooperation, and block-cluster dimensions.
	void validate_device(device::id_t device_id) const
	{
		if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
			// Use the explicit block dimensions if set; otherwise derive them
			auto block_dims = dimensions_.block ?
				dimensions_.block.value() :
				get_composite_dimensions().block;
			validate_block_dimension_compatibility(device_id, block_dims);
		}
		detail_::validate_compatibility(
			device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
	}
382 
	// Validate resolved composite dimensions against whichever of the kernel
	// and the device have been associated with this builder.
	void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
	{
		validate_block_dimension_compatibility(kernel_, composite_dims.block);
		validate_block_dimension_compatibility(device_id_, composite_dims.block);

		// Is there anything to validate regarding the grid dims?
		validate_grid_dimension_compatibility(device_id_, composite_dims.grid);
	}
391 #endif // ifndef NDEBUG
392 
393 public:
	/// Set both the grid dimensions (in blocks) and the block dimensions
	/// (in threads) for the intended launch, discarding any previously-set
	/// overall dimensions
	launch_config_builder_t& dimensions(grid::composite_dimensions_t composite_dims)
	{
#ifndef NDEBUG
		validate_composite_dimensions(composite_dims);
#endif
		dimensions_.overall = nullopt;
		dimensions_.grid = composite_dims.grid;
		dimensions_.block = composite_dims.block;
		return *this;
	}
404 
405  launch_config_builder_t& block_dimensions(grid::block_dimensions_t dims)
406  {
407 #ifndef NDEBUG
408  validate_block_dimensions(dims);
409 #endif
410  dimensions_.block = dims;
411  if (dimensions_.grid) {
412  dimensions_.overall = nullopt;
413  }
414  return *this;
415 
416  }
417 
423  {
424  return block_dimensions(grid::block_dimensions_t{x, y, z});
425  }
426 
430  {
431  static constexpr const auto max_representable_block_dim = ::std:: numeric_limits<grid::block_dimension_t> ::max();
432  if (size > (size_t) max_representable_block_dim) {
433  throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
434  + " exceeds " + ::std::to_string(max_representable_block_dim)
435  + " , the maximum representable size of a block");
436  // and note this is a super-lenient check, since in practice, device properties
437  // limit block sizes at much lower values; but NVIDIA doesn't "let us know that" via
438  // any global definitions.
439 
440  }
441  if (kernel_) {
442  auto max_threads_per_block = kernel_->maximum_threads_per_block();
443  if (size > max_threads_per_block) {
444  throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
445  + " exceeds " + ::std::to_string(max_threads_per_block)
446  + " , the maximum number of threads per block supported by "
447  + kernel::detail_::identify(*kernel_));
448  }
449  }
450  if (device_id_) {
451  auto max_threads_per_block = device().maximum_threads_per_block();
452  if (size > max_threads_per_block) {
453  throw ::std::invalid_argument("Specified (1-dimensional) block size " + ::std::to_string(size)
454  + " exceeds " + ::std::to_string(max_threads_per_block)
455  + " , the maximum number of threads per block supported by "
456  + device::detail_::identify(device_id_.value()));
457  }
458  }
459  return block_dimensions(static_cast<grid::block_dimension_t>(size), 1, 1);
460  }
461 
470  {
471  grid::block_dimension_t max_size;
472  if (kernel_) {
473  max_size = kernel_->maximum_threads_per_block();
474  }
475  else if (device_id_) {
476  max_size = device().maximum_threads_per_block();
477  }
478  else {
479  throw ::std::logic_error("Request to use the maximum-size linear block, with no device or kernel specified");
480  }
481  auto block_dims = grid::block_dimensions_t { max_size, 1, 1 };
482 
483  if (dimensions_.grid and dimensions_.overall) {
484  dimensions_.overall = nullopt;
485  }
486  dimensions_.block = block_dims;
487  return *this;
488  }
489 
490 #if CUDA_VERSION >= 12000
491 
	/// Set the dimensions, in blocks, of the clusters of blocks in the
	/// intended launch grid (CUDA 12 and later); in debug builds, checks
	/// that these divide any already-set grid dimensions.
	launch_config_builder_t& cluster_blocks(grid::block_dimensions_t cluster_dims)
	{
#ifndef NDEBUG
		validate_cluster_dimensions(cluster_dims);
#endif
		dimensions_.block_cluster = cluster_dims;
		return *this;
	}
509 #endif
510 
515  {
516 #ifndef NDEBUG
517  validate_grid_dimensions(dims);
518 #endif
519  if (dimensions_.block) {
520  dimensions_.overall = nullopt;
521  }
522  dimensions_.grid = dims;
523  saturate_with_active_blocks_ = false;
524  return *this;
525  }
526 
528  launch_config_builder_t& grid_dimensions(
530  grid::dimension_t y = 1,
531  grid::dimension_t z = 1)
532  {
533  return grid_dimensions(grid::dimensions_t{x, y, z});
534  }
535 
540 #ifndef NDEBUG
541  if (size > static_cast<size_t>(::std::numeric_limits<int>::max())) {
542  throw ::std::invalid_argument("Specified (1-dimensional) grid size " + ::std::to_string(size)
543  + "in blocks exceeds " + ::std::to_string(::std::numeric_limits<int>::max())
544  + " , the maximum supported number of blocks");
545  }
546 #endif
547  return grid_dimensions(static_cast<grid::dimension_t>(size), 1, 1);
548  }
549  launch_config_builder_t& num_blocks(size_t size) {return grid_size(size); }
551 
552 
557  {
558 #ifndef NDEBUG
559  validate_overall_dimensions(dims);
560 #endif
561  dimensions_.overall = dims;
562  saturate_with_active_blocks_ = false;
563  return *this;
564  }
565  launch_config_builder_t& overall_dimensions(
569  {
570  return overall_dimensions(grid::overall_dimensions_t{x, y, z});
571  }
573 
577  {
578  static_assert(::std::is_same<grid::overall_dimension_t, size_t>::value, "Unexpected type difference");
579  return overall_dimensions(size, 1, 1);
580  }
581 
589  {
590  thread_block_cooperation = cooperation;
591  return *this;
592  }
593 
	/// Let the kernel's thread blocks synchronize with each other
	launch_config_builder_t& blocks_may_cooperate() { return block_cooperation(true); }

	/// Have the kernel's thread blocks act entirely independently of each other
	launch_config_builder_t& blocks_dont_cooperate() { return block_cooperation(false); }
601 
	/// Have the dynamic shared memory size for the launch be computed, at
	/// build time, by the specified function of the block size
	launch_config_builder_t& dynamic_shared_memory_size(
		kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
	{
		dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
		return *this;
	}
608 
613  {
614  return dynamic_shared_memory_size(memory::shared::size_t(0));
615  }
616 
622  {
623 #ifndef NDEBUG
624  validate_dynamic_shared_memory_size(size);
625 #endif
626  dynamic_shared_memory_size_ = size;
627  dynamic_shared_memory_size_determiner_ = nullptr;
628  return *this;
629  }
630 
	/// Alias of @ref dynamic_shared_memory_size(memory::shared::size_t)
	launch_config_builder_t& dynamic_shared_memory(memory::shared::size_t size)
	{
		return dynamic_shared_memory_size(size);
	}
636 
647  kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
648  {
649  return dynamic_shared_memory_size(shared_mem_size_determiner);
650  }
651 
661  launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
662  {
663  if (device_id_ and kernel_->device_id() != device_id_.value()) {
664  throw ::std::invalid_argument("Launch config builder already associated with "
665  + device::detail_::identify(*device_id_) + " and cannot further be associated "
666  "with " +kernel::detail_::identify(*wrapped_kernel_ptr));
667  }
668 #ifndef NDEBUG
669  validate_kernel(wrapped_kernel_ptr);
670 #endif
671  kernel_ = wrapped_kernel_ptr;
672  return *this;
673  }
674 
683  launch_config_builder_t& device(const device::id_t device_id)
685  {
686  if (kernel_ and kernel_->device_id() != device_id) {
687  throw ::std::invalid_argument("Launch config builder already associated with "
688  + kernel::detail_::identify(*kernel_) + " and cannot further be associated "
689  "another device: " + device::detail_::identify(device_id));
690  }
691  device_id_ = device_id;
692  return *this;
693  }
694 
	/// Indicate that the intended launch will take place on the specified
	/// (proxied) device
	launch_config_builder_t& device(const device_t& device)
	{
		return this->device(device.id());
	}
700 
705  {
706  kernel_ = nullptr;
707  return *this;
708  }
	/// Clear any association with a specific kernel (alias of @ref kernel_independent)
	launch_config_builder_t& no_kernel()
	{
		kernel_ = nullptr;
		return *this;
	}
715 
724  {
725  if (not (kernel_)) {
726  throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
727  }
728  if (not (dimensions_.block)) {
729  throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
730  }
731  dimensions_.grid = nullopt;
732  dimensions_.overall = nullopt;
733 #if CUDA_VERSION >= 10000
734  use_min_params_for_max_occupancy_ = false;
735 #endif
736  saturate_with_active_blocks_ = true;
737  return *this;
738  }
739 
749  {
750  if (not (kernel_)) {
751  throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
752  }
753  dimensions_.block = nullopt;
754  dimensions_.grid = nullopt;
755  dimensions_.overall = nullopt;
756 #if CUDA_VERSION >= 10000
757  use_min_params_for_max_occupancy_ = true;
758 #endif
759  saturate_with_active_blocks_ = false;
760  return *this;
761  }
762 }; // launch_config_builder_t
763 
766 
767 } // namespace cuda
768 
769 #endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
launch_config_builder_t & blocks_may_cooperate()
Let kernel thread blocks synchronize with each other, or are guaranteed to act independently (atomic ...
Definition: launch_config_builder.hpp:596
launch_config_builder_t & min_params_for_max_occupancy()
This will use information about the kernel and the device to define a minimum launch grid which shoul...
Definition: launch_config_builder.hpp:748
A proxy class for CUDA devices, providing access to all Runtime API calls involving their use and man...
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
launch_config_builder_t & block_dimensions(grid::block_dimension_t x, grid::block_dimension_t y=1, grid::block_dimension_t z=1)
Set the dimensions for each block in the intended kernel launch grid.
Definition: launch_config_builder.hpp:419
launch_config_builder_t & no_dynamic_shared_memory()
Indicate that the intended launch should not allocate any shared memory for the kernel to use beyond ...
Definition: launch_config_builder.hpp:612
launch_config_builder_t & block_cooperation(bool cooperation)
Set whether or not blocks may synchronize with each other.
Definition: launch_config_builder.hpp:588
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
launch_config_builder_t launch_config_builder()
A slightly shorter-named construction idiom for launch_config_builder_t.
Definition: launch_config_builder.hpp:765
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
bool block_cooperation
When true, CUDA's "cooperative launch" mechanism will be used, enabling more flexible device-wide syn...
Definition: launch_configuration.hpp:86
launch_config_builder_t & grid_size(size_t size)
Set the grid for the intended launch to be one-dimensional, with a specified number of blocks...
Definition: launch_config_builder.hpp:539
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
launch_config_builder_t & kernel_independent()
Clear the association with a specific kernel (which may have been set using the kernel method) ...
Definition: launch_config_builder.hpp:704
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions.
Definition: types.hpp:339
static constexpr bool divides(dimensions_t lhs, dimensions_t rhs)
Definition: types.hpp:364
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
A convenience class for gradually constructing a launch_configuration_t instance, as per the "builder...
Definition: launch_config_builder.hpp:99
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
launch_configuration_t build() const
Use the information specified to the builder (and defaults for the unspecified information) to finali...
Definition: launch_config_builder.hpp:207
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id.
Definition: device.hpp:832
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
launch_config_builder_t & overall_size(size_t size)
Set the intended launch grid to be linear, with a specified overall number of threads over all (1D) b...
Definition: launch_config_builder.hpp:576
launch_config_builder_t & use_maximum_linear_block()
Set the intended kernel launch grid to have 1D blocks, of the maximum length possible given the infor...
Definition: launch_config_builder.hpp:469
size_t overall_dimension_t
Dimension of a grid in threads along one axis, i.e.
Definition: types.hpp:423
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Dimensions of a grid in threads, i.e.
Definition: types.hpp:429
launch_config_builder_t & kernel(const kernel_t *wrapped_kernel_ptr)
Indicate that the specified wrapped kernel will be the one launched with the configuration to be prod...
Definition: launch_config_builder.hpp:661
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:91
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77
Contains the class cuda::launch_configuration_t and some supporting code.
launch_config_builder_t & dynamic_shared_memory_size(memory::shared::size_t size)
Indicate that the intended launch should allocate a certain amount of shared memory for the kernel to...
Definition: launch_config_builder.hpp:621
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
launch_config_builder_t & saturate_with_active_blocks()
This will use information about the kernel, the already-set block size, and the device to create a un...
Definition: launch_config_builder.hpp:723
launch_config_builder_t & dynamic_shared_memory(kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
Indicate that the intended launch should allocate additional shared memory for the kernel to use beyo...
Definition: launch_config_builder.hpp:646
launch_config_builder_t & blocks_dont_cooperate()
Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will work entirely ...
Definition: launch_config_builder.hpp:600
launch_config_builder_t & overall_dimensions(grid::overall_dimensions_t dims)
Set the overall number of threads, in each dimension, of all blocks in the grid of the intended kerne...
Definition: launch_config_builder.hpp:556
launch_config_builder_t & grid_dimensions(grid::dimensions_t dims)
Set the dimension of the grid for the intended kernel launch, in terms of blocks. ...
Definition: launch_config_builder.hpp:514
launch_config_builder_t & block_size(size_t size)
Set the block in the intended kernel launch grid to be uni-dimensional with a specified size...
Definition: launch_config_builder.hpp:429
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44