#ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
#define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_

#include "launch_configuration.hpp"
#include "kernel_launch.hpp"
#include "kernel.hpp"
#include "device.hpp"
#include "types.hpp"

void validate_shared_mem_size_compatibility(
	const kernel_t& kernel,
	memory::shared::size_t shared_mem_size) noexcept(false);
void validate_shared_mem_compatibility(
	const device_t& device,
	memory::shared::size_t shared_mem_size) noexcept(false);
void validate_grid_dimension_compatibility(
	const device_t& device, grid::block_dimensions_t block_dims) noexcept(false);
void validate_compatibility(
	const kernel_t& kernel, launch_configuration_t launch_config) noexcept(false);
inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
{
	auto quotient = static_cast<dimension_t>(dividend / divisor);
	return (divisor * quotient == dividend) ? quotient : quotient + 1;
}
inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
{
	return {
		div_rounding_up(overall_dims.x, block_dims.x),
		div_rounding_up(overall_dims.y, block_dims.y),
		div_rounding_up(overall_dims.z, block_dims.z)
	};
}
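// A quick worked example of the rounding-up division above (values are
// illustrative only): overall dimensions {1000, 1, 1} with block dimensions
// {256, 1, 1} yield a grid of {4, 1, 1} - three full blocks, plus one
// partially-idle block covering the remaining 232 threads.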
static void validate_all_dimensions_compatibility(
	grid::block_dimensions_t block,
	grid::dimensions_t grid,
	grid::overall_dimensions_t overall)
{
	if (grid * block != overall) {
		throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
	}
}
memory::shared::size_t get_dynamic_shared_memory_size(grid::block_dimensions_t block_dims) const
{
	return static_cast<memory::shared::size_t>((dynamic_shared_memory_size_determiner_ == nullptr) ?
		dynamic_shared_memory_size_ :
		dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
}
grid::composite_dimensions_t get_unvalidated_composite_dimensions() const noexcept(false)
{
	grid::composite_dimensions_t result;

	if (saturate_with_active_blocks_) {
#if CUDA_VERSION >= 10000
		if (use_min_params_for_max_occupancy_) {
			throw ::std::logic_error(
				"Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
				"the grid with fixed-size blocks.");
		}
#endif
		if (kernel_ == nullptr) {
			throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
		}
		if (not (dimensions_.block)) {
			throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
		}
		if (dimensions_.grid or dimensions_.overall) {
			throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
		}
		result.block = dimensions_.block.value();
		auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
		auto num_block_threads = static_cast<grid::block_dimension_t>(dimensions_.block.value().volume());
		auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
		auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
		result.grid = blocks_per_multiprocessor * num_multiprocessors;
		return result;
	}
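	// Worked example for the saturation branch above (illustrative figures): if
	// the occupancy calculation permits 8 resident blocks per multiprocessor and
	// the device has 40 multiprocessors, the grid is sized at 8 * 40 = 320
	// blocks - just enough to keep every multiprocessor fully occupied.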
#if CUDA_VERSION >= 10000
	if (use_min_params_for_max_occupancy_) {
		if (kernel_ == nullptr) {
			throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
		}
		if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
			throw ::std::logic_error("Conflicting specifications: Block, grid or overall dimensions specified, but requested to use the minimum grid parameters for maximum occupancy");
		}
		auto composite_dims = dynamic_shared_memory_size_determiner_ ?
			kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
			kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
		result.block = composite_dims.block;
		result.grid = composite_dims.grid;
		return result;
	}
#endif // CUDA_VERSION >= 10000
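	// Note: in this mode both the block and the grid dimensions come from the
	// kernel wrapper's occupancy queries (CUDA's occupancy calculator), rather
	// than from any dimensions specified on the builder itself.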
	if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
		result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
		result.block = dimensions_.block.value();
		return result;
	}
	if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
		result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
		result.grid = dimensions_.grid.value();
		return result;
	}
	if (dimensions_.grid and dimensions_.block) {
		if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
			throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
		}
		result.block = dimensions_.block.value();
		result.grid = dimensions_.grid.value();
		return result;
	}
	if (not dimensions_.block and not dimensions_.grid) {
		throw ::std::logic_error("Neither block nor grid dimensions have been specified");
	}
	else if (not dimensions_.block and not dimensions_.overall) {
		throw ::std::logic_error(
			"Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified "
			"in terms of blocks, not threads, with no block dimensions specified");
	}
	else {
		throw ::std::logic_error("Only block dimensions have been specified - cannot resolve launch grid dimensions");
	}
}
grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
{
	auto result = get_unvalidated_composite_dimensions();
#ifndef NDEBUG
	validate_composite_dimensions(result);
#endif
	return result;
}
launch_configuration_t build() const
{
	auto result = launch_configuration_t{ get_composite_dimensions() };
	result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
	result.block_cooperation = thread_block_cooperation;
	return result;
}
219 optional<grid::block_dimensions_t > block;
220 optional<grid::dimensions_t > block_cluster;
221 optional<grid::dimensions_t > grid;
222 optional<grid::overall_dimensions_t> overall;
225 bool thread_block_cooperation {
false };
const kernel_t* kernel_ { nullptr };
optional<device::id_t> device_;
bool saturate_with_active_blocks_ { false };
#if CUDA_VERSION >= 10000
bool use_min_params_for_max_occupancy_ { false };
#endif

static device_t device(optional<device::id_t> maybe_id)
{
	return device::get(maybe_id.value());
}

device_t device() const { return device(device_); }
#ifndef NDEBUG
void validate(launch_configuration_t config) const noexcept(false)
{
	detail_::validate(config);
	if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
	if (device_) { detail_::validate_compatibility(device(), config); }
}
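// The static validators below share a pattern: they are no-ops when the kernel
// pointer or device id is absent, so a partially-specified builder can still
// be checked against whatever has been set so far.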
static void validate_compatibility(
	const kernel_t* kernel_ptr,
	memory::shared::size_t shared_mem_size)
{
	if (kernel_ptr == nullptr) { return; }
	detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
}
static void validate_compatibility(
	optional<device::id_t> maybe_device_id,
	memory::shared::size_t shared_mem_size)
{
	if (not maybe_device_id) { return; }
	detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
}
void validate_dynamic_shared_memory_size(memory::shared::size_t size) const
{
	validate_compatibility(kernel_, size);
	validate_compatibility(device_, size);
}
static void validate_block_dimension_compatibility(
	const kernel_t* kernel_ptr,
	grid::block_dimensions_t block_dims)
{
	if (kernel_ptr == nullptr) { return; }
	detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
}
static void validate_block_dimension_compatibility(
	optional<device::id_t> maybe_device_id,
	grid::block_dimensions_t block_dims)
{
	if (not maybe_device_id) { return; }
	detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
}
void validate_block_dimensions(grid::block_dimensions_t block_dims) const
{
	detail_::validate_block_dimensions(block_dims);
	if (dimensions_.grid and dimensions_.overall) {
		detail_::validate_all_dimensions_compatibility(
			block_dims, dimensions_.grid.value(), dimensions_.overall.value());
	}
	validate_block_dimension_compatibility(kernel_, block_dims);
	validate_block_dimension_compatibility(device_, block_dims);
}
static void validate_grid_dimension_compatibility(
	optional<device::id_t> maybe_device_id,
	grid::block_dimensions_t block_dims)
{
	if (not maybe_device_id) { return; }
	detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
}
void validate_grid_dimensions(grid::dimensions_t grid_dims) const
{
	detail_::validate_grid_dimensions(grid_dims);
	if (dimensions_.block and dimensions_.overall) {
		detail_::validate_all_dimensions_compatibility(
			dimensions_.block.value(), grid_dims, dimensions_.overall.value());
	}
}
#if CUDA_VERSION >= 12000
void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
{
	if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
		throw ::std::runtime_error("The requested block cluster dimensions do not "
			"divide the grid dimensions (in blocks)");
	}
}
#endif // CUDA_VERSION >= 12000

void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
{
	if (dimensions_.block and dimensions_.grid) {
		if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
			throw ::std::invalid_argument(
				"specified overall dimensions conflict with the already-specified "
				"block and grid dimensions");
		}
	}
}
void validate_kernel(const kernel_t* kernel_ptr) const
{
	if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
		auto block_dims = dimensions_.block ?
			dimensions_.block.value() :
			get_composite_dimensions().block;
		validate_block_dimension_compatibility(kernel_ptr, block_dims);
	}
	validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
}
void validate_device(device::id_t device_id) const
{
	if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
		auto block_dims = dimensions_.block ?
			dimensions_.block.value() :
			get_composite_dimensions().block;
		validate_block_dimension_compatibility(device_id, block_dims);
	}
	detail_::validate_compatibility(
		device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
}
void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
{
	validate_block_dimension_compatibility(kernel_, composite_dims.block);
	validate_block_dimension_compatibility(device_, composite_dims.block);
	validate_grid_dimension_compatibility(device_, composite_dims.grid);
}
#endif // ifndef NDEBUG

launch_config_builder_t& dimensions(grid::composite_dimensions_t composite_dims)
{
#ifndef NDEBUG
	validate_composite_dimensions(composite_dims);
#endif
	dimensions_.overall = nullopt;
	dimensions_.grid = composite_dims.grid;
	dimensions_.block = composite_dims.block;
	return *this;
}
launch_config_builder_t& block_dimensions(grid::block_dimensions_t dims)
{
#ifndef NDEBUG
	validate_block_dimensions(dims);
#endif
	dimensions_.block = dims;
	if (dimensions_.grid) {
		dimensions_.overall = nullopt;
	}
	return *this;
}

launch_config_builder_t& block_dimensions(
	grid::block_dimension_t x, grid::block_dimension_t y = 1, grid::block_dimension_t z = 1)
{
	return block_dimensions(grid::block_dimensions_t{x, y, z});
}
launch_config_builder_t& block_size(size_t size)
{
	static constexpr const auto max_representable_block_dim =
		::std::numeric_limits<grid::block_dimension_t>::max();
	if (size > (size_t) max_representable_block_dim) {
		throw ::std::invalid_argument(
			"Specified (1-dimensional) block size " + ::std::to_string(size)
			+ " exceeds " + ::std::to_string(max_representable_block_dim)
			+ ", the maximum representable size of a block");
	}
	if (kernel_) {
		auto max_threads_per_block = kernel_->maximum_threads_per_block();
		if (size > max_threads_per_block) {
			throw ::std::invalid_argument(
				"Specified (1-dimensional) block size " + ::std::to_string(size)
				+ " exceeds " + ::std::to_string(max_threads_per_block)
				+ ", the maximum number of threads per block supported by "
				+ kernel::detail_::identify(*kernel_));
		}
	}
	if (device_) {
		auto max_threads_per_block = device().maximum_threads_per_block();
		if (size > max_threads_per_block) {
			throw ::std::invalid_argument(
				"Specified (1-dimensional) block size " + ::std::to_string(size)
				+ " exceeds " + ::std::to_string(max_threads_per_block)
				+ ", the maximum number of threads per block supported by "
				+ device::detail_::identify(device_.value()));
		}
	}
	return block_dimensions(static_cast<grid::block_dimension_t>(size), 1, 1);
}
launch_config_builder_t& use_maximum_linear_block()
{
	grid::block_dimension_t max_size;
	if (kernel_) {
		max_size = kernel_->maximum_threads_per_block();
	}
	else if (device_) {
		max_size = device().maximum_threads_per_block();
	}
	else {
		throw ::std::logic_error("Request to use the maximum-size linear block, with no device or kernel specified");
	}
	grid::block_dimensions_t block_dims { max_size, 1, 1 };

	if (dimensions_.grid and dimensions_.overall) {
		dimensions_.overall = nullopt;
	}
	dimensions_.block = block_dims;
	return *this;
}
#if CUDA_VERSION >= 12000
launch_config_builder_t& cluster_blocks(grid::dimensions_t cluster_dims)
{
#ifndef NDEBUG
	validate_cluster_dimensions(cluster_dims);
#endif
	dimensions_.block_cluster = cluster_dims;
	return *this;
}
#endif // CUDA_VERSION >= 12000
launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
{
#ifndef NDEBUG
	validate_grid_dimensions(dims);
#endif
	if (dimensions_.block) {
		dimensions_.overall = nullopt;
	}
	dimensions_.grid = dims;
	saturate_with_active_blocks_ = false;
	return *this;
}

launch_config_builder_t& grid_dimensions(
	grid::dimension_t x, grid::dimension_t y = 1, grid::dimension_t z = 1)
{
	return grid_dimensions(grid::dimensions_t{x, y, z});
}
533 if (size > static_cast<size_t>(::std::numeric_limits<int>::max())) {
534 throw ::std::invalid_argument(
"Specified (1-dimensional) grid size " + ::std::to_string(size)
535 +
"in blocks exceeds " + ::std::to_string(::std::numeric_limits<int>::max())
536 +
" , the maximum supported number of blocks");
539 return grid_dimensions(static_cast<grid::dimension_t>(size), 1, 1);
launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims)
{
#ifndef NDEBUG
	validate_overall_dimensions(dims);
#endif
	dimensions_.overall = dims;
	saturate_with_active_blocks_ = false;
	return *this;
}

launch_config_builder_t& overall_dimensions(
	grid::overall_dimension_t x, grid::overall_dimension_t y = 1, grid::overall_dimension_t z = 1)
{
	return overall_dimensions(grid::overall_dimensions_t{x, y, z});
}
launch_config_builder_t& overall_size(size_t size)
{
	static_assert(::std::is_same<grid::overall_dimension_t, size_t>::value, "Unexpected type difference");
	return overall_dimensions(size, 1, 1);
}
launch_config_builder_t& block_cooperation(bool cooperation)
{
	thread_block_cooperation = cooperation;
	return *this;
}
launch_config_builder_t& dynamic_shared_memory_size(
	kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
{
	dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
	return *this;
}
launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
{
#ifndef NDEBUG
	validate_dynamic_shared_memory_size(size);
#endif
	dynamic_shared_memory_size_ = size;
	dynamic_shared_memory_size_determiner_ = nullptr;
	return *this;
}
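// Note: a fixed size and a size-determiner function are mutually exclusive -
// setting one clears the other; the determiner, when set, is invoked with the
// block's total thread count when the configuration is finally built.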
launch_config_builder_t& dynamic_shared_memory(memory::shared::size_t size)
{
	return dynamic_shared_memory_size(size);
}
launch_config_builder_t& dynamic_shared_memory(kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
{
	return dynamic_shared_memory_size(shared_mem_size_determiner);
}
launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
{
	if (device_ and wrapped_kernel_ptr->device_id() != device_.value()) {
		throw ::std::invalid_argument("Launch config builder already associated with "
			+ device::detail_::identify(*device_) + " and cannot further be associated "
			"with " + kernel::detail_::identify(*wrapped_kernel_ptr));
	}
#ifndef NDEBUG
	validate_kernel(wrapped_kernel_ptr);
#endif
	kernel_ = wrapped_kernel_ptr;
	return *this;
}
launch_config_builder_t& device(device::id_t device_id)
{
	if (kernel_ and kernel_->device_id() != device_id) {
		throw ::std::invalid_argument("Launch config builder already associated with "
			+ kernel::detail_::identify(*kernel_) + " and cannot further be associated with "
			"another device: " + device::detail_::identify(device_id));
	}
	device_ = device_id;
	return *this;
}
689 return this->device(device.
id());
launch_config_builder_t& saturate_with_active_blocks()
{
	if (kernel_ == nullptr) {
		throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
	}
	if (not (dimensions_.block)) {
		throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
	}
	dimensions_.grid = nullopt;
	dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
	use_min_params_for_max_occupancy_ = false;
#endif
	saturate_with_active_blocks_ = true;
	return *this;
}
launch_config_builder_t& min_params_for_max_occupancy()
{
	if (kernel_ == nullptr) {
		throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
	}
	dimensions_.block = nullopt;
	dimensions_.grid = nullopt;
	dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
	use_min_params_for_max_occupancy_ = true;
#endif
	saturate_with_active_blocks_ = false;
	return *this;
}
inline launch_config_builder_t launch_config_builder() { return {}; }

#endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
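// A minimal usage sketch of the builder idiom (`my_kernel` - a wrapped kernel
// object - and `num_elements` are hypothetical names, not part of the library):
//
//   auto config = cuda::launch_config_builder()
//       .kernel(&my_kernel)
//       .overall_size(num_elements)
//       .block_size(256)
//       .build();
//
// The builder derives the grid size by dividing num_elements by 256, rounding
// up, and validates the block size against the kernel's and device's limits.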