11 #ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ 12 #define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ 32 void validate_shared_mem_size_compatibility(
const kernel_t& kernel_ptr,
memory::shared::size_t shared_mem_size) noexcept(
false);
33 void validate_shared_mem_compatibility(
const device_t &device,
memory::shared::size_t shared_mem_size) noexcept(
false);
34 void validate_grid_dimension_compatibility(
const device_t &device, grid::block_dimensions_t block_dims) noexcept(
false);
35 void validate_compatibility(
const kernel_t& kernel, launch_configuration_t launch_config) noexcept(
false);
45 inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
49 return (divisor * quotient == dividend) ? quotient : quotient + 1;
52 inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
55 div_rounding_up(overall_dims.x, block_dims.x),
56 div_rounding_up(overall_dims.y, block_dims.y),
57 div_rounding_up(overall_dims.z, block_dims.z)
// Verify that separately-specified block, grid and overall (thread) dimensions
// agree with each other, i.e. that grid-dims-in-blocks * block-dims-in-threads
// equals the overall dimensions in threads, in every axis.
//
// @throws ::std::invalid_argument if the three dimension specifications conflict.
//
// NOTE(review): relies on an operator* for (dimensions_t, block_dimensions_t)
// and an operator!= against overall_dimensions_t declared elsewhere in the
// library - not visible in this chunk.
static void validate_all_dimensions_compatibility(
	grid::block_dimensions_t block,
	grid::dimensions_t grid,
	grid::overall_dimensions_t overall)
{
	if (grid * block != overall) {
		throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
	}
}
107 dynamic_shared_memory_size_ :
108 dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.
volume())));
116 if (saturate_with_active_blocks_) {
117 #if CUDA_VERSION >= 10000 118 if (use_min_params_for_max_occupancy_) {
119 throw ::std::logic_error(
120 "Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate " 121 "the grid with fixed-size blocks.");
125 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
127 if (not (dimensions_.block)) {
128 throw ::std::logic_error(
"The block dimensions must be known to determine how many of them one needs for saturating a device");
130 if (dimensions_.grid or dimensions_.overall) {
131 throw ::std::logic_error(
"Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
134 result.block = dimensions_.block.value();
135 auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
137 auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
138 auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
139 result.grid = blocks_per_multiprocessor * num_multiprocessors;
142 #if CUDA_VERSION >= 10000 143 if (use_min_params_for_max_occupancy_) {
145 throw ::std::logic_error(
"A kernel must be set to determine the minimum grid parameter sfor m");
147 if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
148 throw ::std::logic_error(
"Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
150 auto composite_dims = dynamic_shared_memory_size_determiner_ ?
151 kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
152 kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
153 result.block = composite_dims.block;
154 result.grid = composite_dims.grid;
158 if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
159 result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
160 result.block = dimensions_.block.value();
163 if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
164 result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
165 result.grid = dimensions_.grid.value();
169 if (dimensions_.grid and dimensions_.block) {
170 if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
171 throw ::std::invalid_argument(
"specified block, grid and overall dimensions do not agree");
173 result.block = dimensions_.block.value();
174 result.grid = dimensions_.grid.value();
178 if (not dimensions_.block and not dimensions_.grid) {
179 throw ::std::logic_error(
180 "Neither block nor grid dimensions have been specified");
181 }
else if (not dimensions_.block and not dimensions_.overall) {
182 throw ::std::logic_error(
183 "Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified " 184 "in terms of blocks, not threads, with no block dimensions specified");
186 throw ::std::logic_error(
187 "Only block dimensions have been specified - cannot resolve launch grid dimensions");
195 auto result = get_unvalidated_composite_dimensions();
197 validate_composite_dimensions(result);
211 result.block_cooperation = thread_block_cooperation;
213 detail_::validate(result);
216 detail_::validate_compatibility(device, result);
219 detail_::validate_compatibility(*kernel_, result);
227 optional<grid::block_dimensions_t > block;
228 optional<grid::dimensions_t > block_cluster;
229 optional<grid::dimensions_t > grid;
230 optional<grid::overall_dimensions_t> overall;
233 bool thread_block_cooperation {
false };
242 const kernel_t* kernel_ {
nullptr };
243 optional<device::id_t> device_id_;
244 bool saturate_with_active_blocks_ {
false };
245 #if CUDA_VERSION >= 10000 246 bool use_min_params_for_max_occupancy_ {
false };
// Obtain a proxy object for the device this builder has been associated with
// (via a static device(id) overload declared elsewhere in the class).
//
// Precondition: a device must already have been set, i.e. device_id_ holds a
// value; .value() on an empty optional is expected to fail/throw - TODO confirm
// which optional implementation is in use here.
cuda::device_t device() const { return device(device_id_.value()); }
259 detail_::validate(config);
260 if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
261 if (device_id_) { detail_::validate_compatibility(device(), config); }
270 static void validate_compatibility(
274 if (kernel_ptr ==
nullptr) {
return; }
275 detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
278 static void validate_compatibility(
279 optional<device::id_t> maybe_device_id,
282 if (not maybe_device_id) {
return; }
283 detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
288 validate_compatibility(kernel_, size);
289 validate_compatibility(device_id_, size);
292 static void validate_block_dimension_compatibility(
296 if (kernel_ptr ==
nullptr) {
return; }
297 return detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
300 static void validate_block_dimension_compatibility(
301 optional<device::id_t> maybe_device_id,
304 if (not maybe_device_id) {
return; }
305 detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
310 detail_::validate_block_dimensions(block_dims);
311 if (dimensions_.grid and dimensions_.overall) {
312 detail_::validate_all_dimensions_compatibility(
313 block_dims, dimensions_.grid.value(), dimensions_.overall.value());
316 validate_block_dimension_compatibility(kernel_, block_dims);
317 validate_block_dimension_compatibility(device_id_, block_dims);
321 static void validate_grid_dimension_compatibility(
322 optional<device::id_t> maybe_device_id,
325 if (not maybe_device_id) {
return; }
326 detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
331 detail_::validate_grid_dimensions(grid_dims);
332 if (dimensions_.block and dimensions_.overall) {
333 detail_::validate_all_dimensions_compatibility(
334 dimensions_.block.value(), grid_dims, dimensions_.overall.value());
339 #if CUDA_VERSION >= 12000 343 throw ::std::runtime_error(
"The requested block cluster dimensions do not " 344 "divide the grid dimensions (in blocks)");
347 #endif // CUDA_VERSION >= 12000 351 if (dimensions_.block and dimensions_.grid) {
352 if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
353 throw ::std::invalid_argument(
354 "specified overall dimensions conflict with the already-specified " 355 "block and grid dimensions");
360 void validate_kernel(
const kernel_t* kernel_ptr)
const 362 if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
363 auto block_dims = dimensions_.block ?
364 dimensions_.block.value() :
365 get_composite_dimensions().block;
366 validate_block_dimension_compatibility(kernel_ptr, block_dims);
368 validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
373 if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
374 auto block_dims = dimensions_.block ?
375 dimensions_.block.value() :
376 get_composite_dimensions().block;
377 validate_block_dimension_compatibility(device_id, block_dims);
379 detail_::validate_compatibility(
380 device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
385 validate_block_dimension_compatibility(kernel_, composite_dims.block);
386 validate_block_dimension_compatibility(device_id_, composite_dims.block);
389 validate_grid_dimension_compatibility(device_id_, composite_dims.grid);
391 #endif // ifndef NDEBUG 397 validate_composite_dimensions(composite_dims);
399 dimensions_.overall = nullopt;
400 dimensions_.grid = composite_dims.grid;
401 dimensions_.block = composite_dims.block;
408 validate_block_dimensions(dims);
410 dimensions_.block = dims;
411 if (dimensions_.grid) {
412 dimensions_.overall = nullopt;
431 static constexpr
const auto max_representable_block_dim = ::std:: numeric_limits<grid::block_dimension_t> ::max();
432 if (size > (
size_t) max_representable_block_dim) {
433 throw ::std::invalid_argument(
"Specified (1-dimensional) block size " + ::std::to_string(size)
434 +
" exceeds " + ::std::to_string(max_representable_block_dim)
435 +
" , the maximum representable size of a block");
442 auto max_threads_per_block = kernel_->maximum_threads_per_block();
443 if (size > max_threads_per_block) {
444 throw ::std::invalid_argument(
"Specified (1-dimensional) block size " + ::std::to_string(size)
445 +
" exceeds " + ::std::to_string(max_threads_per_block)
446 +
" , the maximum number of threads per block supported by " 447 + kernel::detail_::identify(*kernel_));
451 auto max_threads_per_block = device().maximum_threads_per_block();
452 if (size > max_threads_per_block) {
453 throw ::std::invalid_argument(
"Specified (1-dimensional) block size " + ::std::to_string(size)
454 +
" exceeds " + ::std::to_string(max_threads_per_block)
455 +
" , the maximum number of threads per block supported by " 456 + device::detail_::identify(device_id_.value()));
459 return block_dimensions(static_cast<grid::block_dimension_t>(size), 1, 1);
473 max_size = kernel_->maximum_threads_per_block();
475 else if (device_id_) {
476 max_size = device().maximum_threads_per_block();
479 throw ::std::logic_error(
"Request to use the maximum-size linear block, with no device or kernel specified");
483 if (dimensions_.grid and dimensions_.overall) {
484 dimensions_.overall = nullopt;
486 dimensions_.block = block_dims;
490 #if CUDA_VERSION >= 12000 504 validate_cluster_dimensions(cluster_dims);
506 dimensions_.block_cluster = cluster_dims;
517 validate_grid_dimensions(dims);
519 if (dimensions_.block) {
520 dimensions_.overall = nullopt;
522 dimensions_.grid = dims;
523 saturate_with_active_blocks_ =
false;
541 if (size > static_cast<size_t>(::std::numeric_limits<int>::max())) {
542 throw ::std::invalid_argument(
"Specified (1-dimensional) grid size " + ::std::to_string(size)
543 +
"in blocks exceeds " + ::std::to_string(::std::numeric_limits<int>::max())
544 +
" , the maximum supported number of blocks");
547 return grid_dimensions(static_cast<grid::dimension_t>(size), 1, 1);
559 validate_overall_dimensions(dims);
561 dimensions_.overall = dims;
562 saturate_with_active_blocks_ =
false;
578 static_assert(::std::is_same<grid::overall_dimension_t, size_t>::value,
"Unexpected type difference");
579 return overall_dimensions(size, 1, 1);
590 thread_block_cooperation = cooperation;
605 dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
624 validate_dynamic_shared_memory_size(size);
626 dynamic_shared_memory_size_ = size;
627 dynamic_shared_memory_size_determiner_ =
nullptr;
633 return dynamic_shared_memory_size(size);
649 return dynamic_shared_memory_size(shared_mem_size_determiner);
663 if (device_id_ and kernel_->device_id() != device_id_.value()) {
664 throw ::std::invalid_argument(
"Launch config builder already associated with " 665 + device::detail_::identify(*device_id_) +
" and cannot further be associated " 666 "with " +kernel::detail_::identify(*wrapped_kernel_ptr));
669 validate_kernel(wrapped_kernel_ptr);
671 kernel_ = wrapped_kernel_ptr;
686 if (kernel_ and kernel_->device_id() != device_id) {
687 throw ::std::invalid_argument(
"Launch config builder already associated with " 688 + kernel::detail_::identify(*kernel_) +
" and cannot further be associated " 689 "another device: " + device::detail_::identify(device_id));
691 device_id_ = device_id;
697 return this->device(device.
id());
726 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
728 if (not (dimensions_.block)) {
729 throw ::std::logic_error(
"The block dimensions must be known to determine how many of them one needs for saturating a device");
731 dimensions_.grid = nullopt;
732 dimensions_.overall = nullopt;
733 #if CUDA_VERSION >= 10000 734 use_min_params_for_max_occupancy_ =
false;
736 saturate_with_active_blocks_ =
true;
751 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
753 dimensions_.block = nullopt;
754 dimensions_.grid = nullopt;
755 dimensions_.overall = nullopt;
756 #if CUDA_VERSION >= 10000 757 use_min_params_for_max_occupancy_ =
true;
759 saturate_with_active_blocks_ =
false;
769 #endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ launch_config_builder_t & blocks_may_cooperate()
Let kernel thread blocks synchronize with each other, or are guaranteed to act independently (atomic ...
Definition: launch_config_builder.hpp:596
launch_config_builder_t & min_params_for_max_occupancy()
This will use information about the kernel and the device to define a minimum launch grid which shoul...
Definition: launch_config_builder.hpp:748
A proxy class for CUDA devices, providing access to all Runtime API calls involving their use and man...
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
launch_config_builder_t & block_dimensions(grid::block_dimension_t x, grid::block_dimension_t y=1, grid::block_dimension_t z=1)
Set the dimensions for each block in the intended kernel launch grid.
Definition: launch_config_builder.hpp:419
launch_config_builder_t & no_dynamic_shared_memory()
Indicate that the intended launch should not allocate any shared memory for the kernel to use beyond ...
Definition: launch_config_builder.hpp:612
launch_config_builder_t & block_cooperation(bool cooperation)
Set whether or not blocks may synchronize with each other.
Definition: launch_config_builder.hpp:588
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
launch_config_builder_t launch_config_builder()
A slightly shorter-named construction idiom for launch_config_builder_t.
Definition: launch_config_builder.hpp:765
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
bool block_cooperation
When true, CUDA's "cooperative launch" mechanism will be used, enabling more flexible device-wide syn...
Definition: launch_configuration.hpp:86
launch_config_builder_t & grid_size(size_t size)
Set the grid for the intended launch to be one-dimensional, with a specified number of blocks...
Definition: launch_config_builder.hpp:539
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
launch_config_builder_t & kernel_independent()
Clear the association with a specific kernel (which may have been set using the kernel method) ...
Definition: launch_config_builder.hpp:704
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
constexpr __host__ __device__ size_t volume() const noexcept
The number of total elements in a 3D object with these dimensions.
Definition: types.hpp:339
static constexpr bool divides(dimensions_t lhs, dimensions_t rhs)
Definition: types.hpp:364
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
A convenience class for gradually constructing a launch_configuration_t instance, as per the "builder...
Definition: launch_config_builder.hpp:99
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
launch_configuration_t build() const
Use the information specified to the builder (and defaults for the unspecified information) to finali...
Definition: launch_config_builder.hpp:207
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id.
Definition: device.hpp:832
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
launch_config_builder_t & overall_size(size_t size)
Set the intended launch grid to be linear, with a specified overall number of threads over all (1D) b...
Definition: launch_config_builder.hpp:576
launch_config_builder_t & use_maximum_linear_block()
Set the intended kernel launch grid to have 1D blocks, of the maximum length possible given the infor...
Definition: launch_config_builder.hpp:469
size_t overall_dimension_t
Dimension of a grid in threads along one axis, i.e.
Definition: types.hpp:423
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Dimensions of a grid in threads, i.e.
Definition: types.hpp:429
launch_config_builder_t & kernel(const kernel_t *wrapped_kernel_ptr)
Indicate that the specified wrapped kernel will be the one launched with the configuration to be prod...
Definition: launch_config_builder.hpp:661
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:91
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77
Contains the class cuda::launch_configuration_t and some supporting code.
launch_config_builder_t & dynamic_shared_memory_size(memory::shared::size_t size)
Indicate that the intended launch should allocate a certain amount of shared memory for the kernel to...
Definition: launch_config_builder.hpp:621
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
launch_config_builder_t & saturate_with_active_blocks()
This will use information about the kernel, the already-set block size, and the device to create a un...
Definition: launch_config_builder.hpp:723
launch_config_builder_t & dynamic_shared_memory(kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
Indicate that the intended launch should allocate additional shared memory for the kernel to use beyo...
Definition: launch_config_builder.hpp:646
launch_config_builder_t & blocks_dont_cooperate()
Prevent kernel thread blocks from synchronizing with each other, guaranteeing each block will work entirely ...
Definition: launch_config_builder.hpp:600
launch_config_builder_t & overall_dimensions(grid::overall_dimensions_t dims)
Set the overall number of threads, in each dimension, of all blocks in the grid of the intended kerne...
Definition: launch_config_builder.hpp:556
launch_config_builder_t & grid_dimensions(grid::dimensions_t dims)
Set the dimension of the grid for the intended kernel launch, in terms of blocks. ...
Definition: launch_config_builder.hpp:514
launch_config_builder_t & block_size(size_t size)
Set the block in the intended kernel launch grid to be uni-dimensional with a specified size...
Definition: launch_config_builder.hpp:429
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44