10 #ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ 11 #define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ 24 inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
26 dimension_t quotient =
static_cast<dimension_t
>(dividend / divisor);
28 return (divisor * quotient == dividend) ? quotient : quotient + 1;
31 inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
34 div_rounding_up(overall_dims.x, block_dims.x),
35 div_rounding_up(overall_dims.y, block_dims.y),
36 div_rounding_up(overall_dims.z, block_dims.z)
51 static void validate_all_dimensions_compatibility(
52 grid::block_dimensions_t block,
53 grid::dimensions_t grid,
54 grid::overall_dimensions_t overall)
56 if (grid * block != overall) {
57 throw ::std::invalid_argument(
"specified block, grid and overall dimensions do not agree");
// [doxygen-extracted fragment] Tail of a ternary inside what appears to be
// get_dynamic_shared_memory_size(block_dims): the fixed size member is used
// unless a size-determiner callback was set, in which case the callback is
// invoked with the block's thread count (volume). The function signature and
// the ternary's condition are missing from this view — confirm upstream.
70 dynamic_shared_memory_size_ :
71 dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
// [doxygen-extracted fragment] Body of get_unvalidated_composite_dimensions():
// resolves block+grid dimensions from whichever combination of settings the
// builder holds. Several intermediate lines are missing from this view.
//
// Branch 1: "saturate with active blocks" mode — requires a kernel and known
// block dims, and conflicts with explicit grid/overall dims.
83 if (saturate_with_active_blocks_) {
// Mutually exclusive with min-params-for-max-occupancy mode (CUDA >= 10).
84 #if CUDA_VERSION >= 10000 85 if (use_min_params_for_max_occupancy_) {
86 throw ::std::logic_error(
87 "Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate " 88 "the grid with fixed-size blocks.");
// (Missing lines presumably check kernel_ == nullptr before this throw.)
92 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
94 if (not (dimensions_.block)) {
95 throw ::std::logic_error(
"The block dimensions must be known to determine how many of them one needs for saturating a device");
97 if (dimensions_.grid or dimensions_.overall) {
98 throw ::std::logic_error(
"Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
// Grid size = (max resident blocks per SM for this kernel) x (SM count).
101 result.block = dimensions_.block.value();
102 auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
104 auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
105 auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
106 result.grid = blocks_per_multiprocessor * num_multiprocessors;
// Branch 2: min-params-for-max-occupancy mode — kernel decides both block
// and grid dimensions; conflicts with any explicitly-set dimensions.
109 #if CUDA_VERSION >= 10000 110 if (use_min_params_for_max_occupancy_) {
112 throw ::std::logic_error(
// NOTE(review): message text is garbled ("parameter sfor m") — presumably
// "the minimum grid parameters for maximum occupancy"; verify upstream.
"A kernel must be set to determine the minimum grid parameter sfor m");
114 if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
115 throw ::std::logic_error(
// NOTE(review): this message looks copy-pasted from the saturate-with-active-
// blocks branch above; it should describe the min-params mode — confirm.
"Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
// Shared memory may be a fixed size or a per-block-size determiner function.
117 auto composite_dims = dynamic_shared_memory_size_determiner_ ?
118 kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
119 kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
120 result.block = composite_dims.block;
121 result.grid = composite_dims.grid;
// Branch 3: block + overall known — derive grid by rounded-up division.
125 if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
126 result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
127 result.block = dimensions_.block.value();
// Branch 4: grid + overall known — derive block dims the same way.
130 if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
131 result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
132 result.grid = dimensions_.grid.value();
// Branch 5: both grid and block explicit — check consistency with overall
// dims if those were also set.
136 if (dimensions_.grid and dimensions_.block) {
137 if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
138 throw ::std::invalid_argument(
"specified block, grid and overall dimensions do not agree");
140 result.block = dimensions_.block.value();
141 result.grid = dimensions_.grid.value();
// Fall-through: not enough information to resolve a full configuration.
145 if (not dimensions_.block and not dimensions_.grid) {
146 throw ::std::logic_error(
147 "Neither block nor grid dimensions have been specified");
148 }
else if (not dimensions_.block and not dimensions_.overall) {
149 throw ::std::logic_error(
150 "Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified " 151 "in terms of blocks, not threads, with no block dimensions specified");
153 throw ::std::logic_error(
154 "Only block dimensions have been specified - cannot resolve launch grid dimensions");
// [doxygen-extracted fragment] get_composite_dimensions(): computes, then
// validates, the composite (grid + block) dimensions.
161 auto result = get_unvalidated_composite_dimensions();
162 validate_composite_dimensions(result);
// [fragment] From the configuration-building code: propagates the
// thread-block-cooperation flag into the result configuration.
172 result.block_cooperation = thread_block_cooperation;
// --- Builder state: each dimension specification is optional until set. ---
180 optional<grid::block_dimensions_t > block;
181 optional<grid::dimensions_t > block_cluster;
182 optional<grid::dimensions_t > grid;
183 optional<grid::overall_dimensions_t> overall;
// Whether to request CUDA's "cooperative launch"; off by default.
186 bool thread_block_cooperation {
false };
// Callback computing dynamic shared memory size from block size; when null,
// the fixed dynamic_shared_memory_size_ value is used instead.
192 kernel::shared_memory_size_determiner_t dynamic_shared_memory_size_determiner_ {
nullptr };
// Non-owning pointer to the associated kernel, if any.
195 const kernel_t* kernel_ {
nullptr };
// Associated device id, if any.
196 optional<device::id_t> device_;
// Occupancy-driven sizing modes; both default to off.
197 bool saturate_with_active_blocks_ {
false };
198 #if CUDA_VERSION >= 10000 199 bool use_min_params_for_max_occupancy_ {
false };
202 static cuda::device_t device(optional<device::id_t> maybe_id)
204 return cuda::device::get(maybe_id.value());
207 cuda::device_t device()
const {
return device(device_.value()); }
// [doxygen-extracted fragment] Validation of a full launch_configuration_t:
// generic checks first, then kernel- and device-specific compatibility when
// a kernel/device has been associated with this builder.
212 detail_::validate(config);
213 if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
214 if (device_) { detail_::validate_compatibility(device(), config); }
// [fragment] Presumably part of a builder method absorbing an existing
// configuration's dimensions — signature missing from this view.
218 dimensions(config.dimensions);
// Shared-memory-size compatibility vs. a kernel; a null kernel pointer means
// "no kernel to check against", i.e. trivially valid.
223 static void validate_compatibility(
224 const kernel_t* kernel_ptr,
// (second parameter line missing from this view; used as shared_mem_size below)
227 if (kernel_ptr ==
nullptr) {
return; }
228 detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
// Shared-memory-size compatibility vs. a device; disengaged optional means
// "no device to check against".
231 static void validate_compatibility(
232 optional<device::id_t> maybe_device_id,
235 if (not maybe_device_id) {
return; }
236 detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
// [fragment] Instance-level shared-memory validation: check against both the
// associated kernel and the associated device (each is a no-op when unset).
241 validate_compatibility(kernel_, size);
242 validate_compatibility(device_, size);
// Block-dimension compatibility vs. a kernel (null pointer => no-op).
245 static void validate_block_dimension_compatibility(
246 const kernel_t* kernel_ptr,
249 if (kernel_ptr ==
nullptr) {
return; }
250 return detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
// Block-dimension compatibility vs. a device (empty optional => no-op).
253 static void validate_block_dimension_compatibility(
254 optional<device::id_t> maybe_device_id,
257 if (not maybe_device_id) {
return; }
258 detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
// [fragment] Instance-level block-dimension validation: intrinsic limits,
// consistency with already-set grid+overall dims, then kernel/device checks.
263 detail_::validate_block_dimensions(block_dims);
264 if (dimensions_.grid and dimensions_.overall) {
265 detail_::validate_all_dimensions_compatibility(
266 block_dims, dimensions_.grid.value(), dimensions_.overall.value());
269 validate_block_dimension_compatibility(kernel_, block_dims);
270 validate_block_dimension_compatibility(device_, block_dims);
// Grid-dimension compatibility vs. a device (empty optional => no-op).
// NOTE(review): the forwarded argument is named block_dims here — likely a
// naming carry-over; confirm the parameter name upstream.
274 static void validate_grid_dimension_compatibility(
275 optional<device::id_t> maybe_device_id,
278 if (not maybe_device_id) {
return; }
279 detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
// [fragment] Instance-level grid-dimension validation, mirroring the
// block-dimension variant above.
284 detail_::validate_grid_dimensions(grid_dims);
285 if (dimensions_.block and dimensions_.overall) {
286 detail_::validate_all_dimensions_compatibility(
287 dimensions_.block.value(), grid_dims, dimensions_.overall.value());
// [doxygen-extracted fragment] Cluster-dimension validation (CUDA >= 12 only).
// NOTE(review): the condition appears inverted relative to the error message —
// as written it throws when the cluster dims DO divide the grid dims; a lost
// `not` seems likely. Confirm against the upstream source before relying on it.
292 #if CUDA_VERSION >= 12000 295 if (dimensions_.grid and grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
296 throw ::std::runtime_error(
"The requested block cluster dimensions do not " 297 "divide the grid dimensions (in blocks)");
// [fragment] Overall-dimension validation: if both block and grid are already
// set, their product must equal the proposed overall dims.
300 #endif // CUDA_VERSION >= 12000 304 if (dimensions_.block and dimensions_.grid) {
305 if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
306 throw ::std::invalid_argument(
307 "specified overall dimensions conflict with the already-specified " 308 "block and grid dimensions");
// Validates a candidate kernel against the builder's current state: block
// dimensions (explicit or derivable) and the dynamic shared memory size.
313 void validate_kernel(
const kernel_t* kernel_ptr)
const 315 if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
316 auto block_dims = dimensions_.block ?
317 dimensions_.block.value() :
318 get_composite_dimensions().block;
319 validate_block_dimension_compatibility(kernel_ptr, block_dims);
321 validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
// [fragment] Analogous validation of a candidate device id, additionally
// covering cooperation mode and (when set) block-cluster dimensions.
326 if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
327 auto block_dims = dimensions_.block ?
328 dimensions_.block.value() :
329 get_composite_dimensions().block;
330 validate_block_dimension_compatibility(device_id, block_dims);
332 detail_::validate_compatibility(
333 device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
// [fragment] Composite-dimension validation: check the resolved block dims
// against kernel and device, and the grid dims against the device.
338 validate_block_dimension_compatibility(kernel_, composite_dims.block);
339 validate_block_dimension_compatibility(device_, composite_dims.block);
342 validate_grid_dimension_compatibility(device_, composite_dims.grid);
344 #endif // ifndef NDEBUG 350 validate_composite_dimensions(composite_dims);
// [doxygen-extracted fragment] Setter bodies from the builder's fluent API.
// Pattern throughout: setting one kind of dimension clears any now-redundant
// or conflicting specification (grid x block fully determines overall).
//
// Set both grid and block from composite dims; overall becomes redundant.
352 dimensions_.overall = nullopt;
353 dimensions_.grid = composite_dims.grid;
354 dimensions_.block = composite_dims.block;
// Set block dimensions; drop overall if grid is also set (over-determined).
361 validate_block_dimensions(dims);
363 dimensions_.block = dims;
364 if (dimensions_.grid) {
365 dimensions_.overall = nullopt;
// [fragment] Maximum-size linear block: prefer the kernel's limit, fall back
// to the device's; with neither associated, the request cannot be satisfied.
385 max_size = kernel_->maximum_threads_per_block();
388 max_size = device().maximum_threads_per_block();
391 throw ::std::logic_error(
"Request to use the maximum-size linear block, with no device or kernel specified");
395 if (dimensions_.grid and dimensions_.overall) {
396 dimensions_.overall = nullopt;
398 dimensions_.block = block_dims;
// Set block-cluster dimensions (CUDA >= 12 only), after validation.
402 #if CUDA_VERSION >= 12000 406 validate_cluster_dimensions(cluster_dims);
408 dimensions_.block_cluster = cluster_dims;
// Set grid dimensions; clears overall when block is set, and cancels
// saturate-with-active-blocks mode (grid is now explicit).
416 validate_grid_dimensions(dims);
418 if (dimensions_.block) {
419 dimensions_.overall = nullopt;
421 dimensions_.grid = dims;
422 saturate_with_active_blocks_ =
false;
// Set overall dimensions; likewise cancels saturation mode.
440 validate_overall_dimensions(dims);
442 dimensions_.overall = dims;
443 saturate_with_active_blocks_ =
false;
// Enable/disable thread-block cooperation.
458 thread_block_cooperation = cooperation;
// Set a dynamic-shared-memory size determiner callback...
466 kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
468 dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
// ...or a fixed size, which nulls out any previously-set determiner.
480 validate_dynamic_shared_memory_size(size);
482 dynamic_shared_memory_size_ = size;
483 dynamic_shared_memory_size_determiner_ =
nullptr;
// Convenience aliases forwarding to the two setters above.
489 return dynamic_shared_memory_size(size);
493 kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
495 return dynamic_shared_memory_size(shared_mem_size_determiner);
// [doxygen-extracted fragment] Associating a kernel with the builder: the
// kernel's device must not contradict an already-associated device.
// NOTE(review): the condition reads kernel_->device_id() but the error text
// identifies *wrapped_kernel_ptr — presumably this guard actually checks the
// incoming kernel; some lines are missing from this view, confirm upstream.
500 if (device_ and kernel_->device_id() != device_) {
501 throw ::std::invalid_argument(
"Launch config builder already associated with " 502 + device::detail_::identify(*device_) +
" and cannot further be associated " 503 "with " +kernel::detail_::identify(*wrapped_kernel_ptr));
506 validate_kernel(wrapped_kernel_ptr);
508 kernel_ = wrapped_kernel_ptr;
// Associating a device: must agree with any already-associated kernel.
514 if (kernel_ and kernel_->device_id() != device_id) {
515 throw ::std::invalid_argument(
"Launch config builder already associated with " 516 + kernel::detail_::identify(*kernel_) +
" and cannot further be associated " 517 "another device: " + device::detail_::identify(device_id));
// Convenience overload taking a device wrapper rather than an id.
525 return this->device(device.id());
// saturate_with_active_blocks(): requires a kernel and known block dims;
// clears grid/overall and the competing min-params occupancy mode.
549 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
551 if (not (dimensions_.block)) {
552 throw ::std::logic_error(
"The block dimensions must be known to determine how many of them one needs for saturating a device");
554 dimensions_.grid = nullopt;
555 dimensions_.overall = nullopt;
556 #if CUDA_VERSION >= 10000 557 use_min_params_for_max_occupancy_ =
false;
559 saturate_with_active_blocks_ =
true;
// min_params_for_max_occupancy(): requires a kernel; clears all explicit
// dimensions and the competing saturation mode.
566 throw ::std::logic_error(
"A kernel must be set to determine how many blocks are required to saturate the device");
568 dimensions_.block = nullopt;
569 dimensions_.grid = nullopt;
570 dimensions_.overall = nullopt;
571 #if CUDA_VERSION >= 10000 572 use_min_params_for_max_occupancy_ =
true;
574 saturate_with_active_blocks_ =
false;
583 #endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_ A proxy class for CUDA devices, providing access to all Runtime API calls involving their use and man...
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:319
All definitions and functionality wrapping the CUDA Runtime API.
Definition: array.hpp:22
Definition: launch_configuration.hpp:58
bool block_cooperation
When true, CUDA's "cooperative launch" mechanism will be used, enabling more flexible device-wide syn...
Definition: launch_configuration.hpp:74
Definition: kernel_launch.hpp:238
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:332
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:752
Definition: kernel_launch.hpp:77
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:347
Definition: launch_config_builder.hpp:65
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:649
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:419
size_t overall_dimension_t
Dimension of a grid in threads along one axis, i.e.
Definition: types.hpp:452
Dimensions of a grid in threads, i.e.
Definition: types.hpp:458
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:65
Contains the class launch_configuration_t, an enhanced child class of the CUlaunchConfig struct of CU...
Contains a base wrapper class for CUDA kernels - both statically and dynamically compiled; and some r...
launch_config_builder_t & saturate_with_active_blocks()
This will use information about the kernel, the already-set block size, and the device to create a un...
Definition: launch_config_builder.hpp:546
Fundamental CUDA-related type definitions.