11 #ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_ 12 #define CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_ 17 #include <type_traits> 30 inline void validate_block_dimensions(grid::block_dimensions_t block_dims)
32 if (block_dims.volume() == 0) {
33 throw ::std::invalid_argument(
"Zero-volume grid-of-blocks dimensions provided");
37 inline void validate_grid_dimensions(grid::dimensions_t grid_dims)
39 if (grid_dims.volume() == 0) {
40 throw ::std::invalid_argument(
"Zero-volume block dimensions provided");
// Declaration only; the definition is not in this part of the file.
// NOTE(review): judging by the name, this checks that blocks of the given dimensions
// can be used with the given device - confirm against the definition, including
// whether it reports failure by throwing.
45 void validate_block_dimension_compatibility(
const device_t &device, grid::block_dimensions_t block_dims);
// Overload taking a kernel rather than a device (declaration only).
// NOTE(review): presumably the same kind of check, against the kernel's constraints - confirm.
46 void validate_block_dimension_compatibility(
const kernel_t &kernel, grid::block_dimensions_t block_dims);
// Cluster scheduling policies; only defined when CUDA_VERSION >= 12000.
// Each enumerator takes the value of the CUDA driver constant appearing in its initializer.
53 #if CUDA_VERSION >= 12000 54 enum class cluster_scheduling_policy_t {
// Maps to the driver's "default" policy constant
55 default_ = CU_CLUSTER_SCHEDULING_POLICY_DEFAULT,
// Maps to the driver's "spread" policy constant
// NOTE(review): see the CUDA driver documentation for the exact semantics
56 spread = CU_CLUSTER_SCHEDULING_POLICY_SPREAD,
// Maps to the driver's "load balancing" policy constant
// NOTE(review): see the CUDA driver documentation for the exact semantics
57 load_balance = CU_CLUSTER_SCHEDULING_POLICY_LOAD_BALANCING
// When true, CUDA's "cooperative launch" mechanism will be used for the launch.
// Initialized to false, i.e. the mechanism is not used unless requested.
86 bool block_cooperation {
false };
// This member is only available when CUDA_VERSION >= 12000; it is initialized to true.
// NOTE(review): has_nondefault_attributes() ORs this flag into its result, so with a
// `true` initializer, a configuration which never touches this flag would still be
// reported as having non-default attributes - confirm that this default is intended.
88 #if CUDA_VERSION >= 12000 98 bool programmatically_dependent_launch {
true };
// Settings grouped under the `programmatic_completion` member, which is declared
// together with this struct type (see the closing line of the struct).
// NOTE(review): other code in this file reads `programmatic_completion.event`, so an
// `event` member is presumably declared in this struct as well, on a line not visible
// in this part of the file - confirm.
114 struct programmatic_completion_t {
// Initialized to true.
// NOTE(review): presumably selects triggering the event when blocks start rather than
// when they complete - confirm against the code which marshals this setting.
117 bool trigger_event_at_block_start {
true };
// With C++20 or later, the compiler generates the member-wise equality comparison
118 #if __cplusplus >= 202002L 119 constexpr
bool operator==(
const programmatic_completion_t&)
const noexcept =
default;
121 } programmatic_completion;
// Initialized to false.
// NOTE(review): presumably, when true, the launch is placed in the "remote" memory
// synchronization domain rather than the default one - confirm against the CUDA
// documentation and the code which marshals this setting.
129 bool in_remote_memory_synchronization_domain {
false };
// Cluster-related launch settings.
// NOTE(review): other code in this file reads `clustering.cluster_dimensions`, so a
// `cluster_dimensions` member is presumably declared in this struct too, on a line not
// visible in this part of the file - confirm.
135 struct clustering_t {
// The policy for scheduling clusters; initialized to the `default_` enumerator
137 cluster_scheduling_policy_t scheduling_policy { cluster_scheduling_policy_t::default_ };
// With C++20 or later, the compiler generates the member-wise equality comparison
138 #if __cplusplus >= 202002L 139 constexpr
bool operator==(
const clustering_t &)
const noexcept =
default;
142 #endif // CUDA_VERSION >= 12000 144 #if __cplusplus >= 202002L 158 if (block_cooperation) {
return true; }
159 #if CUDA_VERSION >= 12000 160 return programmatically_dependent_launch or programmatic_completion.event
196 dimensions{grid_and_block_dimensions},
197 dynamic_shared_memory_size{dynamic_shared_mem}
200 constexpr launch_configuration_t(
204 ) : launch_configuration_t( {grid_dims, block_dims}, dynamic_shared_mem) { }
207 constexpr launch_configuration_t(
211 ) : launch_configuration_t(
// Compiler-generated copy assignment.
// CPP14_CONSTEXPR is a macro defined outside this part of the file.
// NOTE(review): presumably it expands to `constexpr` when compiling as C++14 or later - confirm.
218 CPP14_CONSTEXPR launch_configuration_t& operator=(
const launch_configuration_t& other) =
default;
// Compiler-generated move assignment
219 CPP14_CONSTEXPR launch_configuration_t& operator=(launch_configuration_t&&) =
default;
222 #if __cplusplus < 202002L 230 #if CUDA_VERSION >= 12000 231 and lhs.programmatically_dependent_launch == rhs.programmatically_dependent_launch
232 and lhs.programmatic_completion.event == rhs.programmatic_completion.event
233 and lhs.programmatic_completion.trigger_event_at_block_start == rhs.programmatic_completion.trigger_event_at_block_start
234 and lhs.in_remote_memory_synchronization_domain == rhs.in_remote_memory_synchronization_domain
235 and lhs.clustering.cluster_dimensions == rhs.clustering.cluster_dimensions
236 and lhs.clustering.scheduling_policy == rhs.clustering.scheduling_policy
237 #endif // CUDA_VERSION >= 12000 243 return not (lhs == rhs);
254 validate_block_dimensions(launch_config.
dimensions.block);
255 validate_grid_dimensions(launch_config.
dimensions.grid);
258 inline void validate_compatibility(
262 validate(launch_config);
263 validate_block_dimension_compatibility(device, launch_config.
dimensions.block);
268 void validate_compatibility(
// The type used for indices (and counts) of kernel launch attributes
272 using launch_attribute_index_t =
unsigned int;
// NOTE(review): presumably an upper bound on the number of attributes a single launch
// may carry, usable for sizing the attribute storage passed to marshal() - confirm that
// the value 7 matches the number of attributes which can actually be marshalled.
275 constexpr launch_attribute_index_t maximum_possible_kernel_launch_attributes = 7;
277 #if CUDA_VERSION >= 12000 279 CUlaunchConfig marshal(
282 span<CUlaunchAttribute> attribute_storage) noexcept(
true);
283 #endif // CUDA_VERSION >= 12000 289 #endif // CUDA_API_WRAPPERS_LAUNCH_CONFIGURATION_CUH_ Alias for the default behavior; see heuristic .
Definition: types.hpp:901
bool has_nondefault_attributes() const
Determine whether the configuration includes launch attributes different from the default values...
Definition: launch_configuration.hpp:156
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
bool block_cooperation
When true, CUDA's "cooperative launch" mechanism will be used, enabling more flexible device-wide synchronization...
Definition: launch_configuration.hpp:86
Wrapper class for a CUDA event.
Definition: event.hpp:133
grid::composite_dimensions_t dimensions
Dimensions of the launch grid in blocks, and of the individual blocks in the grid.
Definition: launch_configuration.hpp:71
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:325
unsigned size_t
Each physical core ("Streaming Multiprocessor") on an NVIDIA GPU has a space of shared memory (see the...
Definition: types.hpp:730
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
memory::shared::size_t dynamic_shared_memory_size
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: launch_configuration.hpp:77
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled a priori...
Definition: kernel.hpp:159
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
static constexpr __host__ __device__ dimensions_t point() noexcept
Dimensions of a single point - trivial in all axes.
Definition: types.hpp:363