#ifndef CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
#define CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_

#include "launch_configuration.hpp"
#include "kernel_launch.hpp"
#include "kernel.hpp"
#include "device.hpp"
#include "types.hpp"

void validate_shared_mem_size_compatibility(
	const kernel_t& kernel,
	memory::shared::size_t shared_mem_size) noexcept(false);
void validate_shared_mem_compatibility(
	const device_t& device,
	memory::shared::size_t shared_mem_size) noexcept(false);
void validate_grid_dimension_compatibility(
	const device_t& device, grid::block_dimensions_t block_dims) noexcept(false);
void validate_compatibility(
	const kernel_t& kernel, launch_configuration_t launch_config) noexcept(false);
inline dimension_t div_rounding_up(overall_dimension_t dividend, block_dimension_t divisor)
{
	auto quotient = static_cast<dimension_t>(dividend / divisor);
	return (divisor * quotient == dividend) ? quotient : quotient + 1;
}
inline dimensions_t div_rounding_up(overall_dimensions_t overall_dims, block_dimensions_t block_dims)
{
	return {
		div_rounding_up(overall_dims.x, block_dims.x),
		div_rounding_up(overall_dims.y, block_dims.y),
		div_rounding_up(overall_dims.z, block_dims.z)
	};
}
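// A quick worked example of the rounding-up division above (values are
// illustrative only): overall dimensions {1000, 1, 1} with block dimensions
// {256, 1, 1} yield a grid of {4, 1, 1} - three full blocks, plus one
// partially-idle block covering the remaining 232 threads.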
static void validate_all_dimensions_compatibility(
	grid::block_dimensions_t block,
	grid::dimensions_t grid,
	grid::overall_dimensions_t overall)
{
	if (grid * block != overall) {
		throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
	}
}
memory::shared::size_t get_dynamic_shared_memory_size(grid::block_dimensions_t block_dims) const
{
	return static_cast<memory::shared::size_t>((dynamic_shared_memory_size_determiner_ == nullptr) ?
		dynamic_shared_memory_size_ :
		dynamic_shared_memory_size_determiner_(static_cast<int>(block_dims.volume())));
}
grid::composite_dimensions_t get_unvalidated_composite_dimensions() const noexcept(false)
{
	grid::composite_dimensions_t result;

	if (saturate_with_active_blocks_) {
#if CUDA_VERSION >= 10000
		if (use_min_params_for_max_occupancy_) {
			throw ::std::logic_error(
				"Cannot both use the minimum grid parameters for achieving maximum occupancy, _and_ saturate "
				"the grid with fixed-size blocks.");
		}
#endif
		if (kernel_ == nullptr) {
			throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
		}
		if (not (dimensions_.block)) {
			throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
		}
		if (dimensions_.grid or dimensions_.overall) {
			throw ::std::logic_error("Conflicting specifications: Grid or overall dimensions specified, but requested to saturate kernels with active blocks");
		}
		result.block = dimensions_.block.value();
		auto dshmem_size = get_dynamic_shared_memory_size(dimensions_.block.value());
		auto num_block_threads = static_cast<grid::block_dimension_t>(dimensions_.block.value().volume());
		auto blocks_per_multiprocessor = kernel_->max_active_blocks_per_multiprocessor(num_block_threads, dshmem_size);
		auto num_multiprocessors = device().get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
		result.grid = blocks_per_multiprocessor * num_multiprocessors;
		return result;
	}
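	// Worked example for the saturation branch above (illustrative figures): if
	// the occupancy calculation permits 8 resident blocks per multiprocessor and
	// the device has 40 multiprocessors, the grid is sized at 8 * 40 = 320
	// blocks - just enough to keep every multiprocessor fully occupied.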
#if CUDA_VERSION >= 10000
	if (use_min_params_for_max_occupancy_) {
		if (kernel_ == nullptr) {
			throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
		}
		if (dimensions_.block or dimensions_.grid or dimensions_.overall) {
			throw ::std::logic_error("Conflicting specifications: Block, grid or overall dimensions specified, but requested to use the minimum grid parameters for maximum occupancy");
		}
		auto composite_dims = dynamic_shared_memory_size_determiner_ ?
			kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_determiner_) :
			kernel_->min_grid_params_for_max_occupancy(dynamic_shared_memory_size_);
		result.block = composite_dims.block;
		result.grid = composite_dims.grid;
		return result;
	}
#endif // CUDA_VERSION >= 10000
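	// Note: in this mode both the block and the grid dimensions come from the
	// kernel wrapper's occupancy queries (CUDA's occupancy calculator), rather
	// than from any dimensions specified on the builder itself.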
	if (dimensions_.block and dimensions_.overall and not dimensions_.grid) {
		result.grid = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.block.value());
		result.block = dimensions_.block.value();
		return result;
	}
	if (dimensions_.grid and dimensions_.overall and not dimensions_.block) {
		result.block = grid::detail_::div_rounding_up(dimensions_.overall.value(), dimensions_.grid.value());
		result.grid = dimensions_.grid.value();
		return result;
	}
	if (dimensions_.grid and dimensions_.block) {
		if (dimensions_.overall and (dimensions_.grid.value() * dimensions_.block.value() != dimensions_.overall.value())) {
			throw ::std::invalid_argument("specified block, grid and overall dimensions do not agree");
		}
		result.block = dimensions_.block.value();
		result.grid = dimensions_.grid.value();
		return result;
	}
	if (not dimensions_.block and not dimensions_.grid) {
		throw ::std::logic_error("Neither block nor grid dimensions have been specified");
	}
	else if (not dimensions_.block and not dimensions_.overall) {
		throw ::std::logic_error(
			"Attempt to obtain the composite grid dimensions, while the grid dimensions have only been specified "
			"in terms of blocks, not threads, with no block dimensions specified");
	}
	else {
		throw ::std::logic_error("Only block dimensions have been specified - cannot resolve launch grid dimensions");
	}
}
grid::composite_dimensions_t get_composite_dimensions() const noexcept(false)
{
	auto result = get_unvalidated_composite_dimensions();
#ifndef NDEBUG
	validate_composite_dimensions(result);
#endif
	return result;
}
launch_configuration_t build() const
{
	auto result = launch_configuration_t{ get_composite_dimensions() };
	result.dynamic_shared_memory_size = get_dynamic_shared_memory_size(result.dimensions.block);
	result.block_cooperation = thread_block_cooperation;
	return result;
}
219 optional<grid::block_dimensions_t > block;
220 optional<grid::dimensions_t > block_cluster;
221 optional<grid::dimensions_t > grid;
222 optional<grid::overall_dimensions_t> overall;
225 bool thread_block_cooperation {
false };
const kernel_t* kernel_ { nullptr };
optional<device::id_t> device_;
bool saturate_with_active_blocks_ { false };
#if CUDA_VERSION >= 10000
bool use_min_params_for_max_occupancy_ { false };
#endif

static device_t device(optional<device::id_t> maybe_id)
{
	return device::get(maybe_id.value());
}

device_t device() const { return device(device_); }
#ifndef NDEBUG
void validate(launch_configuration_t config) const noexcept(false)
{
	detail_::validate(config);
	if (kernel_) { detail_::validate_compatibility(*kernel_, config); }
	if (device_) { detail_::validate_compatibility(device(), config); }
}
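// The static validators below share a pattern: they are no-ops when the kernel
// pointer or device id is absent, so a partially-specified builder can still
// be checked against whatever has been set so far.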
static void validate_compatibility(
	const kernel_t* kernel_ptr,
	memory::shared::size_t shared_mem_size)
{
	if (kernel_ptr == nullptr) { return; }
	detail_::validate_shared_mem_size_compatibility(*kernel_ptr, shared_mem_size);
}
static void validate_compatibility(
	optional<device::id_t> maybe_device_id,
	memory::shared::size_t shared_mem_size)
{
	if (not maybe_device_id) { return; }
	detail_::validate_shared_mem_compatibility(device(maybe_device_id), shared_mem_size);
}
void validate_dynamic_shared_memory_size(memory::shared::size_t size) const
{
	validate_compatibility(kernel_, size);
	validate_compatibility(device_, size);
}
static void validate_block_dimension_compatibility(
	const kernel_t* kernel_ptr,
	grid::block_dimensions_t block_dims)
{
	if (kernel_ptr == nullptr) { return; }
	detail_::validate_block_dimension_compatibility(*kernel_ptr, block_dims);
}
static void validate_block_dimension_compatibility(
	optional<device::id_t> maybe_device_id,
	grid::block_dimensions_t block_dims)
{
	if (not maybe_device_id) { return; }
	detail_::validate_block_dimension_compatibility(device(maybe_device_id), block_dims);
}
void validate_block_dimensions(grid::block_dimensions_t block_dims) const
{
	detail_::validate_block_dimensions(block_dims);
	if (dimensions_.grid and dimensions_.overall) {
		detail_::validate_all_dimensions_compatibility(
			block_dims, dimensions_.grid.value(), dimensions_.overall.value());
	}
	validate_block_dimension_compatibility(kernel_, block_dims);
	validate_block_dimension_compatibility(device_, block_dims);
}
static void validate_grid_dimension_compatibility(
	optional<device::id_t> maybe_device_id,
	grid::block_dimensions_t block_dims)
{
	if (not maybe_device_id) { return; }
	detail_::validate_grid_dimension_compatibility(device(maybe_device_id), block_dims);
}
void validate_grid_dimensions(grid::dimensions_t grid_dims) const
{
	detail_::validate_grid_dimensions(grid_dims);
	if (dimensions_.block and dimensions_.overall) {
		detail_::validate_all_dimensions_compatibility(
			dimensions_.block.value(), grid_dims, dimensions_.overall.value());
	}
}
#if CUDA_VERSION >= 12000
void validate_cluster_dimensions(grid::dimensions_t cluster_dims) const
{
	if (dimensions_.grid and not grid::dimensions_t::divides(cluster_dims, dimensions_.grid.value())) {
		throw ::std::runtime_error("The requested block cluster dimensions do not "
			"divide the grid dimensions (in blocks)");
	}
}
#endif // CUDA_VERSION >= 12000

void validate_overall_dimensions(grid::overall_dimensions_t overall_dims) const
{
	if (dimensions_.block and dimensions_.grid) {
		if (dimensions_.grid.value() * dimensions_.block.value() != overall_dims) {
			throw ::std::invalid_argument(
				"specified overall dimensions conflict with the already-specified "
				"block and grid dimensions");
		}
	}
}
void validate_kernel(const kernel_t* kernel_ptr) const
{
	if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
		auto block_dims = dimensions_.block ?
			dimensions_.block.value() :
			get_composite_dimensions().block;
		validate_block_dimension_compatibility(kernel_ptr, block_dims);
	}
	validate_compatibility(kernel_ptr, dynamic_shared_memory_size_);
}
void validate_device(device::id_t device_id) const
{
	if (dimensions_.block or (dimensions_.grid and dimensions_.overall)) {
		auto block_dims = dimensions_.block ?
			dimensions_.block.value() :
			get_composite_dimensions().block;
		validate_block_dimension_compatibility(device_id, block_dims);
	}
	detail_::validate_compatibility(
		device_id, dynamic_shared_memory_size_, thread_block_cooperation, dimensions_.block_cluster);
}
void validate_composite_dimensions(grid::composite_dimensions_t composite_dims) const
{
	validate_block_dimension_compatibility(kernel_, composite_dims.block);
	validate_block_dimension_compatibility(device_, composite_dims.block);
	validate_grid_dimension_compatibility(device_, composite_dims.grid);
}
#endif // ifndef NDEBUG

launch_config_builder_t& dimensions(grid::composite_dimensions_t composite_dims)
{
#ifndef NDEBUG
	validate_composite_dimensions(composite_dims);
#endif
	dimensions_.overall = nullopt;
	dimensions_.grid = composite_dims.grid;
	dimensions_.block = composite_dims.block;
	return *this;
}
launch_config_builder_t& block_dimensions(grid::block_dimensions_t dims)
{
#ifndef NDEBUG
	validate_block_dimensions(dims);
#endif
	dimensions_.block = dims;
	if (dimensions_.grid) {
		dimensions_.overall = nullopt;
	}
	return *this;
}

launch_config_builder_t& block_dimensions(
	grid::block_dimension_t x, grid::block_dimension_t y = 1, grid::block_dimension_t z = 1)
{
	return block_dimensions(grid::block_dimensions_t{x, y, z});
}
launch_config_builder_t& block_size(size_t size)
{
	static constexpr const auto max_representable_block_dim =
		::std::numeric_limits<grid::block_dimension_t>::max();
	if (size > (size_t) max_representable_block_dim) {
		throw ::std::invalid_argument(
			"Specified (1-dimensional) block size " + ::std::to_string(size)
			+ " exceeds " + ::std::to_string(max_representable_block_dim)
			+ ", the maximum representable size of a block");
	}
	if (kernel_) {
		auto max_threads_per_block = kernel_->maximum_threads_per_block();
		if (size > max_threads_per_block) {
			throw ::std::invalid_argument(
				"Specified (1-dimensional) block size " + ::std::to_string(size)
				+ " exceeds " + ::std::to_string(max_threads_per_block)
				+ ", the maximum number of threads per block supported by "
				+ kernel::detail_::identify(*kernel_));
		}
	}
	if (device_) {
		auto max_threads_per_block = device().maximum_threads_per_block();
		if (size > max_threads_per_block) {
			throw ::std::invalid_argument(
				"Specified (1-dimensional) block size " + ::std::to_string(size)
				+ " exceeds " + ::std::to_string(max_threads_per_block)
				+ ", the maximum number of threads per block supported by "
				+ device::detail_::identify(device_.value()));
		}
	}
	return block_dimensions(static_cast<grid::block_dimension_t>(size), 1, 1);
}
launch_config_builder_t& use_maximum_linear_block()
{
	grid::block_dimension_t max_size;
	if (kernel_) {
		max_size = kernel_->maximum_threads_per_block();
	}
	else if (device_) {
		max_size = device().maximum_threads_per_block();
	}
	else {
		throw ::std::logic_error("Request to use the maximum-size linear block, with no device or kernel specified");
	}
	grid::block_dimensions_t block_dims { max_size, 1, 1 };

	if (dimensions_.grid and dimensions_.overall) {
		dimensions_.overall = nullopt;
	}
	dimensions_.block = block_dims;
	return *this;
}
#if CUDA_VERSION >= 12000
launch_config_builder_t& cluster_blocks(grid::dimensions_t cluster_dims)
{
#ifndef NDEBUG
	validate_cluster_dimensions(cluster_dims);
#endif
	dimensions_.block_cluster = cluster_dims;
	return *this;
}
#endif // CUDA_VERSION >= 12000
launch_config_builder_t& grid_dimensions(grid::dimensions_t dims)
{
#ifndef NDEBUG
	validate_grid_dimensions(dims);
#endif
	if (dimensions_.block) {
		dimensions_.overall = nullopt;
	}
	dimensions_.grid = dims;
	saturate_with_active_blocks_ = false;
	return *this;
}

launch_config_builder_t& grid_dimensions(
	grid::dimension_t x, grid::dimension_t y = 1, grid::dimension_t z = 1)
{
	return grid_dimensions(grid::dimensions_t{x, y, z});
}
533 if (size > static_cast<size_t>(::std::numeric_limits<int>::max())) {
534 throw ::std::invalid_argument(
"Specified (1-dimensional) grid size " + ::std::to_string(size)
535 +
"in blocks exceeds " + ::std::to_string(::std::numeric_limits<int>::max())
536 +
" , the maximum supported number of blocks");
539 return grid_dimensions(static_cast<grid::dimension_t>(size), 1, 1);
launch_config_builder_t& overall_dimensions(grid::overall_dimensions_t dims)
{
#ifndef NDEBUG
	validate_overall_dimensions(dims);
#endif
	dimensions_.overall = dims;
	saturate_with_active_blocks_ = false;
	return *this;
}

launch_config_builder_t& overall_dimensions(
	grid::overall_dimension_t x, grid::overall_dimension_t y = 1, grid::overall_dimension_t z = 1)
{
	return overall_dimensions(grid::overall_dimensions_t{x, y, z});
}
launch_config_builder_t& overall_size(size_t size)
{
	static_assert(::std::is_same<grid::overall_dimension_t, size_t>::value, "Unexpected type difference");
	return overall_dimensions(size, 1, 1);
}
launch_config_builder_t& block_cooperation(bool cooperation)
{
	thread_block_cooperation = cooperation;
	return *this;
}
launch_config_builder_t& dynamic_shared_memory_size(
	kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
{
	dynamic_shared_memory_size_determiner_ = shared_mem_size_determiner;
	return *this;
}
launch_config_builder_t& dynamic_shared_memory_size(memory::shared::size_t size)
{
#ifndef NDEBUG
	validate_dynamic_shared_memory_size(size);
#endif
	dynamic_shared_memory_size_ = size;
	dynamic_shared_memory_size_determiner_ = nullptr;
	return *this;
}
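// Note: a fixed size and a size-determiner function are mutually exclusive -
// setting one clears the other; the determiner, when set, is invoked with the
// block's total thread count when the configuration is finally built.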
launch_config_builder_t& dynamic_shared_memory(memory::shared::size_t size)
{
	return dynamic_shared_memory_size(size);
}
launch_config_builder_t& dynamic_shared_memory(kernel::shared_memory_size_determiner_t shared_mem_size_determiner)
{
	return dynamic_shared_memory_size(shared_mem_size_determiner);
}
launch_config_builder_t& kernel(const kernel_t* wrapped_kernel_ptr)
{
	if (device_ and wrapped_kernel_ptr->device_id() != device_.value()) {
		throw ::std::invalid_argument("Launch config builder already associated with "
			+ device::detail_::identify(*device_) + " and cannot further be associated "
			"with " + kernel::detail_::identify(*wrapped_kernel_ptr));
	}
#ifndef NDEBUG
	validate_kernel(wrapped_kernel_ptr);
#endif
	kernel_ = wrapped_kernel_ptr;
	return *this;
}
launch_config_builder_t& device(device::id_t device_id)
{
	if (kernel_ and kernel_->device_id() != device_id) {
		throw ::std::invalid_argument("Launch config builder already associated with "
			+ kernel::detail_::identify(*kernel_) + " and cannot further be associated with "
			"another device: " + device::detail_::identify(device_id));
	}
	device_ = device_id;
	return *this;
}
689 return this->device(device.
id());
launch_config_builder_t& saturate_with_active_blocks()
{
	if (kernel_ == nullptr) {
		throw ::std::logic_error("A kernel must be set to determine how many blocks are required to saturate the device");
	}
	if (not (dimensions_.block)) {
		throw ::std::logic_error("The block dimensions must be known to determine how many of them one needs for saturating a device");
	}
	dimensions_.grid = nullopt;
	dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
	use_min_params_for_max_occupancy_ = false;
#endif
	saturate_with_active_blocks_ = true;
	return *this;
}
launch_config_builder_t& min_params_for_max_occupancy()
{
	if (kernel_ == nullptr) {
		throw ::std::logic_error("A kernel must be set to determine the minimum grid parameters for maximum occupancy");
	}
	dimensions_.block = nullopt;
	dimensions_.grid = nullopt;
	dimensions_.overall = nullopt;
#if CUDA_VERSION >= 10000
	use_min_params_for_max_occupancy_ = true;
#endif
	saturate_with_active_blocks_ = false;
	return *this;
}
inline launch_config_builder_t launch_config_builder() { return {}; }

#endif // CUDA_API_WRAPPERS_LAUNCH_CONFIG_BUILDER_CUH_
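// A minimal usage sketch of the builder idiom (`my_kernel` - a wrapped kernel
// object - and `num_elements` are hypothetical names, not part of the library):
//
//   auto config = cuda::launch_config_builder()
//       .kernel(&my_kernel)
//       .overall_size(num_elements)
//       .block_size(256)
//       .build();
//
// The builder derives the grid size by dividing num_elements by 256, rounding
// up, and validates the block size against the kernel's and device's limits.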