cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
kernel.hpp
Go to the documentation of this file.
1 
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_KERNEL_HPP_
11 #define CUDA_API_WRAPPERS_KERNEL_HPP_
12 
13 #include "primary_context.hpp"
14 #include "current_context.hpp"
15 #include "device_properties.hpp"
16 #include "error.hpp"
17 #include "types.hpp"
18 
20 #if CUDA_VERSION < 11000
21 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 0
22 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE virtual
23 #else
24 #define CAW_CAN_GET_APRIORI_KERNEL_HANDLE 1
25 #define VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
26 #endif
27 
29 namespace cuda {
30 
32 class kernel_t;
34 
35 namespace kernel {
36 
44 using shared_memory_size_determiner_t = size_t (CUDA_CB *)(int block_size);
45 
60 kernel_t wrap(
61  device::id_t device_id,
62  context::handle_t context_handle,
63  kernel::handle_t handle,
64  bool hold_primary_context_refcount_unit = false);
65 
66 namespace detail_ {
67 
68 inline ::std::string identify(const kernel_t& kernel);
69 
/// @returns a human-readable description of a kernel (CUfunction) attribute,
/// for use in error messages
///
/// @note the entries correspond, in order, to the values of
/// enum CUfunction_attribute_enum
static const char* attribute_name(int attribute_index)
{
	static const char* names[] = {
		"Maximum number of threads per block",
		"Statically-allocated shared memory size in bytes",
		"Required constant memory size in bytes",
		"Required local memory size in bytes",
		"Number of registers used by each thread",
		"PTX virtual architecture version into which the kernel code was compiled",
		"Binary architecture version for which the function was compiled",
		"Indication whether the function was compiled with cache mode CA",
		"Maximum allowed size of dynamically-allocated shared memory use size bytes",
		"Preferred shared memory carve-out to actual shared memory"
	};
	constexpr int num_names = sizeof(names) / sizeof(names[0]);
	// Guard against indices outside the table rather than reading past the array
	if (attribute_index < 0 or attribute_index >= num_names) {
		return "(unknown kernel attribute)";
	}
	return names[attribute_index];
}
87 
88 inline attribute_value_t get_attribute_in_current_context(handle_t handle, attribute_t attribute)
89 {
90  kernel::attribute_value_t attribute_value;
91  auto result = cuFuncGetAttribute(&attribute_value, attribute, handle);
92  throw_if_error_lazy(result, ::std::string("Failed obtaining attribute ") + attribute_name(attribute));
93  return attribute_value;
94 }
95 
96 inline void set_attribute_in_current_context(handle_t handle, attribute_t attribute, attribute_value_t value)
97 {
98 #if CUDA_VERSION >= 9000
99  auto result = cuFuncSetAttribute(handle, static_cast<CUfunction_attribute>(attribute), value);
100  throw_if_error_lazy(result,
101  "Setting CUDA device function attribute " +
102  ::std::string(kernel::detail_::attribute_name(attribute)) + " of function at "
103  + cuda::kernel::detail_::identify(handle) + " to value " + ::std::to_string(value));
104 #else
105  throw(cuda::runtime_error {cuda::status::not_yet_implemented});
106 #endif
107 }
108 
109 #if CUDA_VERSION >= 12030
110 inline const char * get_name_in_current_context(handle_t handle)
111 {
112  const char* result;
113  auto status = cuFuncGetName(&result, handle);
114  throw_if_error_lazy(status, "Failed obtaining the name for " + identify(handle));
115  return result;
116 }
117 
118 inline const char * get_name(context::handle_t context_handle, handle_t kernel_handle)
119 {
120  CAW_SET_SCOPE_CONTEXT(context_handle);
121  return get_name_in_current_context(kernel_handle);
122 }
123 
124 inline module::handle_t get_module_in_current_context(handle_t handle)
125 {
126  module::handle_t result;
127  auto status = cuFuncGetModule(&result, handle);
128  throw_if_error_lazy(status, "Failed obtaining the module containing " + identify(handle));
129  return result;
130 }
131 
132 inline module::handle_t get_module(context::handle_t context_handle, handle_t kernel_handle)
133 {
134  CAW_SET_SCOPE_CONTEXT(context_handle);
135  return get_module_in_current_context(kernel_handle);
136 }
137 
138 #endif // CUDA_VERSION >= 12030
139 
140 } // namespace detail_
141 
142 inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute);
143 
144 } // namespace kernel
145 
159 class kernel_t {
160 
161 public: // getters
162 
164  context_t context() const noexcept;
166  device_t device() const noexcept;
167 
169  device::id_t device_id() const noexcept { return device_id_; }
171  context::handle_t context_handle() const noexcept { return context_handle_; }
178 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
179  kernel::handle_t handle() const noexcept { return handle_; }
180 #else
181  kernel::handle_t handle() const
182  {
183 #ifndef NDEBUG
184  if (handle_ == nullptr) {
185  throw runtime_error(status::named_t::invalid_resource_handle,
186  "CUDA driver handle unavailable for kernel");
187  }
188 #endif
189  return handle_;
190  }
191 #endif
192 
193 #if CUDA_VERSION >= 12030
194  const char *mangled_name() const { return cuda::kernel::detail_::get_name(context_handle_, handle_); }
197  module_t module() const;
198 #endif
199 
200 public: // operators
201 
202  kernel_t& operator=(const kernel_t&) = delete;
203  kernel_t& operator=(kernel_t&& other) noexcept
204  {
205  ::std::swap(device_id_, other.device_id_);
206  ::std::swap(context_handle_, other.context_handle_);
207  ::std::swap(handle_, other.handle_);
208  ::std::swap(holds_pc_refcount_unit, holds_pc_refcount_unit);
209  return *this;
210  }
211 
212 
213 public: // non-mutators
214 
215  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
216  kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const
217  {
218  return kernel::get_attribute(*this, attribute);
219  }
220 
222  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
224  {
225  auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_PTX_VERSION);
227  }
228 
230  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
232  auto raw_attribute = get_attribute(CU_FUNC_ATTRIBUTE_BINARY_VERSION);
234  }
235 
243  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
245  {
246  return get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
247  }
248 
249 #if CUDA_VERSION >= 10000
250 
276  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
278  grid::composite_dimensions_t min_grid_params_for_max_occupancy(
279  memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
280  grid::block_dimension_t block_size_limit = 0,
281  bool disable_caching_override = false) const;
282 
283  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
284  grid::composite_dimensions_t min_grid_params_for_max_occupancy(
285  kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
286  grid::block_dimension_t block_size_limit = 0,
287  bool disable_caching_override = false) const;
289 #endif // CUDA_VERSION >= 10000
290 
309  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
310  grid::dimension_t max_active_blocks_per_multiprocessor(
311  grid::block_dimension_t block_size_in_threads,
312  memory::shared::size_t dynamic_shared_memory_per_block,
313  bool disable_caching_override = false) const;
314 
315 
316 
317 public: // methods mutating the kernel-in-context, but not this reference object
318 
319  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
320  void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const;
321 
334  {
335  auto amount_required_by_kernel_ = static_cast<kernel::attribute_value_t>(amount_required_by_kernel);
336  if (amount_required_by_kernel != static_cast<cuda::memory::shared::size_t>(amount_required_by_kernel_)) {
337  throw ::std::invalid_argument("Requested amount of maximum shared memory exceeds the "
338  "representation range for kernel attribute values");
339  }
340  // TODO: Consider a check in debug mode for the value being within range
341  set_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,amount_required_by_kernel_);
342  }
343 
344  memory::shared::size_t get_maximum_dynamic_shared_memory_per_block() const
345  {
346  return get_attribute(CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES);
347  }
348 
366  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
368  {
369  context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
370  auto result = cuFuncSetCacheConfig(handle(), static_cast<CUfunc_cache>(preference));
371  throw_if_error_lazy(result,
372  "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
373  "CUDA device function");
374  }
375 
376 #if CUDA_VERSION < 12030
377 
382  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
384  {
385  // TODO: Need to set a context, not a device
386  context::current::detail_::scoped_override_t set_context_for_this_context(context_handle_);
387  auto result = cuFuncSetSharedMemConfig(handle(), static_cast<CUsharedconfig>(config) );
388  throw_if_error_lazy(result, "Failed setting the shared memory bank size");
389  }
390 #endif // CUDA_VERSION < 12030
391 
392 
393 protected: // ctors & dtor
394  kernel_t(
395  device::id_t device_id,
396  context::handle_t context_handle,
397  kernel::handle_t handle,
398  bool hold_primary_context_refcount_unit)
399  :
400  device_id_(device_id),
401  context_handle_(context_handle),
402  handle_(handle),
403  holds_pc_refcount_unit(hold_primary_context_refcount_unit)
404  { }
405 
406 public: // ctors & dtor
407  friend kernel_t kernel::wrap(device::id_t, context::handle_t, kernel::handle_t, bool);
408 
409  kernel_t(const kernel_t& other) :
410  kernel_t(other.device_id_, other.context_handle_, other.handle_, false) { }
411 
412  kernel_t(kernel_t&& other) :
413  kernel_t(other.device_id_, other.context_handle_, other.handle_, false)
414  {
415  ::std::swap(holds_pc_refcount_unit, other.holds_pc_refcount_unit);
416  }
417 
418 public: // ctors & dtor
419  VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE
420  ~kernel_t() NOEXCEPT_IF_NDEBUG
421  {
422  // TODO: DRY
423  if (holds_pc_refcount_unit) {
424 #ifdef NDEBUG
425  device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
426  // Note: "Swallowing" any potential error to avoid ::std::terminate(); also,
427  // because a failure probably means the primary context is inactive already
428 #else
429  device::primary_context::detail_::decrease_refcount(device_id_);
430 #endif
431  }
432  }
433 
434 protected: // data members
435  device::id_t device_id_; // We don't _absolutely_ need the device ID, but - why not have it if we can?
436  context::handle_t context_handle_;
437  mutable kernel::handle_t handle_;
438  bool holds_pc_refcount_unit;
439 }; // kernel_t
440 
441 namespace kernel {
442 
443 inline kernel_t wrap(
444  device::id_t device_id,
445  context::handle_t context_handle,
446  kernel::handle_t handle,
447  bool hold_primary_context_refcount_unit)
448 {
449  return kernel_t{device_id, context_handle, handle, hold_primary_context_refcount_unit };
450 }
451 
452 inline attribute_value_t get_attribute(const kernel_t& kernel, attribute_t attribute)
453 {
454  CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
455  return detail_::get_attribute_in_current_context(kernel.handle(), attribute);
456 }
457 
458 inline void set_attribute(const kernel_t& kernel, attribute_t attribute, attribute_value_t value)
459 {
460  CAW_SET_SCOPE_CONTEXT(kernel.context_handle());
461  return detail_::set_attribute_in_current_context(kernel.handle(), attribute, value);
462 }
463 
464 namespace occupancy {
465 
466 namespace detail_ {
467 
468 inline grid::dimension_t max_active_blocks_per_multiprocessor(
469  handle_t handle,
470  grid::block_dimension_t block_size_in_threads,
471  memory::shared::size_t dynamic_shared_memory_per_block,
472  bool disable_caching_override)
473 {
474  int result;
475  // We don't need the initialization, but NVCC backed by GCC 8 warns us about it.
476  auto flags = static_cast<unsigned>(disable_caching_override) ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT;
477  cuda::status_t status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
478  &result, handle, static_cast<int>(block_size_in_threads), dynamic_shared_memory_per_block, flags);
479  throw_if_error_lazy(status,
480  "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
481  return result;
482 }
483 
484 #if CUDA_VERSION >= 10000
485 // Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
486 // if block_size_limit is 0, it is ignored.
487 inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
488  CUfunction kernel_handle,
489  cuda::device::id_t device_id,
490  CUoccupancyB2DSize determine_shared_mem_by_block_size,
491  cuda::memory::shared::size_t fixed_shared_mem_size,
492  cuda::grid::block_dimension_t block_size_limit,
493  bool disable_caching_override)
494 {
495  int min_grid_size_in_blocks { 0 };
496  int block_size { 0 };
497  // Note: only initializing the values her because of a
498  // spurious (?) compiler warning about potential uninitialized use.
499 
500  auto result = cuOccupancyMaxPotentialBlockSizeWithFlags(
501  &min_grid_size_in_blocks, &block_size,
502  kernel_handle,
503  determine_shared_mem_by_block_size,
504  fixed_shared_mem_size,
505  static_cast<int>(block_size_limit),
506  disable_caching_override ? CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE : CU_OCCUPANCY_DEFAULT
507  );
508 
509  throw_if_error_lazy(result,
510  "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_handle, device_id)
511  + " with maximum occupancy given dynamic shared memory and block size data");
512  return { static_cast<grid::dimension_t>(min_grid_size_in_blocks), static_cast<grid::block_dimension_t>(block_size) };
513 }
514 #endif // CUDA_VERSION >= 10000
515 
516 } // namespace detail_
517 
518 #if CUDA_VERSION >= 11000
519 
522 inline memory::shared::size_t max_dynamic_shared_memory_per_block(
523  const kernel_t &kernel,
524  grid::dimension_t blocks_on_multiprocessor,
525  grid::block_dimension_t block_size_in_threads)
526 {
527  size_t result;
528  auto status = cuOccupancyAvailableDynamicSMemPerBlock(
529  &result, kernel.handle(), static_cast<int>(blocks_on_multiprocessor), static_cast<int>(block_size_in_threads));
530  throw_if_error_lazy(status, "Determining the available dynamic memory per block, given "
531  "the number of blocks on a multiprocessor and their size");
532  return static_cast<memory::shared::size_t>(result);
533 }
534 #endif // CUDA_VERSION >= 11000
535 
539 inline grid::dimension_t max_active_blocks_per_multiprocessor(
540  const kernel_t &kernel,
541  grid::block_dimension_t block_size_in_threads,
542  memory::shared::size_t dynamic_shared_memory_per_block,
543  bool disable_caching_override = false);
544 
545 } // namespace occupancy
546 
547 namespace detail_ {
548 
549 inline ::std::string identify(const kernel_t& kernel)
550 {
551  return kernel::detail_::identify(kernel.handle()) + " in " + context::detail_::identify(kernel.context());
552 }
553 
554 } // namespace detail_
555 
556 } // namespace kernel
557 
558 #if CUDA_VERSION >= 10000
559 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
560  memory::shared::size_t dynamic_shared_memory_size,
561  grid::block_dimension_t block_size_limit,
562  bool disable_caching_override) const
563 {
564  kernel::shared_memory_size_determiner_t no_shared_memory_size_determiner { nullptr };
565  return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
566  handle(), device_id(), no_shared_memory_size_determiner,
567  dynamic_shared_memory_size, block_size_limit, disable_caching_override);
568 }
569 
570 inline grid::composite_dimensions_t kernel_t::min_grid_params_for_max_occupancy(
571  kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
572  cuda::grid::block_dimension_t block_size_limit,
573  bool disable_caching_override) const
574 {
575  memory::shared::size_t no_fixed_dynamic_shared_memory_size{ 0 };
576  return kernel::occupancy::detail_::min_grid_params_for_max_occupancy(
577  handle(), device_id(), shared_memory_size_determiner,
578  no_fixed_dynamic_shared_memory_size, block_size_limit, disable_caching_override);
579 }
580 #endif // CUDA_VERSION >= 10000
581 
583  grid::block_dimension_t block_size_in_threads,
584  memory::shared::size_t dynamic_shared_memory_per_block,
585  bool disable_caching_override) const
586 {
587  return kernel::occupancy::detail_::max_active_blocks_per_multiprocessor(
588  handle(), block_size_in_threads,
589  dynamic_shared_memory_per_block, disable_caching_override);
590 }
591 
592 inline bool operator==(const kernel_t& lhs, const kernel_t& rhs) noexcept
593 {
594  return
595  lhs.device_id() == rhs.device_id()
596  and lhs.context_handle() == rhs.context_handle()
597  and lhs.handle() == rhs.handle();
598 }
599 
600 inline bool operator!=(const kernel_t& lhs, const kernel_t& rhs) noexcept
601 {
602  return not (lhs == rhs);
603 }
604 
605 } // namespace cuda
606 
607 #endif // CUDA_API_WRAPPERS_KERNEL_HPP_
context::handle_t context_handle() const noexcept
Get the raw handle of the context in which this kernel is defined.
Definition: kernel.hpp:171
int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:988
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:299
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t ptx_version() const
Definition: kernel.hpp:223
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t binary_compilation_target_architecture() const
Definition: kernel.hpp:231
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
Wrapper class for a CUDA code module.
Definition: module.hpp:123
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t maximum_threads_per_block() const
Definition: kernel.hpp:244
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const
Sets a device function's preference of shared memory bank size.
Definition: kernel.hpp:383
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
void set_maximum_dynamic_shared_memory_per_block(cuda::memory::shared::size_t amount_required_by_kernel) const
Change the hardware resource carve-out between L1 cache and shared memory for launches of the kernel ...
Definition: kernel.hpp:333
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
CUmodule handle_t
Raw CUDA driver handle of a module of compiled code; see module_t.
Definition: module.hpp:34
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
Fundamental CUDA-related type definitions.
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void set_cache_preference(multiprocessor_cache_preference_t preference) const
Indicate the desired carve-out between shared memory and L1 cache when launching this kernel - with c...
Definition: kernel.hpp:367
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: kernel.hpp:582
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:983
Classes representing specific and overall properties of CUDA devices.