cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
|
A subclass of the kernel_t interface for kernels which are functions marked as __global__ in source files and compiled a priori. More...
#include <apriori_compiled.hpp>
Public Member Functions | |
apriori_compiled::attributes_t | attributes () const |
Obtain the set of all attributes one can obtain individually with get_attribute. | |
void | set_cache_preference (multiprocessor_cache_preference_t preference) const override |
See context_t::cache_preference() | |
void | set_shared_memory_bank_size (multiprocessor_shared_memory_bank_size_option_t config) const override |
See context_t::shared_memory_bank_size() | |
cuda::device::compute_capability_t | ptx_version () const override |
The PTX intermediate-representation language used in the compilation of this kernel (whether as the original source code or as an output of the compilation front-end). More... | |
cuda::device::compute_capability_t | binary_compilation_target_architecture () const override |
grid::block_dimension_t | maximum_threads_per_block () const override |
The constraint on the block size in threads for launch grids of this kernel in its associated context (e.g. More... | |
void | set_attribute (kernel::attribute_t attribute, kernel::attribute_value_t value) const override |
kernel::attribute_value_t | get_attribute (kernel::attribute_t attribute) const override |
grid::dimension_t | max_active_blocks_per_multiprocessor (grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const override |
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneously (i.e. More... | |
apriori_compiled_t (const apriori_compiled_t &)=default | |
apriori_compiled_t (apriori_compiled_t &&)=default | |
const void * | ptr () const noexcept |
Access the raw __global__ kernel function pointer - without any type information. More... | |
const void * | get () const noexcept |
operator const void * () noexcept | |
![]() | |
context_t | context () const noexcept |
Get (a proxy for) the context in which this kernel is defined. | |
device_t | device () const noexcept |
Get (a proxy for) the device for (a context of) which this kernel is defined. | |
device::id_t | device_id () const noexcept |
Get the id of the device for (a context of) which this kernel is defined. | |
context::handle_t | context_handle () const noexcept |
Get the raw handle of the context in which this kernel is defined. | |
kernel::handle_t | handle () const |
Get the raw (intra-context) CUDA handle for this kernel. More... | |
kernel_t & | operator= (const kernel_t &)=delete |
kernel_t & | operator= (kernel_t &&other) noexcept |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE kernel::attribute_value_t | get_attribute (kernel::attribute_t attribute) const |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t | ptx_version () const |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE cuda::device::compute_capability_t | binary_compilation_target_architecture () const |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::block_dimension_t | maximum_threads_per_block () const |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE grid::dimension_t | max_active_blocks_per_multiprocessor (grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const |
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneously (i.e. More... | |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void | set_attribute (kernel::attribute_t attribute, kernel::attribute_value_t value) const |
void | set_maximum_dynamic_shared_memory_per_block (cuda::memory::shared::size_t amount_required_by_kernel) const |
Change the hardware resource carve-out between L1 cache and shared memory for launches of the kernel to allow for at least the specified amount of shared memory. More... | |
memory::shared::size_t | get_maximum_dynamic_shared_memory_per_block () const |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void | set_cache_preference (multiprocessor_cache_preference_t preference) const |
Indicate the desired carve-out between shared memory and L1 cache when launching this kernel - with coarse granularity. More... | |
VIRTUAL_UNLESS_CAN_GET_APRIORI_KERNEL_HANDLE void | set_shared_memory_bank_size (multiprocessor_shared_memory_bank_size_option_t config) const |
Sets a device function's preference of shared memory bank size. More... | |
kernel_t (const kernel_t &other) | |
kernel_t (kernel_t &&other) | |
A subclass of the kernel_t interface for kernels which are functions marked as __global__ in source files and compiled a priori.
|
inline override |
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneously (i.e.
with warps from any of these blocks being schedulable concurrently)
block_size_in_threads | |
dynamic_shared_memory_per_block | |
disable_caching_override | On some GPUs, the choice of whether to cache memory reads affects occupancy. But what if this caching results in 0 potential occupancy for a kernel? There are two options, controlled by this flag. When it is set to false - the calculator will assume caching is off for the purposes of its work; when set to true, it will return 0 for such device functions. See also the "Unified L1/Texture Cache" section of the Maxwell tuning guide. |
|
inline override |
The constraint on the block size in threads for launch grids of this kernel in its associated context (e.g.
due to the number of registers required by each thread vis-a-vis the overall register file size).
|
inline noexcept |
Access the raw __global__ kernel function pointer - without any type information.
|
inline override |
The PTX intermediate-representation language used in the compilation of this kernel (whether as the original source code or as an output of the compilation front-end).