8 #ifndef CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_     9 #define CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_    11 #include "../kernel.hpp"    15 #include <cuda_runtime.h>    17 #include <type_traits>    24 class apriori_compiled_t;
    27 namespace apriori_compiled {
    31 #if CUDA_VERSION < 11000    32 inline handle_t get_handle(
const void *, 
const char* = 
nullptr)
    35         "Only CUDA versions 11.0 and later support obtaining CUDA driver handles "    36         "for kernels compiled alongside the program source");
    39 inline handle_t get_handle(
const void *kernel_function_ptr, 
const char* name = 
nullptr)
    42     auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr);
    44         + ((name == 
nullptr) ? ::std::string(
"a kernel function") : ::std::string(
"kernel function ") + name)
    45         + 
" at " + cuda::detail_::ptr_as_hex(kernel_function_ptr));
    50 apriori_compiled_t 
wrap(
    55     bool hold_primary_context_refcount_unit = 
false);
    60 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE    77 #endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE    83 #if CUDA_VERSION < 11000    85 template<
typename UnaryFunction, 
class T>
    86 static __inline__ cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
    90     UnaryFunction  blockSizeToDynamicSMemSize,
    91     int            blockSizeLimit = 0,
    92     unsigned int   flags = 0)
    98     struct cudaFuncAttributes attr;
   101     int maxThreadsPerMultiProcessor;
   103     int devMaxThreadsPerBlock;
   104     int multiProcessorCount;
   105     int funcMaxThreadsPerBlock;
   110     int maxBlockSize = 0;
   112     int maxOccupancy = 0;
   115     int blockSizeToTryAligned;
   117     int blockSizeLimitAligned;
   118     int occupancyInBlocks;
   119     int occupancyInThreads;
   120     size_t dynamicSMemSize;
   126     if (!minGridSize || !blockSize || !func) {
   127         return cudaErrorInvalidValue;
   134     status = ::cudaGetDevice(&device);
   135     if (status != cudaSuccess) {
   139     status = cudaDeviceGetAttribute(
   140         &maxThreadsPerMultiProcessor,
   141         cudaDevAttrMaxThreadsPerMultiProcessor,
   143     if (status != cudaSuccess) {
   147     status = cudaDeviceGetAttribute(
   151     if (status != cudaSuccess) {
   155     status = cudaDeviceGetAttribute(
   156         &devMaxThreadsPerBlock,
   157         cudaDevAttrMaxThreadsPerBlock,
   159     if (status != cudaSuccess) {
   163     status = cudaDeviceGetAttribute(
   164         &multiProcessorCount,
   165         cudaDevAttrMultiProcessorCount,
   167     if (status != cudaSuccess) {
   171     status = cudaFuncGetAttributes(&attr, func);
   172     if (status != cudaSuccess) {
   176     funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
   182     occupancyLimit = maxThreadsPerMultiProcessor;
   183     granularity    = warpSize;
   185     if (blockSizeLimit == 0) {
   186         blockSizeLimit = devMaxThreadsPerBlock;
   189     if (devMaxThreadsPerBlock < blockSizeLimit) {
   190         blockSizeLimit = devMaxThreadsPerBlock;
   193     if (funcMaxThreadsPerBlock < blockSizeLimit) {
   194         blockSizeLimit = funcMaxThreadsPerBlock;
   197     blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
   199     for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
   203         if (blockSizeLimit < blockSizeToTryAligned) {
   204             blockSizeToTry = blockSizeLimit;
   206             blockSizeToTry = blockSizeToTryAligned;
   209         dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
   211         status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
   218         if (status != cudaSuccess) {
   222         occupancyInThreads = blockSizeToTry * occupancyInBlocks;
   224         if (occupancyInThreads > maxOccupancy) {
   225             maxBlockSize = blockSizeToTry;
   226             numBlocks    = occupancyInBlocks;
   227             maxOccupancy = occupancyInThreads;
   232         if (occupancyLimit == maxOccupancy) {
   243     *minGridSize = numBlocks * multiProcessorCount;
   244     *blockSize = maxBlockSize;
   249 #if CUDA_VERSION > 10000   252 template <
typename UnaryFunction>
   254     const void*                    kernel_function_ptr,
   256     UnaryFunction                  determine_shared_mem_by_block_size,
   258     bool                           disable_caching_override)
   260     int min_grid_size_in_blocks { 0 };
   261     int block_size { 0 };
   265     unsigned flags = disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
   266     auto result = (
cuda::status_t) cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_<UnaryFunction, const void*>(
   267         &min_grid_size_in_blocks,
   270         determine_shared_mem_by_block_size,
   271         (
int) block_size_limit,
   275         "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_function_ptr, device_id)
   276         + 
" with maximum occupancy given dynamic shared memory and block size data");
   279 #endif // CUDA_VERSION > 10000   282     const void*              kernel_function_ptr,
   285     bool                     disable_caching_override)
   290     auto flags = (unsigned) disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
   291     status = (
cuda::status_t) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
   292         &result, kernel_function_ptr, (
int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags);
   294         "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
   318     const void *ptr() 
const noexcept { 
return ptr_; }
   320     const void *
get() 
const noexcept { 
return ptr_; }
   323     explicit operator const void *() noexcept { 
return ptr_; }
   328 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   352         return attributes().binary_compilation_target_architecture();
   362         return attributes().maxThreadsPerBlock;
   367 #if CUDA_VERSION > 10000   371         bool disable_caching_override = 
false)
 const override   373         auto shared_memory_size_determiner =
   374             [dynamic_shared_memory_size](int) -> 
size_t { 
return dynamic_shared_memory_size; };
   375         return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
   377             shared_memory_size_determiner,
   378             block_size_limit, disable_caching_override);
   384         bool disable_caching_override = 
false)
 const override   386         return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
   388             shared_memory_size_determiner,
   389             block_size_limit, disable_caching_override);
   414         bool disable_caching_override = 
false)
 const override   416         return apriori_compiled::occupancy::detail_::max_active_blocks_per_multiprocessor(
   418             block_size_in_threads,
   419             dynamic_shared_memory_per_block,
   420             disable_caching_override);
   422 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   426         kernel::handle_t handle, 
const void *f, 
bool hold_pc_refcount_unit)
   427     : 
kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f) {
   430         assert(f != 
nullptr && 
"Attempt to construct a kernel object for a nullptr kernel function pointer");
   437         bool hold_primary_context_refcount_unit)
   440         primary_context_handle,
   441         apriori_compiled::detail_::get_handle(f),
   443         hold_primary_context_refcount_unit)
   454     const void *
const ptr_;
   457 namespace apriori_compiled {
   466     bool               hold_primary_context_refcount_unit)
   468     return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
   471 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   474     return "apriori-compiled kernel " + cuda::detail_::ptr_as_hex(kernel.
ptr())
   475         + 
" in " + context::detail_::identify(kernel.
context());
   477 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE   481 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE   484     auto handle = detail_::get_handle(function_ptr);
   485     return kernel::detail_::get_attribute_in_current_context(handle, attribute);
   490     auto handle = detail_::get_handle(function_ptr);
   491     return kernel::detail_::set_attribute_in_current_context(handle, attribute, value);
   496     const void*       function_ptr,
   499     CAW_SET_SCOPE_CONTEXT(context.handle());
   500     return get_attribute(function_ptr, attribute);
   503 inline void set_attribute(
   505     const void*        function_ptr,
   509     CAW_SET_SCOPE_CONTEXT(context.handle());
   510     return set_attribute(function_ptr, attribute, value);
   512 #endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE   529 template<
typename KernelFunctionPtr>
   533 template<
typename KernelFunctionPtr>
   540 #endif // CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_ int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:988
 
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions. 
Definition: types.hpp:299
 
Wrapper class for a CUDA context. 
Definition: context.hpp:244
 
Definitions and functionality wrapping CUDA APIs. 
Definition: array.hpp:22
 
cuda::device::compute_capability_t ptx_version() const override
The PTX intermediate-representation language used in the compilation of this kernel (whether as the o...
Definition: apriori_compiled.hpp:345
 
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}. 
Definition: types.hpp:878
 
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions. 
Definition: types.hpp:312
 
a wrapper around cudaFuncAttributes, offering a few convenience member functions. ...
Definition: apriori_compiled.hpp:65
 
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API. 
Definition: types.hpp:850
 
A numeric designator of the computational capabilities of a CUDA device. 
Definition: device_properties.hpp:75
 
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Do nothing... 
Definition: error.hpp:335
 
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
 
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
 
grid::block_dimension_t maximum_threads_per_block() const override
The constraint on the block size in threads for launch grids of this kernel in its associated context...
Definition: apriori_compiled.hpp:360
 
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
 
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
 
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option. 
Definition: types.hpp:804
 
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
 
cuda::device::compute_capability_t ptx_version() const noexcept
See apriori_compiled_t::ptx_version() 
Definition: apriori_compiled.hpp:68
 
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
 
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
 
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension) 
Definition: array.hpp:34
 
array_t< T, NumDimensions > wrap(device::id_t device_id, context::handle_t context_handle, handle_t handle, dimensions_t< NumDimensions > dimensions) noexcept
Wrap an existing CUDA array in an array_t instance. 
Definition: array.hpp:264
 
size_t dimension_t
An individual dimension extent for an array. 
Definition: types.hpp:94
 
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
 
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
 
grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const override
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: apriori_compiled.hpp:411
 
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined. 
Definition: kernel.hpp:22
 
Wrapper class for a CUDA device. 
Definition: device.hpp:135
 
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
 
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
 
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute. 
Definition: types.hpp:983