8 #ifndef CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_ 9 #define CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_ 11 #include "../kernel.hpp" 15 #include <cuda_runtime.h> 17 #include <type_traits> 24 class apriori_compiled_t;
27 namespace apriori_compiled {
31 #if CUDA_VERSION < 11000 32 inline handle_t get_handle(
const void *,
const char* =
nullptr)
35 "Only CUDA versions 11.0 and later support obtaining CUDA driver handles " 36 "for kernels compiled alongside the program source");
39 inline handle_t get_handle(
const void *kernel_function_ptr,
const char* name =
nullptr)
42 auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr);
44 + ((name ==
nullptr) ? ::std::string(
"a kernel function") : ::std::string(
"kernel function ") + name)
45 +
" at " + cuda::detail_::ptr_as_hex(kernel_function_ptr));
50 apriori_compiled_t
wrap(
55 bool hold_primary_context_refcount_unit =
false);
60 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 77 #endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE 83 #if CUDA_VERSION < 11000 85 template<
typename UnaryFunction,
class T>
86 static __inline__ cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
90 UnaryFunction blockSizeToDynamicSMemSize,
91 int blockSizeLimit = 0,
92 unsigned int flags = 0)
98 struct cudaFuncAttributes attr;
101 int maxThreadsPerMultiProcessor;
103 int devMaxThreadsPerBlock;
104 int multiProcessorCount;
105 int funcMaxThreadsPerBlock;
110 int maxBlockSize = 0;
112 int maxOccupancy = 0;
115 int blockSizeToTryAligned;
117 int blockSizeLimitAligned;
118 int occupancyInBlocks;
119 int occupancyInThreads;
120 size_t dynamicSMemSize;
126 if (!minGridSize || !blockSize || !func) {
127 return cudaErrorInvalidValue;
134 status = ::cudaGetDevice(&device);
135 if (status != cudaSuccess) {
139 status = cudaDeviceGetAttribute(
140 &maxThreadsPerMultiProcessor,
141 cudaDevAttrMaxThreadsPerMultiProcessor,
143 if (status != cudaSuccess) {
147 status = cudaDeviceGetAttribute(
151 if (status != cudaSuccess) {
155 status = cudaDeviceGetAttribute(
156 &devMaxThreadsPerBlock,
157 cudaDevAttrMaxThreadsPerBlock,
159 if (status != cudaSuccess) {
163 status = cudaDeviceGetAttribute(
164 &multiProcessorCount,
165 cudaDevAttrMultiProcessorCount,
167 if (status != cudaSuccess) {
171 status = cudaFuncGetAttributes(&attr, func);
172 if (status != cudaSuccess) {
176 funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
182 occupancyLimit = maxThreadsPerMultiProcessor;
183 granularity = warpSize;
185 if (blockSizeLimit == 0) {
186 blockSizeLimit = devMaxThreadsPerBlock;
189 if (devMaxThreadsPerBlock < blockSizeLimit) {
190 blockSizeLimit = devMaxThreadsPerBlock;
193 if (funcMaxThreadsPerBlock < blockSizeLimit) {
194 blockSizeLimit = funcMaxThreadsPerBlock;
197 blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
199 for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
203 if (blockSizeLimit < blockSizeToTryAligned) {
204 blockSizeToTry = blockSizeLimit;
206 blockSizeToTry = blockSizeToTryAligned;
209 dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
211 status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
218 if (status != cudaSuccess) {
222 occupancyInThreads = blockSizeToTry * occupancyInBlocks;
224 if (occupancyInThreads > maxOccupancy) {
225 maxBlockSize = blockSizeToTry;
226 numBlocks = occupancyInBlocks;
227 maxOccupancy = occupancyInThreads;
232 if (occupancyLimit == maxOccupancy) {
243 *minGridSize = numBlocks * multiProcessorCount;
244 *blockSize = maxBlockSize;
249 #if CUDA_VERSION > 10000 252 template <
typename UnaryFunction>
254 const void* kernel_function_ptr,
256 UnaryFunction determine_shared_mem_by_block_size,
258 bool disable_caching_override)
260 int min_grid_size_in_blocks { 0 };
261 int block_size { 0 };
265 unsigned flags = disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
266 auto result = (
cuda::status_t) cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_<UnaryFunction, const void*>(
267 &min_grid_size_in_blocks,
270 determine_shared_mem_by_block_size,
271 (
int) block_size_limit,
275 "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_function_ptr, device_id)
276 +
" with maximum occupancy given dynamic shared memory and block size data");
279 #endif // CUDA_VERSION > 10000 282 const void* kernel_function_ptr,
285 bool disable_caching_override)
290 auto flags = (unsigned) disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
291 status = (
cuda::status_t) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
292 &result, kernel_function_ptr, (
int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags);
294 "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
318 const void *ptr()
const noexcept {
return ptr_; }
320 const void *
get()
const noexcept {
return ptr_; }
323 explicit operator const void *() noexcept {
return ptr_; }
328 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 352 return attributes().binary_compilation_target_architecture();
362 return attributes().maxThreadsPerBlock;
367 #if CUDA_VERSION > 10000 371 bool disable_caching_override =
false)
const override 373 auto shared_memory_size_determiner =
374 [dynamic_shared_memory_size](int) ->
size_t {
return dynamic_shared_memory_size; };
375 return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
377 shared_memory_size_determiner,
378 block_size_limit, disable_caching_override);
384 bool disable_caching_override =
false)
const override 386 return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
388 shared_memory_size_determiner,
389 block_size_limit, disable_caching_override);
414 bool disable_caching_override =
false)
const override 416 return apriori_compiled::occupancy::detail_::max_active_blocks_per_multiprocessor(
418 block_size_in_threads,
419 dynamic_shared_memory_per_block,
420 disable_caching_override);
422 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 426 kernel::handle_t handle,
const void *f,
bool hold_pc_refcount_unit)
427 :
kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f) {
430 assert(f !=
nullptr &&
"Attempt to construct a kernel object for a nullptr kernel function pointer");
437 bool hold_primary_context_refcount_unit)
440 primary_context_handle,
441 apriori_compiled::detail_::get_handle(f),
443 hold_primary_context_refcount_unit)
454 const void *
const ptr_;
457 namespace apriori_compiled {
466 bool hold_primary_context_refcount_unit)
468 return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
471 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 474 return "apriori-compiled kernel " + cuda::detail_::ptr_as_hex(kernel.
ptr())
475 +
" in " + context::detail_::identify(kernel.
context());
477 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE 481 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE 484 auto handle = detail_::get_handle(function_ptr);
485 return kernel::detail_::get_attribute_in_current_context(handle, attribute);
490 auto handle = detail_::get_handle(function_ptr);
491 return kernel::detail_::set_attribute_in_current_context(handle, attribute, value);
496 const void* function_ptr,
499 CAW_SET_SCOPE_CONTEXT(context.handle());
500 return get_attribute(function_ptr, attribute);
503 inline void set_attribute(
505 const void* function_ptr,
509 CAW_SET_SCOPE_CONTEXT(context.handle());
510 return set_attribute(function_ptr, attribute, value);
512 #endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE 529 template<
typename KernelFunctionPtr>
533 template<
typename KernelFunctionPtr>
540 #endif // CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_ int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:988
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:299
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
cuda::device::compute_capability_t ptx_version() const override
The PTX intermediate-representation language used in the compilation of this kernel (whether as the o...
Definition: apriori_compiled.hpp:345
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
a wrapper around cudaFuncAttributes, offering a few convenience member functions. ...
Definition: apriori_compiled.hpp:65
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Does nothing if the status indicates success; otherwise, throws an exception carrying the given message (hence the noexcept(false) specification).
Definition: error.hpp:335
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:730
grid::block_dimension_t maximum_threads_per_block() const override
The constraint on the block size in threads for launch grids of this kernel in its associated context...
Definition: apriori_compiled.hpp:360
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
static constexpr compute_capability_t from_combined_number(unsigned combined) noexcept
Converts a single-number representation of a compute capability into a proper structured instance of ...
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:508
cuda::device::compute_capability_t ptx_version() const noexcept
See apriori_compiled_t::ptx_version()
Definition: apriori_compiled.hpp:68
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension)
Definition: array.hpp:34
array_t< T, NumDimensions > wrap(device::id_t device_id, context::handle_t context_handle, handle_t handle, dimensions_t< NumDimensions > dimensions) noexcept
Wrap an existing CUDA array in an array_t instance.
Definition: array.hpp:264
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:94
const void * ptr() const noexcept
Access the raw __global__ kernel function pointer - without any type information. ...
Definition: apriori_compiled.hpp:319
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
grid::dimension_t max_active_blocks_per_multiprocessor(grid::block_dimension_t block_size_in_threads, memory::shared::size_t dynamic_shared_memory_per_block, bool disable_caching_override=false) const override
Calculates the number of grid blocks which may be "active" on a given GPU multiprocessor simultaneous...
Definition: apriori_compiled.hpp:411
context_t context() const noexcept
Get (a proxy for) the context in which this kernel is defined.
Definition: kernel.hpp:22
Wrapper class for a CUDA device.
Definition: device.hpp:135
size_t(CUDA_CB *)(int block_size) shared_memory_size_determiner_t
Signature of a function for determining the shared memory size a kernel will use, given the block siz...
Definition: kernel.hpp:44
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:983