cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
apriori_compiled.hpp
#pragma once
#ifndef CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_
#define CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_

#include "../kernel.hpp"

// The following is needed for occupancy-related calculation convenience
// and kernel-attribute-related API functions
#include <cuda_runtime.h>

#include <type_traits>

namespace cuda {

namespace kernel {

class apriori_compiled_t;

namespace apriori_compiled {

namespace detail_ {

#if CUDA_VERSION < 11000
inline handle_t get_handle(const void *, const char* = nullptr)
{
    throw cuda::runtime_error(status::not_supported,
        "Only CUDA versions 11.0 and later support obtaining CUDA driver handles "
        "for kernels compiled alongside the program source");
}
#else
inline handle_t get_handle(const void *kernel_function_ptr, const char* name = nullptr)
{
    handle_t handle;
    auto status = cudaGetFuncBySymbol(&handle, kernel_function_ptr);
    throw_if_error_lazy(status, "Failed obtaining a CUDA function handle for "
        + ((name == nullptr) ? ::std::string("a kernel function") : ::std::string("kernel function ") + name)
        + " at " + cuda::detail_::ptr_as_hex(kernel_function_ptr));
    return handle;
}
#endif
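
// Illustrative sketch (not part of the original header): how get_handle() might
// be used directly. `my_kernel` is a hypothetical __global__ function; on CUDA
// 11.0 and later the call yields the driver-API handle which the runtime
// associates with the compiled kernel:
//
//     __global__ void my_kernel(int* data);
//     ...
//     auto handle = get_handle((const void*) my_kernel, "my_kernel");
//     // `handle` may now be passed to driver-API calls taking a CUfunction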

apriori_compiled_t wrap(
    device::id_t device_id,
    context::handle_t primary_context_handle,
    kernel::handle_t handle,
    const void* ptr,
    bool hold_primary_context_refcount_unit = false);

} // namespace detail_


#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

struct attributes_t : cudaFuncAttributes {

    cuda::device::compute_capability_t ptx_version() const noexcept {
        return device::compute_capability_t::from_combined_number(ptxVersion);
    }

    cuda::device::compute_capability_t binary_compilation_target_architecture() const noexcept {
        return device::compute_capability_t::from_combined_number(binaryVersion);
    }
};

#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
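
// Illustrative sketch (an assumption, not from the original header): since
// attributes_t publicly derives from cudaFuncAttributes, a pointer to it
// converts to cudaFuncAttributes* and can be filled directly by the runtime's
// attribute query, after which the convenience getters interpret the raw
// ptxVersion/binaryVersion numbers. `my_kernel` is a hypothetical kernel:
//
//     attributes_t attrs;
//     auto status = cudaFuncGetAttributes(&attrs, (const void*) my_kernel);
//     if (status == cudaSuccess) {
//         auto ptx_target  = attrs.ptx_version();
//         auto sass_target = attrs.binary_compilation_target_architecture();
//     }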

namespace occupancy {

namespace detail_ {

#if CUDA_VERSION < 11000

template<typename UnaryFunction, class T>
static __inline__ cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
    int *minGridSize,
    int *blockSize,
    T func,
    UnaryFunction blockSizeToDynamicSMemSize,
    int blockSizeLimit = 0,
    unsigned int flags = 0)
{
    cudaError_t status;

    // Device and function properties
    int device;
    struct cudaFuncAttributes attr;

    // Limits
    int maxThreadsPerMultiProcessor;
    int warpSize;
    int devMaxThreadsPerBlock;
    int multiProcessorCount;
    int funcMaxThreadsPerBlock;
    int occupancyLimit;
    int granularity;

    // Recorded maximum
    int maxBlockSize = 0;
    int numBlocks = 0;
    int maxOccupancy = 0;

    // Temporary
    int blockSizeToTryAligned;
    int blockSizeToTry;
    int blockSizeLimitAligned;
    int occupancyInBlocks;
    int occupancyInThreads;
    size_t dynamicSMemSize;

    ///////////////////////////
    // Check user input
    ///////////////////////////

    if (!minGridSize || !blockSize || !func) {
        return cudaErrorInvalidValue;
    }

    ///////////////////////////
    // Obtain device and function properties
    ///////////////////////////

    status = ::cudaGetDevice(&device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &maxThreadsPerMultiProcessor,
        cudaDevAttrMaxThreadsPerMultiProcessor,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &warpSize,
        cudaDevAttrWarpSize,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &devMaxThreadsPerBlock,
        cudaDevAttrMaxThreadsPerBlock,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaDeviceGetAttribute(
        &multiProcessorCount,
        cudaDevAttrMultiProcessorCount,
        device);
    if (status != cudaSuccess) {
        return status;
    }

    status = cudaFuncGetAttributes(&attr, func);
    if (status != cudaSuccess) {
        return status;
    }

    funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;

    ///////////////////////////
    // Try each block size, and pick the block size with maximum occupancy
    ///////////////////////////

    occupancyLimit = maxThreadsPerMultiProcessor;
    granularity = warpSize;

    if (blockSizeLimit == 0) {
        blockSizeLimit = devMaxThreadsPerBlock;
    }

    if (devMaxThreadsPerBlock < blockSizeLimit) {
        blockSizeLimit = devMaxThreadsPerBlock;
    }

    if (funcMaxThreadsPerBlock < blockSizeLimit) {
        blockSizeLimit = funcMaxThreadsPerBlock;
    }

    blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;

    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
        // This is needed for the first iteration, because
        // blockSizeLimitAligned could be greater than blockSizeLimit
        //
        if (blockSizeLimit < blockSizeToTryAligned) {
            blockSizeToTry = blockSizeLimit;
        } else {
            blockSizeToTry = blockSizeToTryAligned;
        }

        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);

        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
            &occupancyInBlocks,
            func,
            blockSizeToTry,
            dynamicSMemSize,
            flags);

        if (status != cudaSuccess) {
            return status;
        }

        occupancyInThreads = blockSizeToTry * occupancyInBlocks;

        if (occupancyInThreads > maxOccupancy) {
            maxBlockSize = blockSizeToTry;
            numBlocks = occupancyInBlocks;
            maxOccupancy = occupancyInThreads;
        }

        // Early out if we have reached the maximum
        //
        if (occupancyLimit == maxOccupancy) {
            break;
        }
    }

    ///////////////////////////
    // Return best available
    ///////////////////////////

    // Suggested min grid size to achieve a full machine launch
    //
    *minGridSize = numBlocks * multiProcessorCount;
    *blockSize = maxBlockSize;

    return status;
}
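
// Sketch of how the search above is typically parameterized (illustrative only;
// `my_kernel` and the lambda are hypothetical). The UnaryFunction argument maps
// a candidate block size to the dynamic shared memory the kernel needs at that
// size - here, one int per thread:
//
//     auto smem_per_block = [](int block_size) -> size_t {
//         return block_size * sizeof(int);
//     };
//     int min_grid_size, block_size;
//     auto status = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_(
//         &min_grid_size, &block_size, (const void*) my_kernel, smem_per_block);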

#if CUDA_VERSION > 10000
// Note: If determine_shared_mem_by_block_size is not null, fixed_shared_mem_size is ignored;
// if block_size_limit is 0, it is ignored.
template <typename UnaryFunction>
inline grid::composite_dimensions_t min_grid_params_for_max_occupancy(
    const void* kernel_function_ptr,
    cuda::device::id_t device_id,
    UnaryFunction determine_shared_mem_by_block_size,
    cuda::grid::block_dimension_t block_size_limit,
    bool disable_caching_override)
{
    int min_grid_size_in_blocks { 0 };
    int block_size { 0 };
    // Note: only initializing the values here because of a
    // spurious (?) compiler warning about potential uninitialized use.

    unsigned flags = disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
    auto result = (cuda::status_t) cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags_<UnaryFunction, const void*>(
        &min_grid_size_in_blocks,
        &block_size,
        kernel_function_ptr,
        determine_shared_mem_by_block_size,
        (int) block_size_limit,
        flags);

    throw_if_error(result,
        "Failed obtaining parameters for a minimum-size grid for " + kernel::detail_::identify(kernel_function_ptr, device_id)
        + " with maximum occupancy given dynamic shared memory and block size data");
    return { (grid::dimension_t) min_grid_size_in_blocks, (grid::block_dimension_t) block_size };
}
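
// Sketch of a direct call to the helper above (illustrative; the real caller is
// an apriori_compiled_t member function further below, and `my_kernel` is
// hypothetical). A block_size_limit of 0 means "no limit", and passing `true`
// as the last argument requests cudaOccupancyDisableCachingOverride:
//
//     auto params = min_grid_params_for_max_occupancy(
//         (const void*) my_kernel, device_id,
//         [](int) -> size_t { return 0; },  // no dynamic shared memory at any block size
//         0,                                // no block size limit
//         true);                            // disable the caching override
//     // `params` carries both the suggested minimum grid size and the block size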
#endif // CUDA_VERSION > 10000

inline grid::dimension_t max_active_blocks_per_multiprocessor(
    const void* kernel_function_ptr,
    grid::block_dimension_t block_size_in_threads,
    memory::shared::size_t dynamic_shared_memory_per_block,
    bool disable_caching_override)
{
    // Assuming we don't need to set the current device here
    int result;
    cuda::status_t status = CUDA_SUCCESS;
    auto flags = (unsigned) (disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault);
    status = (cuda::status_t) cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        &result, kernel_function_ptr, (int) block_size_in_threads, (int) dynamic_shared_memory_per_block, flags);
    throw_if_error(status,
        "Determining the maximum occupancy in blocks per multiprocessor, given the block size and the amount of dynamic memory per block");
    return result;
}
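
// Worked example of the quantity computed above (illustrative numbers): on a
// device allowing 2048 resident threads per multiprocessor, if a 256-thread
// block's register and shared-memory usage permit 6 resident blocks, occupancy
// is 6 * 256 = 1536 threads, i.e. 75% of the hardware limit:
//
//     auto num_blocks = max_active_blocks_per_multiprocessor(
//         (const void*) my_kernel,
//         256,     // block size, in threads
//         1024,    // dynamic shared memory per block, in bytes
//         false);  // keep the default caching behavior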

#endif // CUDA_VERSION < 11000

} // namespace detail_

} // namespace occupancy

} // namespace apriori_compiled

class apriori_compiled_t final : public kernel_t {
public: // getters
    const void *ptr() const noexcept { return ptr_; }
    const void *get() const noexcept { return ptr_; }

public: // type conversions
    explicit operator const void *() noexcept { return ptr_; }

public: // non-mutators

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

    apriori_compiled::attributes_t attributes() const;

    void set_cache_preference(multiprocessor_cache_preference_t preference) const override;

    void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const override;

    cuda::device::compute_capability_t ptx_version() const override
    {
        return attributes().ptx_version();
    }

    cuda::device::compute_capability_t binary_compilation_target_architecture() const override
    {
        return attributes().binary_compilation_target_architecture();
    }

    grid::block_dimension_t maximum_threads_per_block() const override
    {
        return attributes().maxThreadsPerBlock;
    }

    void set_attribute(kernel::attribute_t attribute, kernel::attribute_value_t value) const override;

#if CUDA_VERSION > 10000
    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        memory::shared::size_t dynamic_shared_memory_size = no_dynamic_shared_memory,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const override
    {
        auto shared_memory_size_determiner =
            [dynamic_shared_memory_size](int) -> size_t { return dynamic_shared_memory_size; };
        return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
            ptr(), device_id(),
            shared_memory_size_determiner,
            block_size_limit, disable_caching_override);
    }

    grid::composite_dimensions_t min_grid_params_for_max_occupancy(
        kernel::shared_memory_size_determiner_t shared_memory_size_determiner,
        grid::block_dimension_t block_size_limit = 0,
        bool disable_caching_override = false) const override
    {
        return kernel::apriori_compiled::occupancy::detail_::min_grid_params_for_max_occupancy(
            ptr(), device_id(),
            shared_memory_size_determiner,
            block_size_limit, disable_caching_override);
    }
#endif // CUDA_VERSION > 10000

    kernel::attribute_value_t get_attribute(kernel::attribute_t attribute) const override;

    grid::dimension_t max_active_blocks_per_multiprocessor(
        grid::block_dimension_t block_size_in_threads,
        memory::shared::size_t dynamic_shared_memory_per_block,
        bool disable_caching_override = false) const override
    {
        return apriori_compiled::occupancy::detail_::max_active_blocks_per_multiprocessor(
            ptr(),
            block_size_in_threads,
            dynamic_shared_memory_per_block,
            disable_caching_override);
    }
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

protected: // ctors & dtor
    apriori_compiled_t(device::id_t device_id, context::handle_t primary_context_handle,
        kernel::handle_t handle, const void *f, bool hold_pc_refcount_unit)
    : kernel_t(device_id, primary_context_handle, handle, hold_pc_refcount_unit), ptr_(f)
    {
        // TODO: Consider checking whether this actually is a device function, at all and in this context
#ifndef NDEBUG
        assert(f != nullptr && "Attempt to construct a kernel object for a nullptr kernel function pointer");
#endif
    }
    apriori_compiled_t(
        device::id_t device_id,
        context::handle_t primary_context_handle,
        const void *f,
        bool hold_primary_context_refcount_unit)
    : apriori_compiled_t(
        device_id,
        primary_context_handle,
        apriori_compiled::detail_::get_handle(f),
        f,
        hold_primary_context_refcount_unit)
    { }

public: // ctors & dtor
    apriori_compiled_t(const apriori_compiled_t&) = default;
    apriori_compiled_t(apriori_compiled_t&&) = default;

public: // friends
    friend apriori_compiled_t apriori_compiled::detail_::wrap(device::id_t, context::handle_t, kernel::handle_t, const void*, bool);

protected: // data members
    const void *const ptr_;
}; // class apriori_compiled_t

namespace apriori_compiled {

namespace detail_ {

inline apriori_compiled_t wrap(
    device::id_t device_id,
    context::handle_t primary_context_handle,
    kernel::handle_t f,
    const void * ptr,
    bool hold_primary_context_refcount_unit)
{
    return { device_id, primary_context_handle, f, ptr, hold_primary_context_refcount_unit };
}

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
inline ::std::string identify(const apriori_compiled_t& kernel)
{
    return "apriori-compiled kernel " + cuda::detail_::ptr_as_hex(kernel.ptr())
        + " in " + context::detail_::identify(kernel.context());
}
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

} // namespace detail_

#if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
inline attribute_value_t get_attribute(const void* function_ptr, attribute_t attribute)
{
    auto handle = detail_::get_handle(function_ptr);
    return kernel::detail_::get_attribute_in_current_context(handle, attribute);
}

inline void set_attribute(const void* function_ptr, attribute_t attribute, attribute_value_t value)
{
    auto handle = detail_::get_handle(function_ptr);
    return kernel::detail_::set_attribute_in_current_context(handle, attribute, value);
}

inline attribute_value_t get_attribute(
    const context_t& context,
    const void* function_ptr,
    attribute_t attribute)
{
    CAW_SET_SCOPE_CONTEXT(context.handle());
    return get_attribute(function_ptr, attribute);
}

inline void set_attribute(
    const context_t& context,
    const void* function_ptr,
    attribute_t attribute,
    attribute_value_t value)
{
    CAW_SET_SCOPE_CONTEXT(context.handle());
    return set_attribute(function_ptr, attribute, value);
}
#endif // CAW_CAN_GET_APRIORI_KERNEL_HANDLE

} // namespace apriori_compiled


template<typename KernelFunctionPtr>
apriori_compiled_t get(const device_t& device, KernelFunctionPtr function_ptr);

template<typename KernelFunctionPtr>
apriori_compiled_t get(context_t context, KernelFunctionPtr function_ptr);

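// Illustrative usage of the two factory functions above (a sketch; `my_kernel`
// is a hypothetical __global__ function). Both overloads return a proxy through
// which the kernel's attributes and occupancy characteristics can be queried;
// availability of some members depends on the CAW_CAN_GET_APRIORI_KERNEL_HANDLE
// configuration:
//
//     auto device = cuda::device::get(0);
//     auto kernel = cuda::kernel::get(device, my_kernel);
//     auto max_block_size = kernel.maximum_threads_per_block();
//     auto grid_params = kernel.min_grid_params_for_max_occupancy();
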
} // namespace kernel

} // namespace cuda

#endif // CUDA_API_WRAPPERS_KERNELS_APRIORI_COMPILED_HPP_