#ifndef MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_
#define MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_

#include "../kernels/apriori_compiled.hpp"

namespace cuda {

#if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
#if defined(__CUDACC__)

inline apriori_compiled::attributes_t apriori_compiled_t::attributes() const
{
    CAW_SET_SCOPE_CONTEXT(context_handle_);
    apriori_compiled::attributes_t function_attributes;
    auto status = cudaFuncGetAttributes(&function_attributes, ptr_);
    throw_if_error_lazy(status, "Failed obtaining attributes for a CUDA device function");
    return function_attributes;
}
inline void apriori_compiled_t::set_cache_preference(multiprocessor_cache_preference_t preference) const
{
    CAW_SET_SCOPE_CONTEXT(context_handle_);
    auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference);
    throw_if_error_lazy(result,
        "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
        "CUDA device function");
}
inline void apriori_compiled_t::set_shared_memory_bank_size(
    multiprocessor_shared_memory_bank_size_option_t config) const
{
    CAW_SET_SCOPE_CONTEXT(context_handle_);
    auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config);
    throw_if_error_lazy(result,
        "Failed setting shared memory bank size to " + ::std::to_string(config));
}
inline void apriori_compiled_t::set_attribute(attribute_t attribute, attribute_value_t value) const
{
    CAW_SET_SCOPE_CONTEXT(context_handle_);
    // Only the two attributes which the Runtime API knows how to set are supported here
    cudaFuncAttribute runtime_attribute = [attribute]() {
        switch (attribute) {
        case CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
            return cudaFuncAttributeMaxDynamicSharedMemorySize;
        case CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
            return cudaFuncAttributePreferredSharedMemoryCarveout;
        default:
            throw cuda::runtime_error(cuda::status::not_supported,
                "Kernel attribute " + ::std::to_string(attribute) +
                " not supported (with CUDA version "
                + ::std::to_string(CUDA_VERSION));
        }
    }();
    auto result = cudaFuncSetAttribute(ptr_, runtime_attribute, value);
    throw_if_error_lazy(result,
        "Setting CUDA device function attribute " + ::std::to_string(attribute) +
        " to value " + ::std::to_string(value));
}
// Without the ability to obtain a driver-API handle for the kernel, individual
// attribute values are extracted from the full runtime-API attributes structure.
inline attribute_value_t apriori_compiled_t::get_attribute(attribute_t attribute) const
{
    apriori_compiled::attributes_t attrs = attributes();
    switch (attribute) {
    case CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
        return attrs.maxThreadsPerBlock;
    case CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
        return attrs.sharedSizeBytes;
    case CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
        return attrs.constSizeBytes;
    case CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
        return attrs.localSizeBytes;
    case CU_FUNC_ATTRIBUTE_NUM_REGS:
        return attrs.numRegs;
    case CU_FUNC_ATTRIBUTE_PTX_VERSION:
        return attrs.ptxVersion;
    case CU_FUNC_ATTRIBUTE_BINARY_VERSION:
        return attrs.binaryVersion;
    case CU_FUNC_ATTRIBUTE_CACHE_MODE_CA:
        return attrs.cacheModeCA;
    case CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
        return attrs.maxDynamicSharedSizeBytes;
    case CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
        return attrs.preferredShmemCarveout;
    default:
        throw cuda::runtime_error(cuda::status::not_supported,
            ::std::string("Attribute ") +
#ifdef NDEBUG
            ::std::to_string(static_cast<::std::underlying_type<attribute_t>::type>(attribute))
#else
            detail_::attribute_name(attribute)
#endif
            + " cannot be obtained for apriori-compiled kernels before CUDA version 11.0");
    }
}

#endif // defined(__CUDACC__)
#endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE

namespace apriori_compiled {
namespace detail_ {

// Wraps a __global__ function pointer in an apriori_compiled_t associated with
// the given device's primary context.
template<typename KernelFunctionPtr>
apriori_compiled_t get(
    device::id_t device_id,
    context::handle_t primary_context_handle,
    KernelFunctionPtr function_ptr)
{
    static_assert(
        ::std::is_pointer<KernelFunctionPtr>::value
        and ::std::is_function<typename ::std::remove_pointer<KernelFunctionPtr>::type>::value,
        "function must be a bona fide pointer to a kernel (__global__) function");

    auto ptr_ = reinterpret_cast<const void *>(function_ptr);
#if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
    auto handle = detail_::get_handle(ptr_);
#else
    auto handle = nullptr; // no driver-API handle can be obtained before CUDA 11.0
#endif
    return wrap(device_id, primary_context_handle, handle, ptr_, do_hold_primary_context_refcount_unit);
}

} // namespace detail_

} // namespace apriori_compiled
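// A hedged usage sketch (added for illustration, not present in the original
// header): obtaining a wrapper for an apriori-compiled kernel and querying it.
// `vector_add` is a hypothetical __global__ function, and it is assumed here
// that this get() is the overload exposed to users as cuda::kernel::get().
//
//     __global__ void vector_add(float* out, const float* a, const float* b, size_t n);
//
//     auto device = cuda::device::current::get();
//     auto kern = cuda::kernel::get(device, vector_add);
//     auto max_threads = kern.get_attribute(CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK);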
template<typename KernelFunctionPtr>
apriori_compiled_t get(const device_t& device, KernelFunctionPtr function_ptr)
{
    auto primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device.id());
    return apriori_compiled::detail_::get(device.id(), primary_context_handle, function_ptr);
}
namespace detail_ {

inline ::cuda::device::primary_context_t
get_implicit_primary_context(const apriori_compiled_t& kernel)
{
    // Delegate to the overload taking the base kernel_t wrapper
    const kernel_t& kernel_ = kernel;
    return get_implicit_primary_context(kernel_);
}

} // namespace detail_

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_