cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
apriori_compiled_kernel.hpp
Go to the documentation of this file.
1 
10 #pragma once
11 #ifndef MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_
12 #define MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_
13 
14 #include "../kernels/apriori_compiled.hpp"
15 #include "device.hpp"
16 #include "kernel.hpp"
17 
18 namespace cuda {
19 
20 namespace kernel {
21 
22 #if ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
23 #if defined(__CUDACC__)
24 
25 // Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy
26 // from code compiled with a host-side-only compiler! See cuda_runtime.h for details
27 
28 inline apriori_compiled::attributes_t apriori_compiled_t::attributes() const
29 {
30  // Note: assuming the primary context is active
31  CAW_SET_SCOPE_CONTEXT(context_handle_);
32  apriori_compiled::attributes_t function_attributes;
33  auto status = cudaFuncGetAttributes(&function_attributes, ptr_);
34  throw_if_error_lazy(status, "Failed obtaining attributes for a CUDA device function");
35  return function_attributes;
36 }
37 
// NOTE(review): the signature line (original line 38) is folded out of this
// Doxygen view; per the member listing this should be
// apriori_compiled_t::set_cache_preference(multiprocessor_cache_preference_t preference)
// — confirm against the actual header.
39 {
40  // Note: assuming the primary context is active
41  CAW_SET_SCOPE_CONTEXT(context_handle_);
 // Translate the wrapper's preference enum into the runtime API's
 // cudaFuncCache value; presumably the enumerators match numerically — TODO confirm.
42  auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference);
43  throw_if_error_lazy(result,
44  "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
45  "CUDA device function");
46 }
47 
// NOTE(review): the signature lines (original lines 48-49) are folded out of
// this Doxygen view; per the member listing this should be
// apriori_compiled_t::set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config)
// — confirm against the actual header.
50 {
51  // Note: assuming the primary context is active
52  CAW_SET_SCOPE_CONTEXT(context_handle_);
 // Translate the wrapper's bank-size option into the runtime API's
 // cudaSharedMemConfig value; presumably the enumerators match numerically — TODO confirm.
53  auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config);
54  throw_if_error_lazy(result, "Failed setting shared memory bank size to " + ::std::to_string(config));
55 }
56 
57 inline void apriori_compiled_t::set_attribute(attribute_t attribute, attribute_value_t value) const
58 {
59  // Note: assuming the primary context is active
60  CAW_SET_SCOPE_CONTEXT(context_handle_);
61  cudaFuncAttribute runtime_attribute = [attribute]() {
62  switch (attribute) {
63  case CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
64  return cudaFuncAttributeMaxDynamicSharedMemorySize;
65  case CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
66  return cudaFuncAttributePreferredSharedMemoryCarveout;
67  default:
68  throw cuda::runtime_error(status::not_supported,
69  "Kernel attribute " + ::std::to_string(attribute) + " not supported (with CUDA version "
70  + ::std::to_string(CUDA_VERSION));
71  }
72  }();
73  auto result = cudaFuncSetAttribute(ptr_, runtime_attribute, value);
74  throw_if_error_lazy(result, "Setting CUDA device function attribute " + ::std::to_string(attribute) + " to value " + ::std::to_string(value));
75 }
76 
// Read a single attribute of this kernel by fetching the full runtime-API
// attribute struct (via attributes()) and selecting the matching field.
// Only attributes with a corresponding cudaFuncAttributes field can be
// obtained this way; anything else throws status::not_supported.
77 inline attribute_value_t apriori_compiled_t::get_attribute(attribute_t attribute) const
78 {
79  apriori_compiled::attributes_t attrs = attributes();
80  switch(attribute) {
81  case CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK:
82  return attrs.maxThreadsPerBlock;
83  case CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES:
84  return attrs.sharedSizeBytes;
85  case CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES:
86  return attrs.constSizeBytes;
87  case CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES:
88  return attrs.localSizeBytes;
89  case CU_FUNC_ATTRIBUTE_NUM_REGS:
90  return attrs.numRegs;
91  case CU_FUNC_ATTRIBUTE_PTX_VERSION:
92  return attrs.ptxVersion;
93  case CU_FUNC_ATTRIBUTE_BINARY_VERSION:
94  return attrs.binaryVersion;
95  case CU_FUNC_ATTRIBUTE_CACHE_MODE_CA:
96  return attrs.cacheModeCA;
97  case CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES:
98  return attrs.maxDynamicSharedSizeBytes;
99  case CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT:
100  return attrs.preferredShmemCarveout;
101  default:
 // In release builds (NDEBUG defined) the message carries the numeric
 // attribute value; in debug builds, a human-readable attribute name.
102  throw cuda::runtime_error(status::not_supported,
103  ::std::string("Attribute ") +
104 #ifdef NDEBUG
105  ::std::to_string(static_cast<::std::underlying_type<attribute_t>::type>(attribute))
106 #else
107  detail_::attribute_name(attribute)
108 #endif
109  + " cannot be obtained for apriori-compiled kernels before CUDA version 11.0"
110  );
111  }
112 }
113 
114 #endif // defined(__CUDACC__)
115 #endif // ! CAW_CAN_GET_APRIORI_KERNEL_HANDLE
116 
117 namespace apriori_compiled {
118 
119 namespace detail_ {
120 
// Build an apriori_compiled_t wrapper around a __global__ function pointer.
// NOTE(review): Doxygen folded out original line 123 here — the wrap() call
// below uses `device_id` and `do_hold_primary_context_refcount_unit`, so at
// least a `device::id_t device_id` parameter is missing from this view;
// confirm against the actual header.
121 template<typename KernelFunctionPtr>
122 apriori_compiled_t get(
124  context::handle_t & primary_context_handle,
125  KernelFunctionPtr function_ptr)
126 {
 // Reject anything that is not a function pointer at compile time.
127  static_assert(
128  ::std::is_pointer<KernelFunctionPtr>::value
129  and ::std::is_function<typename ::std::remove_pointer<KernelFunctionPtr>::type>::value,
130  "function must be a bona fide pointer to a kernel (__global__) function");
131 
132  auto ptr_ = reinterpret_cast<const void *>(function_ptr);
 // A driver-API handle for an apriori-compiled kernel can only be obtained
 // on sufficiently new CUDA versions; otherwise the wrapper holds nullptr.
133 #if CAW_CAN_GET_APRIORI_KERNEL_HANDLE
134  auto handle = detail_::get_handle(ptr_);
135 #else
136  auto handle = nullptr;
137 #endif
138  return wrap(device_id, primary_context_handle, handle, ptr_, do_hold_primary_context_refcount_unit);
139 }
140 
141 } // namespace detail_
142 
143 } // namespace apriori_compiled
144 
145 
155 template<typename KernelFunctionPtr>
156 apriori_compiled_t get(const device_t &device, KernelFunctionPtr function_ptr)
157 {
158  auto primary_context_handle = device::primary_context::detail_::obtain_and_increase_refcount(device.id());
159  return apriori_compiled::detail_::get(device.id(), primary_context_handle, function_ptr);
160 }
161 
162 } // namespace kernel
163 
164 namespace detail_ {
165 
166 template<>
167 inline ::cuda::device::primary_context_t
168 get_implicit_primary_context<kernel::apriori_compiled_t>(kernel::apriori_compiled_t kernel)
169 {
170  const kernel_t &kernel_ = kernel;
171  return get_implicit_primary_context(kernel_);
172 }
173 
174 } // namespace detail_
175 
176 } // namespace cuda
177 
178 #endif // MULTI_WRAPPER_IMPLS_APRIORI_COMPILED_KERNEL_HPP_
179 
kernel::handle_t handle() const
Get the raw (intra-context) CUDA handle for this kernel.
Definition: kernel.hpp:181
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
device_t device() const noexcept
Get (a proxy for) the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:28
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:830
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard ker...
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard dev...
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
A subclass of the kernel_t interface for kernels being functions marked as global in source files and...
Definition: apriori_compiled.hpp:310
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension)
Definition: array.hpp:34
array_t< T, NumDimensions > wrap(device::id_t device_id, context::handle_t context_handle, handle_t handle, dimensions_t< NumDimensions > dimensions) noexcept
Wrap an existing CUDA array in an array_t instance.
Definition: array.hpp:264
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
void set_cache_preference(multiprocessor_cache_preference_t preference) const override
See context_t::cache_preference()
device::id_t device_id() const noexcept
Get the id of the device for (a context of) which this kernel is defined.
Definition: kernel.hpp:169
apriori_compiled::attributes_t attributes() const
Obtain the set of all attributes one can obtain individually with get_attribute.
void set_shared_memory_bank_size(multiprocessor_shared_memory_bank_size_option_t config) const override
See context_t::shared_memory_bank_size()
Wrapper class for a CUDA device.
Definition: device.hpp:135