#ifndef CUDA_API_WRAPPERS_COMMON_TYPES_HPP_
#define CUDA_API_WRAPPERS_COMMON_TYPES_HPP_

#if (__cplusplus < 201103L && (!defined(_MSVC_LANG) || _MSVC_LANG < 201103L))
#error "The CUDA API headers can only be compiled with C++11 or a later version of the C++ language standard"
#endif

#include "detail/optional.hpp"
#include "detail/optional_ref.hpp"
#include "detail/span.hpp"
#include "detail/region.hpp"
#include "detail/type_traits.hpp"

#include <builtin_types.h>

#include <type_traits>

// ...

template <typename T, size_t N>
// ...
// (the following definitions are within namespace cuda::array)

// CUDA's array memory-objects are multi-dimensional; their dimensions, or extents,
// are not the same as grid dimensions, and may be much larger along each axis.
template<dimensionality_t NumDimensions>
struct dimensions_t;

// Dimensions of a 3D CUDA array
template<>
struct dimensions_t<3>
{
    dimension_t width, height, depth;

    constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_, dimension_t depth_)
        : width(width_), height(height_), depth(depth_) { }
    constexpr __host__ __device__ dimensions_t(cudaExtent e)
        : dimensions_t(e.width, e.height, e.depth) { }
    constexpr __host__ __device__ dimensions_t(const dimensions_t& other)
        : dimensions_t(other.width, other.height, other.depth) { }
    constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
        : dimensions_t(other.width, other.height, other.depth) { }
    constexpr __host__ __device__ dimensions_t(dimension_t linear_size)
        : dimensions_t(linear_size, 1, 1) { }

    CPP14_CONSTEXPR dimensions_t& operator=(const dimensions_t& other) = default;
    CPP14_CONSTEXPR dimensions_t& operator=(dimensions_t&& other) = default;

    constexpr __host__ __device__ operator cudaExtent() const
    {
        return { width, height, depth };
    }

    constexpr __host__ __device__ size_t volume() const { return width * height * depth; }
    constexpr __host__ __device__ size_t size() const { return volume(); }
    constexpr __host__ __device__ dimensionality_t dimensionality() const
    {
        return ((width > 1) + (height > 1) + (depth > 1));
    }

    // Named constructor idioms
    static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; }
    static constexpr __host__ __device__ dimensions_t zero() { return cube(0); }
};
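// Usage sketch (editorial addition, not part of the original header): creating and
// querying 3D array dimensions, and converting them to the raw CUDA type; assumes
// the names above are in scope.
inline cudaExtent array_dimensions_3d_example()
{
    constexpr dimensions_t<3> dims { 64, 32, 8 };
    static_assert(dims.volume() == 64 * 32 * 8, "unexpected volume");
    static_assert(dimensions_t<3>::cube(16).size() == 16 * 16 * 16, "unexpected size");
    return dims;   // implicit conversion via operator cudaExtent()
}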
// Dimensions of a 2D CUDA array; no "depth" in this case
template<>
struct dimensions_t<2>
{
    dimension_t width, height;

    constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_)
        : width(width_), height(height_) { }
    constexpr __host__ __device__ dimensions_t(const dimensions_t& other)
        : dimensions_t(other.width, other.height) { }
    constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
        : dimensions_t(other.width, other.height) { }
    constexpr __host__ __device__ dimensions_t(dimension_t linear_size)
        : dimensions_t(linear_size, 1) { }

    CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(const dimensions_t& other)
    {
        width = other.width; height = other.height;
        return *this;
    }
    CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(dimensions_t&& other)
    {
        width = other.width; height = other.height;
        return *this;
    }

    constexpr __host__ __device__ size_t area() const { return width * height; }
    constexpr __host__ __device__ size_t size() const { return area(); }
    constexpr __host__ __device__ dimensionality_t dimensionality() const
    {
        return ((width > 1) + (height > 1));
    }

    // Named constructor idioms
    static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x }; }
    static constexpr __host__ __device__ dimensions_t zero() { return square(0); }
};
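// Usage sketch (editorial addition): the 2D specialization is used the same way,
// with area() taking the place of volume().
inline size_t array_dimensions_2d_example()
{
    constexpr dimensions_t<2> dims { 1024, 768 };
    static_assert(dimensions_t<2>::square(4).size() == 16, "unexpected size");
    return dims.area();   // == dims.size() == 1024 * 768
}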
// ...

#if CUDA_VERSION >= 10000
// ... (contents of this conditional block not captured in this excerpt)
#endif

#if CUDA_VERSION >= 10000

namespace capture {

enum class mode_t : ::std::underlying_type<CUstreamCaptureMode>::type {
    global        = CU_STREAM_CAPTURE_MODE_GLOBAL,
    thread        = CU_STREAM_CAPTURE_MODE_THREAD_LOCAL,
    thread_local_ = thread,
    relaxed       = CU_STREAM_CAPTURE_MODE_RELAXED
};

enum class state_t : ::std::underlying_type<CUstreamCaptureStatus>::type {
    active      = CU_STREAM_CAPTURE_STATUS_ACTIVE,
    invalidated = CU_STREAM_CAPTURE_STATUS_INVALIDATED,
    none        = CU_STREAM_CAPTURE_STATUS_NONE,
};

} // namespace capture

inline bool is_capturing(capture::state_t status) noexcept
{
    return status == capture::state_t::active;
}

#endif // CUDA_VERSION >= 10000
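#if CUDA_VERSION >= 10000
// Usage sketch (editorial addition): interpreting a stream's capture status - e.g. one
// obtained through the cuStreamIsCapturing() driver call - using the enums above.
inline const char* describe_capture_state(capture::state_t state)
{
    if (is_capturing(state))                    { return "capturing work into a graph"; }
    if (state == capture::state_t::invalidated) { return "capture sequence was invalidated"; }
    return "not capturing";
}
#endif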
// (the following definitions are within namespace cuda::grid)

// A richer wrapper for CUDA's dim3 class, used to specify dimensions for blocks and
// for grids (in terms of blocks); unlike dim3, it also offers comparison, subscript
// access and a few named constructor idioms.
struct dimensions_t
{
    dimension_t x, y, z;

    constexpr __host__ __device__ dimensions_t(dimension_t x_, dimension_t y_, dimension_t z_)
        : x(x_), y(y_), z(z_) { }

    // ...

    constexpr __host__ __device__ operator uint3(void) const { return { x, y, z }; }

    // Not constexpr, since dim3's own constructors are not constexpr
    __host__ __device__ operator dim3(void) const noexcept { return { x, y, z }; }

    constexpr __host__ __device__ size_t volume() const noexcept { return static_cast<size_t>(x) * y * z; }
    constexpr __host__ __device__ dimensionality_t dimensionality() const noexcept
    {
        return ((z > 1) + (y > 1) + (x > 1));
    }

    // ...

    static constexpr bool divides(dimensions_t lhs, dimensions_t rhs)
    {
        return
            (rhs.x % lhs.x == 0) and
            (rhs.y % lhs.y == 0) and
            (rhs.z % lhs.z == 0);
    }

    // Array-like access to the dimensions along the different axes
    constexpr dimension_t operator[](int i) const noexcept
    {
        return (i == 0) ? x : (i == 1) ? y : z;
    }

    CPP14_CONSTEXPR dimension_t& operator[](int i) noexcept
    {
        return (i == 0) ? x : (i == 1) ? y : z;
    }
};

constexpr inline bool operator==(const dim3& lhs, const dim3& rhs) noexcept
{
    return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
}

constexpr inline bool operator!=(const dim3& lhs, const dim3& rhs) noexcept
{
    return not (lhs == rhs);
}

constexpr inline bool operator==(const dimensions_t& lhs, const dimensions_t& rhs) noexcept
{
    return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
}

constexpr inline bool operator!=(const dimensions_t& lhs, const dimensions_t& rhs) noexcept
{
    return not (lhs == rhs);
}
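// Usage sketch (editorial addition): the dim3-like wrapper above interoperates with
// CUDA's own dim3/uint3 types and adds a few conveniences which dim3 lacks.
inline bool grid_dimensions_usage_example()
{
    dimensions_t block { 32, 8, 1 };
    dimensions_t grid  { 64, 16, 1 };
    dim3 raw_block = block;                   // convert to the raw type for an actual launch
    return dimensions_t::divides(block, grid) // each grid extent is a multiple of the block's
        and block.volume() == 256
        and block[1] == 8
        and raw_block.x == 32;
}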
// Dimensions of a block within a grid, also expressed with the dim3-like wrapper
using block_dimensions_t = dimensions_t;

// Dimension of a grid in threads along one axis, i.e. the grid's dimension in blocks
// times the block dimension, along that axis
using overall_dimension_t = size_t;

// Dimensions of an entire grid in threads (rather than in blocks), along each of
// the (up to) 3 axes
struct overall_dimensions_t
{
    using dimension_type = overall_dimension_t;
    dimension_type x, y, z;

    constexpr __host__ __device__ overall_dimensions_t(
        dimension_type width_, dimension_type height_, dimension_type depth_) noexcept
        : x(width_), y(height_), z(depth_) { }

    // (parameter type of the next two converting constructors was not captured in
    // this excerpt; dim3 is assumed here)
    constexpr __host__ __device__ overall_dimensions_t(const dim3& dims) noexcept
        : x(dims.x), y(dims.y), z(dims.z) { }
    constexpr __host__ __device__ overall_dimensions_t(dim3&& dims) noexcept
        : x(dims.x), y(dims.y), z(dims.z) { }

    // ...

    constexpr __host__ __device__ size_t volume() const noexcept { return x * y * z; }
    constexpr __host__ __device__ size_t size() const noexcept { return volume(); }
    constexpr __host__ __device__ dimensionality_t dimensionality() const noexcept
    {
        return ((x > 1) + (y > 1) + (z > 1));
    }

    // Array-like access to the dimensions along the different axes
    constexpr dimension_type operator[](int i) const noexcept
    {
        return (i == 0) ? x : (i == 1) ? y : z;
    }

    CPP14_CONSTEXPR dimension_type& operator[](int i) noexcept
    {
        return (i == 0) ? x : (i == 1) ? y : z;
    }
};

constexpr bool operator==(overall_dimensions_t lhs, overall_dimensions_t rhs) noexcept
{
    return (lhs.x == rhs.x) and (lhs.y == rhs.y) and (lhs.z == rhs.z);
}

constexpr bool operator!=(overall_dimensions_t lhs, overall_dimensions_t rhs) noexcept
{
    return not (lhs == rhs);
}
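// Usage sketch (editorial addition): overall dimensions count individual threads over
// the entire grid, using a wider type than the dim3-like wrapper above.
inline bool overall_dimensions_usage_example()
{
    overall_dimensions_t overall { size_t{1} << 20, size_t{1} << 10, 1 };
    return overall.volume() == (size_t{1} << 30)
       and overall.dimensionality() == 2          // only two axes have extent > 1
       and overall[0] == (size_t{1} << 20);
}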
// Composite dimensions for a grid: the dimensions of the grid in blocks, together
// with the dimensions of each of its blocks, in threads
struct composite_dimensions_t {
    dimensions_t       grid;
    block_dimensions_t block;

    // ...

    constexpr size_t volume() const noexcept { return flatten().volume(); }
    constexpr size_t dimensionality() const noexcept { return flatten().dimensionality(); }

    // ...

#if __cplusplus >= 202002L
    // ... (defaulted comparison operators)
#endif
};

#if __cplusplus < 202002L
constexpr bool operator==(composite_dimensions_t lhs, composite_dimensions_t rhs) noexcept
{
    return (lhs.grid == rhs.grid) and (lhs.block == rhs.block);
}

constexpr bool operator!=(composite_dimensions_t lhs, composite_dimensions_t rhs) noexcept
{
    return not (lhs == rhs);
}
#endif // __cplusplus < 202002L
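// Usage sketch (editorial addition): a grid's full geometry pairs its dimensions in
// blocks with each block's dimensions in threads; equality is member-wise.
inline bool composite_dimensions_usage_example()
{
    composite_dimensions_t lhs { dimensions_t{ 100, 1, 1 }, dimensions_t{ 256, 1, 1 } };
    composite_dimensions_t rhs { dimensions_t{ 100, 1, 1 }, dimensions_t{ 256, 1, 1 } };
    return (lhs == rhs) and not (lhs != rhs);
}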
// ...

// (the following definitions are within namespace cuda::memory)

#if CUDA_VERSION >= 10020

// Access permissions of a device with respect to a memory allocation
struct permissions_t {
    bool read;
    bool write;

    operator CUmemAccess_flags() const noexcept
    {
        return read ?
            (write ? CU_MEM_ACCESS_FLAGS_PROT_READWRITE : CU_MEM_ACCESS_FLAGS_PROT_READ) :
            CU_MEM_ACCESS_FLAGS_PROT_NONE;
    }
};

namespace permissions {

constexpr inline permissions_t none()           { return permissions_t{ false, false }; }
constexpr inline permissions_t read_only()      { return permissions_t{ true,  false }; }
constexpr inline permissions_t write_only()     { return permissions_t{ false, true  }; }
constexpr inline permissions_t read_and_write() { return permissions_t{ true,  true  }; }

inline permissions_t from_flags(CUmemAccess_flags access_flags)
{
    bool read  = (access_flags & CU_MEM_ACCESS_FLAGS_PROT_READ);
    bool write = (access_flags & CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
    return permissions_t{ read, write };
}

} // namespace permissions
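// Usage sketch (editorial addition): building access permissions with the named
// helpers and converting them to the raw driver flags.
inline bool permissions_usage_example()
{
    permissions_t p = permissions::read_and_write();
    CUmemAccess_flags raw = p;                    // via the conversion operator above
    return raw == CU_MEM_ACCESS_FLAGS_PROT_READWRITE
       and not permissions::none().read;
}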
namespace physical_allocation {

// The kinds of handles usable for sharing physical memory allocations between processes
enum class shared_handle_kind_t : ::std::underlying_type<CUmemAllocationHandleType>::type {
#if CUDA_VERSION >= 11020
    no_export             = CU_MEM_HANDLE_TYPE_NONE,
#endif
    posix_file_descriptor = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR,
    win32_handle          = CU_MEM_HANDLE_TYPE_WIN32,
    win32_kmt             = CU_MEM_HANDLE_TYPE_WIN32_KMT,
};

namespace detail_ {

template<shared_handle_kind_t SharedHandleKind> struct shared_handle_type_helper;

template <> struct shared_handle_type_helper<shared_handle_kind_t::posix_file_descriptor> { using type = int; };
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
template <> struct shared_handle_type_helper<shared_handle_kind_t::win32_handle> { using type = void *; };
#endif

} // namespace detail_

// The concrete OS handle type used when sharing an allocation via a given mechanism
template<shared_handle_kind_t SharedHandleKind>
using shared_handle_t = typename detail_::shared_handle_type_helper<SharedHandleKind>::type;

} // namespace physical_allocation
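// Usage sketch (editorial addition): shared_handle_t maps each sharing mechanism to
// the concrete OS handle type used for inter-process sharing of physical allocations.
static_assert(::std::is_same<
        physical_allocation::shared_handle_t<physical_allocation::shared_handle_kind_t::posix_file_descriptor>,
        int
    >::value,
    "a POSIX-file-descriptor shared handle is a plain int");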
#endif // CUDA_VERSION >= 10020

#if CUDA_VERSION >= 11020

// ... (memory-pool-related definitions:)

using shared_handle_kind_t = physical_allocation::shared_handle_kind_t;
using physical_allocation::shared_handle_t;

// A handle obtained by exporting a pointer into pool-allocated memory, for importing
// and use by another process
using ptr_handle_t = CUmemPoolPtrExportData;
#endif // CUDA_VERSION >= 11020

// ...

namespace device {

// The numeric type which can represent the full range of memory addresses on a CUDA device
using address_t = CUdeviceptr;

static_assert(sizeof(void *) == sizeof(device::address_t), "Unexpected address size");

// Return the numeric address value corresponding to a (device-accessible) pointer
inline address_t address(const void* device_ptr) noexcept
{
    static_assert(sizeof(void*) == sizeof(address_t),
        "Incompatible sizes for a void pointer and memory::device::address_t");
    return reinterpret_cast<address_t>(device_ptr);
}

// ...

} // namespace device

// Reconstruct a plain pointer from a numeric device address value
inline void* as_pointer(device::address_t address) noexcept
{
    static_assert(sizeof(void*) == sizeof(device::address_t),
        "Incompatible sizes for a void pointer and memory::device::address_t");
    return reinterpret_cast<void*>(address);
}
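// Usage sketch (editorial addition): round-tripping between a raw pointer and the
// numeric device address type.
inline bool address_roundtrip_example(void* ptr)
{
    device::address_t numeric = device::address(ptr);
    return as_pointer(numeric) == ptr;
}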
// ...

// Each physical core ("symmetric multiprocessor") on an nVIDIA GPU has a space of
// shared memory; this type is used for shared-memory sizes, which are far smaller
// than global-memory sizes (within namespace shared:)
using size_t = unsigned;

// ...

// The categories of CUDA devices to which a managed memory region is initially made
// visible (within namespace managed:)
enum class initial_visibility_t {
    to_all_devices,
    to_supporters_of_concurrent_managed_access,
};

// ...

#if CUDA_VERSION >= 11070
// Scope of a stream memory barrier
enum class barrier_scope_t : typename ::std::underlying_type<CUstreamMemoryBarrier_flags>::type {
    device = CU_STREAM_MEMORY_BARRIER_TYPE_GPU,
    system = CU_STREAM_MEMORY_BARRIER_TYPE_SYS
};
#endif // CUDA_VERSION >= 11070

#if CUDA_VERSION >= 10000
// Specification of a subregion within a region of memory
struct subregion_spec_t {
    // ...
};
#endif // CUDA_VERSION >= 10000

// ...

// L1-vs-shared-memory balance options for a multiprocessor's cache resources
// (multiprocessor_cache_preference_t); only the shorthand aliases of its enumerators
// were captured in this excerpt:
enum class multiprocessor_cache_preference_t {
    // ... (primary enumerators: no_preference, equal_l1_and_shared_memory,
    //      prefer_shared_memory_over_l1, prefer_l1_over_shared_memory)
    equal         = equal_l1_and_shared_memory,
    prefer_shared = prefer_shared_memory_over_l1,
    prefer_l1     = prefer_l1_over_shared_memory,
};

// A physical core (SM)'s shared memory has multiple "banks"; at most one datum per
// bank may be accessed simultaneously. This option selects the bank size.
enum class multiprocessor_shared_memory_bank_size_option_t
    : ::std::underlying_type<CUsharedconfig>::type
{
    device_default       = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE,
    four_bytes_per_bank  = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE,
    eight_bytes_per_bank = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE
};
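// Usage sketch (editorial addition): the wrapper-level enumerators carry the raw
// driver values, so converting back for a raw API call is a simple cast.
inline CUsharedconfig raw_bank_size_example()
{
    auto option = multiprocessor_shared_memory_bank_size_option_t::four_bytes_per_bank;
    return static_cast<CUsharedconfig>(option);
}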
// ...

// (attributes of pairs of devices, within namespace peer_to_peer:)
namespace peer_to_peer {

// ...

} // namespace peer_to_peer

// ... (within namespace context:)

// Scheduling and behavior flags of a CUDA context
using flags_t = unsigned;

// ... (within namespace device:)

// A device's flags are, in fact, the flags of its primary context
using flags_t = context::flags_t;

namespace primary_context {

// ... (raw handle and scheduling-policy types for a device's primary context)

} // namespace primary_context
// A cast in name only, usable in generic code: it compiles only when the source and
// destination types are the same, up to reference qualification
template <typename T, typename U>
inline T identity_cast(U&& x)
{
    static_assert(::std::is_same<
            typename ::std::remove_reference<T>::type,
            typename ::std::remove_reference<U>::type
        >::value,
        "Casting to a different type - don't use identity_cast");
    return static_cast<T>(::std::forward<U>(x));
}
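// Usage sketch (editorial addition): identity_cast compiles only when source and
// destination types agree, guarding generic code against accidental type changes.
inline unsigned identity_cast_example(unsigned x)
{
    return identity_cast<unsigned>(x);   // fine: same type
    // return identity_cast<int>(x);     // would trigger the static_assert above
}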
// ... (within namespace kernel:)

// Raw CUDA driver handle of a kernel (a __global__ function)
using handle_t = CUfunction;

#if CUDA_VERSION >= 10000

// ... (within namespace graph::node:)

// Raw CUDA driver handle of an execution-graph node
using handle_t = CUgraphNode;
using const_handle_t = CUgraphNode_st const *;

constexpr const const_handle_t no_handle = nullptr;

// ...

namespace template_ {

// Raw CUDA driver handle of an execution-graph template
using handle_t = CUgraph;
constexpr const handle_t null_handle = nullptr;

// ...

} // namespace template_

namespace instance {

// Raw CUDA driver handle of an instantiated (executable) execution graph
using handle_t = CUgraphExec;

// ...

} // namespace instance

#endif // CUDA_VERSION >= 10000

// ...

#endif // CUDA_API_WRAPPERS_COMMON_TYPES_HPP_