19 #ifndef CUDA_API_WRAPPERS_COMMON_TYPES_HPP_ 20 #define CUDA_API_WRAPPERS_COMMON_TYPES_HPP_ 27 #include "detail/preamble.hpp" 28 #include "detail/optional.hpp" 29 #include "detail/optional_ref.hpp" 30 #include "detail/span.hpp" 31 #include "detail/region.hpp" 32 #include "detail/type_traits.hpp" 35 #include <builtin_types.h> 39 #include <type_traits> 63 template <
typename T,
size_t N>
101 template<dimensionality_t NumDimensions>
112 : width(width_), height(height_), depth(depth_) { }
113 constexpr __host__ __device__
dimensions_t(cudaExtent e)
114 : dimensions_t(e.width, e.height, e.depth) { }
115 constexpr __host__ __device__ dimensions_t(
const dimensions_t& other)
116 : dimensions_t(other.width, other.height, other.depth) { }
117 constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
118 : dimensions_t(other.width, other.height, other.depth) { }
119 constexpr __host__ __device__ dimensions_t(
dimension_t linear_size)
120 : dimensions_t(linear_size, 1, 1) { }
122 CPP14_CONSTEXPR dimensions_t& operator=(
const dimensions_t& other) =
default;
123 CPP14_CONSTEXPR dimensions_t& operator=(dimensions_t&& other) =
default;
125 constexpr __host__ __device__
operator cudaExtent()
const 127 return { width, height, depth };
134 constexpr __host__ __device__
size_t volume()
const {
return width * height * depth; }
137 constexpr __host__ __device__
size_t size()
const {
return volume(); }
143 return (width > 1) + (height> 1) + (depth > 1);
147 static constexpr __host__ __device__ dimensions_t cube(
dimension_t x) {
return dimensions_t{ x, x, x }; }
151 static constexpr __host__ __device__ dimensions_t
zero() {
return cube(0); }
162 : width(width_), height(height_) { }
164 : dimensions_t(other.width, other.height) { }
165 constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
166 : dimensions_t(other.width, other.height) { }
167 constexpr __host__ __device__ dimensions_t(
dimension_t linear_size)
168 : dimensions_t(linear_size, 1) { }
170 CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(
const dimensions_t& other)
172 width = other.width; height = other.height;
176 CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(dimensions_t&& other)
178 width = other.width; height = other.height;
183 constexpr __host__ __device__
size_t area()
const {
return width * height; }
186 constexpr __host__ __device__
size_t size()
const {
return area(); }
192 return (width > 1) + (height> 1);
198 static constexpr __host__ __device__ dimensions_t square(
dimension_t x) {
return dimensions_t{ x, x }; }
202 static constexpr __host__ __device__ dimensions_t
zero() {
return square(0); }
251 #if CUDA_VERSION >= 10000 257 #if CUDA_VERSION >= 10000 261 enum class mode_t : ::std::underlying_type<CUstreamCaptureMode>::type {
262 global = CU_STREAM_CAPTURE_MODE_GLOBAL,
263 thread = CU_STREAM_CAPTURE_MODE_THREAD_LOCAL,
264 thread_local_ = thread,
265 relaxed = CU_STREAM_CAPTURE_MODE_RELAXED
268 enum class state_t : ::std::underlying_type<CUstreamCaptureStatus>::type {
269 active = CU_STREAM_CAPTURE_STATUS_ACTIVE,
271 invalidated = CU_STREAM_CAPTURE_STATUS_INVALIDATED,
272 none = CU_STREAM_CAPTURE_STATUS_NONE,
278 inline bool is_capturing(capture::state_t status) noexcept
280 return status == capture::state_t::active;
283 #endif // CUDA_VERSION >= 10000 326 : x(x_), y(y_), z(z_) { }
332 constexpr __host__ __device__
operator uint3(
void)
const {
return { x, y, z }; }
336 __host__ __device__
operator dim3(
void)
const noexcept {
return { x, y, z }; }
339 constexpr __host__ __device__
size_t volume() const noexcept {
return static_cast<size_t>(x) * y * z; }
345 return (z > 1) + (y > 1) + (x > 1);
367 (rhs.x % lhs.x == 0) and
368 (rhs.y % lhs.y == 0) and
369 (rhs.z % lhs.z == 0);
377 return (i == 0) ? x :
381 CPP14_CONSTEXPR
dimension_t& operator[](
int i) noexcept {
382 return (i == 0) ? x :
390 constexpr
inline bool operator==(
const dim3& lhs,
const dim3& rhs) noexcept
392 return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
394 constexpr
inline bool operator!=(
const dim3& lhs,
const dim3& rhs) noexcept
396 return not (lhs == rhs);
400 return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
404 return not (lhs == rhs);
432 dimension_type x, y, z;
435 dimension_type width_, dimension_type height_, dimension_type depth_) noexcept
436 : x(width_), y(height_), z(depth_) { }
439 : x(dims.x), y(dims.y), z(dims.z) { }
442 : x(dims.x), y(dims.y), z(dims.z) { }
456 constexpr __host__ __device__
size_t volume()
const noexcept {
return x * y * z; }
457 constexpr __host__ __device__
size_t size()
const noexcept {
return volume(); }
458 constexpr __host__ __device__
dimensionality_t dimensionality()
const noexcept
460 return ((x > 1) + (y > 1) + (z > 1));
468 return (i == 0) ? x :
472 CPP14_CONSTEXPR dimension_type& operator[](
int i) noexcept {
473 return (i == 0) ? x :
483 return (lhs.x == rhs.x) and (lhs.y == rhs.y) and (lhs.z == rhs.z);
488 return not (lhs == rhs);
513 constexpr
size_t volume() const noexcept {
return flatten().volume(); }
516 constexpr
size_t dimensionality() const noexcept {
return flatten().dimensionality(); }
526 #if __cplusplus >= 202002L 533 #if __cplusplus < 202002L 537 return (lhs.grid == rhs.grid) and (lhs.block == rhs.block);
542 return not (lhs == rhs);
545 #endif // __cplusplus < 202002L 557 #if CUDA_VERSION >= 10020 566 struct permissions_t {
572 operator CUmemAccess_flags()
const noexcept
575 (write ? CU_MEM_ACCESS_FLAGS_PROT_READWRITE : CU_MEM_ACCESS_FLAGS_PROT_READ) :
576 CU_MEM_ACCESS_FLAGS_PROT_NONE;
581 namespace permissions {
583 constexpr
inline permissions_t none() {
return permissions_t{
false,
false }; }
584 constexpr
inline permissions_t read_only() {
return permissions_t{
true,
false }; }
585 constexpr
inline permissions_t write_only() {
return permissions_t{
false,
true }; }
586 constexpr
inline permissions_t read_and_write() {
return permissions_t{
true,
true }; }
591 inline permissions_t from_flags(CUmemAccess_flags access_flags)
593 bool read = (access_flags & CU_MEM_ACCESS_FLAGS_PROT_READ);
594 bool write = (access_flags & CU_MEM_ACCESS_FLAGS_PROT_READWRITE);
595 return permissions_t{read, write};
603 namespace physical_allocation {
607 enum class shared_handle_kind_t : ::std::underlying_type<CUmemAllocationHandleType>::type {
608 #if CUDA_VERSION >= 11020 609 no_export = CU_MEM_HANDLE_TYPE_NONE,
611 posix_file_descriptor = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR,
612 win32_handle = CU_MEM_HANDLE_TYPE_WIN32,
613 win32_kmt = CU_MEM_HANDLE_TYPE_WIN32_KMT,
618 template<shared_handle_kind_t SharedHandleKind>
struct shared_handle_type_helper;
620 template <>
struct shared_handle_type_helper<shared_handle_kind_t::posix_file_descriptor> {
using type = int; };
621 #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 622 template <>
struct shared_handle_type_helper<shared_handle_kind_t::win32_handle> {
using type =
void *; };
629 template<shared_handle_kind_t SharedHandleKind>
630 using shared_handle_t =
typename detail_::shared_handle_type_helper<SharedHandleKind>::type;
633 #endif // CUDA_VERSION >= 10020 634 #if CUDA_VERSION >= 11020 644 using shared_handle_kind_t = physical_allocation::shared_handle_kind_t;
648 using physical_allocation::shared_handle_t;
654 using ptr_handle_t = CUmemPoolPtrExportData;
659 #endif // CUDA_VERSION >= 11020 676 static_assert(
sizeof(
void *) ==
sizeof(
device::address_t),
"Unexpected address size");
686 static_assert(
sizeof(
void*) ==
sizeof(
address_t),
"Incompatible sizes for a void pointer and memory::device::address_t");
687 return reinterpret_cast<address_t>(device_ptr);
704 static_assert(
sizeof(
void*) ==
sizeof(
device::address_t),
"Incompatible sizes for a void pointer and memory::device::address_t");
705 return reinterpret_cast<void*
>(
address);
732 using size_t = unsigned;
757 to_supporters_of_concurrent_managed_access,
762 #if CUDA_VERSION >= 11070 763 enum class barrier_scope_t : ::std::underlying_type<CUstreamMemoryBarrier_flags>::type {
767 device = CU_STREAM_MEMORY_BARRIER_TYPE_GPU,
768 system = CU_STREAM_MEMORY_BARRIER_TYPE_SYS
770 #endif // CUDA_VERSION >= 11070 772 #if CUDA_VERSION >= 10000 782 struct subregion_spec_t {
789 #endif // CUDA_VERSION >= 10000 817 equal = equal_l1_and_shared_memory,
818 prefer_shared = prefer_shared_memory_over_l1,
819 prefer_l1 = prefer_l1_over_shared_memory,
833 : ::std::underlying_type<CUsharedconfig>::type
835 device_default = CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE,
836 four_bytes_per_bank = CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE,
837 eight_bytes_per_bank = CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE
864 namespace peer_to_peer {
882 using flags_t = unsigned;
943 using flags_t = context::flags_t;
945 namespace primary_context {
/// A static_cast that is only permitted when it changes nothing but the
/// value category: T and U must be the same type up to references.
/// Using it with genuinely different types is a compile-time error.
template <typename T, typename U>
inline T identity_cast(U&& x)
{
	static_assert(::std::is_same<
			typename ::std::remove_reference<T>::type,
			typename ::std::remove_reference<U>::type
		>::value, // `>::value,` restored; dropped in the garbled original
		"Casting to a different type - don't use identity_cast");
	return static_cast<T>(::std::forward<U>(x));
}
993 using handle_t = CUfunction;
997 #if CUDA_VERSION >= 10000 1006 #if CUDA_VERSION >= 13010 1007 using id_t = unsigned;
1014 using handle_t = CUgraphNode;
1016 using const_handle_t = CUgraphNode_st
const *;
1018 constexpr
const const_handle_t no_handle =
nullptr;
1027 namespace template_ {
1030 using handle_t = CUgraph;
1031 constexpr
const handle_t null_handle =
nullptr;
1040 namespace instance {
1043 using handle_t = CUgraphExec;
1049 #endif // CUDA_VERSION >= 10000 1053 #endif // CUDA_API_WRAPPERS_COMMON_TYPES_HPP_ int attribute_value_t
All CUDA device attributes (cuda::device::attribute_t) have a value of this type. ...
Definition: types.hpp:862
constexpr size_t dimensionality() const noexcept
Definition: types.hpp:516
Keep control and spin-check for result availability.
Definition: types.hpp:914
int attribute_value_t
The uniform type the CUDA driver uses for all kernel attributes; it is typically more appropriate to ...
Definition: types.hpp:990
Alias for the default behavior; see heuristic .
Definition: types.hpp:903
cuda::context::handle_t handle_t
Raw CUDA driver handle for a device's primary context.
Definition: types.hpp:948
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:296
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:243
Divide the cache resources to maximize available L1 cache at the expense of shared memory...
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
typename cuda::rtc::detail_::types< Kind >::handle_type handle_t
Raw program handle used by the NVIDIA run-time compilation libraries's API calls: // The NVRTC librar...
Definition: types.hpp:124
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
dimension_t width
The three constituent individual dimensions, named.
Definition: types.hpp:109
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:309
Default behavior; yield or spin based on a heuristic.
Definition: types.hpp:898
Yield control while waiting for results.
Definition: types.hpp:933
CUmemLocation location_t
Used in a limited number of API functions which can relate both to CUDA device memory and system memo...
Definition: types.hpp:555
constexpr dimension_type operator[](int i) const noexcept
Provides array-like access to the dimensions in different axes.
Definition: types.hpp:467
CUuuid uuid_t
The CUDA-driver-specific representation of a UUID value; see also {device_t::uuid()}.
Definition: types.hpp:973
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
static constexpr bool divides(dimensions_t lhs, dimensions_t rhs)
Definition: types.hpp:364
constexpr dimension_t operator[](int i) const noexcept
Provides array-like access to the dimensions in different axes.
Definition: types.hpp:376
context::host_thread_sync_scheduling_policy_t host_thread_sync_scheduling_policy_t
Definition: types.hpp:953
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:322
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:214
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:832
CUdevice_attribute attribute_t
CUDA devices have both "attributes" and "properties".
Definition: types.hpp:858
void point(const CharT *description, color_t color=color_t::Black())
Mark a single point on the profiler timeline, giving it also a color and some descriptive text...
Definition: profiling.hpp:185
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:732
dimension_t width
The two constituent individual dimensions, named; no "depth" for the 2D case.
Definition: types.hpp:159
host_thread_sync_scheduling_policy_t
Scheduling policies the CUDA driver may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:886
Block the thread until results are available.
Definition: types.hpp:922
size_t dimensionality_t
The index or number of dimensions of an entity (as opposed to the extent in any dimension) - typicall...
Definition: types.hpp:82
CUpointer_attribute attribute_t
Raw CUDA driver choice type for attributes of pointers.
Definition: types.hpp:664
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:78
address_t address(memory::const_region_t region) noexcept
Definition: types.hpp:695
CUstreamCallback callback_t
The CUDA driver's raw handle for a host-side callback function.
Definition: types.hpp:254
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:806
Composite dimensions for a grid - in terms of blocks, then also down into the block dimensions comple...
Definition: types.hpp:505
CUdevice_P2PAttribute attribute_t
While individual CUDA devices have individual "attributes" (attribute_t), there are also attributes c...
Definition: types.hpp:871
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:768
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension)
Definition: array.hpp:34
size_t overall_dimension_t
Dimension of a grid in threads along one axis, i.e.
Definition: types.hpp:423
Dimensions of a grid in threads, i.e.
Definition: types.hpp:429
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
size_t dimension_t
An individual dimension extent for an array.
Definition: types.hpp:91
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
void * as_pointer(device::address_t address) noexcept
Definition: types.hpp:702
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:246
constexpr overall_dimensions_t flatten() const noexcept
Definition: types.hpp:510
CUdeviceptr address_t
The numeric type which can represent the range of memory addresses on a CUDA device.
Definition: types.hpp:674
constexpr size_t volume() const noexcept
Definition: types.hpp:513
see heuristic
Definition: types.hpp:936
Divide the cache resources equally between actual L1 cache and shared memory.
static constexpr composite_dimensions_t point() noexcept
A named constructor idiom for the composite dimensions of a single-block grid with a single-thread bl...
Definition: types.hpp:520
Divide the cache resources to maximize available shared memory at the expense of L1 cache...
No preference for more L1 cache or for more shared memory; the API can do as it please.
initial_visibility_t
The choices of which categories CUDA devices must a managed memory region be visible to...
Definition: types.hpp:755
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
CUDA's array memory-objects are multi-dimensional; but their dimensions, or extents, are not the same as cuda::grid::dimensions_t ; they may be much larger in each axis.
Definition: types.hpp:102
CUfunction_attribute attribute_t
Raw CUDA driver selector of a kernel attribute.
Definition: types.hpp:985
void zero(void *start, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:418