9 #ifndef CUDA_API_WRAPPERS_DEVICE_HPP_ 10 #define CUDA_API_WRAPPERS_DEVICE_HPP_ 20 #include <cuda_runtime_api.h> 24 #include <type_traits> 51 class primary_context_t;
54 using limit_t = context::limit_t;
73 bool holds_primary_context_refcount_unit =
false) NOEXCEPT_IF_NDEBUG;
84 device_t
wrap(
id_t id) NOEXCEPT_IF_NDEBUG;
86 using stream_priority_range_t = context::stream_priority_range_t;
90 inline ::std::string get_name(
id_t id)
92 using size_type = int;
93 static constexpr
const size_type initial_size_reservation { 100 };
94 static constexpr
const size_type larger_size { 1000 };
95 char stack_buffer[initial_size_reservation];
96 auto buffer_size =
static_cast<size_type
>((
sizeof(stack_buffer) /
sizeof(
char)));
97 auto try_getting_name = [&](
char* buffer, size_type buffer_size_) -> size_type {
98 auto status = cuDeviceGetName(buffer, buffer_size-1,
id);
99 throw_if_error_lazy(status,
"Failed obtaining the CUDA device name of device " + ::std::to_string(
id));
100 buffer[buffer_size_-1] =
'\0';
101 return static_cast<size_type
>(::std::strlen(buffer));
103 auto prospective_name_length = try_getting_name(stack_buffer, initial_size_reservation);
104 if (prospective_name_length < buffer_size - 1) {
105 return { stack_buffer,
static_cast<::std::string::size_type
>(prospective_name_length) };
107 ::std::string result;
108 result.reserve(prospective_name_length);
109 prospective_name_length = try_getting_name(&result[0], buffer_size);
111 if (prospective_name_length >= buffer_size - 1) {
112 throw ::std::runtime_error(
"CUDA device name longer than expected maximum size " + ::std::to_string(larger_size));
137 using flags_type = device::flags_t;
143 #if CUDA_VERSION >= 11040 159 size_t amount_used_for_graphs(
160 bool reserved =
false,
161 bool high_watermark =
false)
const 163 auto attribute = reserved ?
164 (high_watermark ? CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH) :
165 (high_watermark ? CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT : CU_GRAPH_MEM_ATTR_USED_MEM_HIGH);
167 auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result);
169 "Obtaining the current amount of memory used for execution graphs on " 170 + device::detail_::identify(device_id_));
177 void free_unused_graph_memory()
const 179 auto status = cuDeviceGraphMemTrim(device_id_);
181 + device::detail_::identify(device_id_));
193 size_t amount_used_for_graphs(
bool high_watermark =
false)
const 196 auto status = cuDeviceGetGraphMemAttribute(
199 CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT :
200 CU_GRAPH_MEM_ATTR_USED_MEM_HIGH,
203 "Obtaining the current amount of memory used for execution graphs on " 204 + device::detail_::identify(device_id_));
209 #endif // CUDA_VERSION >= 11040 218 return primary_context().memory();
233 CAW_SET_SCOPE_CONTEXT(primary_context_handle());
235 auto status = cuDeviceCanAccessPeer(&result,
id(), peer.
id());
237 + device::detail_::identify(id_) +
" can access " 238 + device::detail_::identify(peer.id_));
239 return (result == 1);
263 #if CUDA_VERSION >= 9020 266 auto status = cuDeviceGetUuid(&result, id_);
267 throw_if_error_lazy(status,
"Failed obtaining UUID for " + device::detail_::identify(id_));
270 #endif // CUDA_VERSION >= 9020 273 void cache_and_ensure_primary_context_activation()
const {
274 if (primary_context_handle_ == context::detail_::none) {
275 primary_context_handle_ = device::primary_context::detail_::obtain_and_increase_refcount(id_);
276 holds_pc_refcount_unit_ =
true;
282 cache_and_ensure_primary_context_activation();
283 return primary_context_handle_;
286 void set_flags(flags_type new_flags)
const 288 new_flags &= ~CU_CTX_MAP_HOST;
294 auto status = cuDevicePrimaryCtxSetFlags(
id(), new_flags);
295 throw_if_error_lazy(status,
"Failed setting (primary context) flags for device " + device::detail_::identify(id_));
298 context::flags_t flags()
const 300 return device::primary_context::detail_::flags(id_);
314 #if CUDA_VERSION >= 11020 315 memory::pool_t default_memory_pool()
const;
327 auto status = cudaGetDeviceProperties(&properties,
id());
328 throw_if_error_lazy(status,
"Failed obtaining device properties for " + device::detail_::identify(id_));
334 auto status = cudaChooseDevice(&
id, &properties);
335 throw_if_error_lazy(status,
"Failed choosing a best matching device by a a property set.");
347 return cuda::device::detail_::get_name(id_);
358 attribute_value_t attribute_value;
359 auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_);
360 throw_if_error_lazy(status,
"Failed obtaining device properties for " + device::detail_::identify(id_));
361 return attribute_value;
366 return get_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
375 auto pci_domain_id =
get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID);
376 auto pci_bus_id =
get_attribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID);
377 auto pci_device_id =
get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID);
378 return {pci_domain_id, pci_bus_id, pci_device_id, {}};
383 return get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
386 #if CUDA_VERSION >= 10020 392 bool supports_virtual_memory_management()
const 394 #if CUDA_VERSION >= 11030 395 return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED);
397 return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED);
398 #endif // CUDA_VERSION >= 11030 400 #endif // CUDA_VERSION >= 10020 408 unsigned major =
get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
417 auto major = architecture();
418 unsigned minor =
get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
419 return {major, minor};
428 return (
get_attribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) != 0);
437 return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
440 #if CUDA_VERSION >= 12000 445 bool supports_block_clustering()
const 451 #if CUDA_VERSION >= 11020 456 bool supports_memory_pools()
const 458 return get_attribute(CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED);
461 #endif // CUDA_VERSION >= 11020 469 device::limit_value_t
get_limit(device::limit_t limit)
const 471 return primary_context().
get_limit(limit);
478 void set_limit(device::limit_t limit, device::limit_value_t new_value)
const 480 primary_context().
set_limit(limit, new_value);
504 const device_t& make_current()
const 506 device::current::set(*
this);
512 device::current::set(*
this);
532 auto pc_handle = (primary_context_handle_ == context::detail_::none) ?
533 device::primary_context::detail_::obtain_and_increase_refcount(id_) :
534 primary_context_handle_;
535 CAW_SET_SCOPE_CONTEXT(pc_handle);
536 auto status = cudaDeviceReset();
560 #if CUDA_VERSION < 12030 582 #endif // CUDA_VERSION < 12030 607 stream_t default_stream(
bool hold_primary_context_refcount_unit =
false)
const;
611 bool will_synchronize_with_default_stream,
617 bool records_timing = event::do_record_timings,
623 bool keep_larger_local_mem_after_resize =
false)
const;
625 #if CUDA_VERSION >= 11020 628 template <memory::pool::shared_handle_kind_t Kind = memory::pool::shared_handle_kind_t::no_export>
629 memory::pool_t create_memory_pool()
const;
648 template<
typename Kernel,
typename ... KernelParameters>
652 KernelParameters... arguments)
const;
674 auto other_flags = flags() & ~CU_CTX_SCHED_MASK;
675 set_flags(other_flags | static_cast<flags_type>(new_policy));
683 return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
691 auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX;
692 flags_type new_flags = other_flags | (keep ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
693 set_flags(new_flags);
701 keep_larger_local_mem_after_resize(
false);
705 void maybe_decrease_primary_context_refcount()
const 707 if (holds_pc_refcount_unit_) {
708 device::primary_context::detail_::decrease_refcount(id_);
716 ::std::swap(lhs.id_, rhs.id_);
717 ::std::swap(lhs.primary_context_handle_, rhs.primary_context_handle_);
718 ::std::swap(lhs.holds_pc_refcount_unit_, rhs.holds_pc_refcount_unit_);
724 maybe_decrease_primary_context_refcount();
726 if (holds_pc_refcount_unit_) {
727 device::primary_context::detail_::decrease_refcount_nothrow(id_);
748 maybe_decrease_primary_context_refcount();
750 primary_context_handle_ = other.primary_context_handle_;
751 holds_pc_refcount_unit_ =
false;
770 bool hold_primary_context_refcount_unit =
false) NOEXCEPT_IF_NDEBUG
773 primary_context_handle_(primary_context_handle),
774 holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
778 throw ::std::invalid_argument(
"Attempt to construct a CUDA device object for a negative device ID of " + ::std::to_string(id_));
787 bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG;
794 mutable bool holds_pc_refcount_unit_ {
false };
802 return lhs.
id() == rhs.
id();
807 return lhs.
id() != rhs.
id();
818 bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG
820 return device_t{ id, primary_context_handle, hold_primary_context_refcount_unit };
841 throw ::std::invalid_argument(
"Attempt to obtain a CUDA device with a negative device ID " + ::std::to_string(
id));
873 auto id = detail_::get_id();
874 auto pc_handle = primary_context::detail_::obtain_and_increase_refcount(
id);
880 auto pc = device.primary_context();
881 context::current::detail_::set(pc.handle());
894 auto resolved_id = device::detail_::resolve_id(pci_id);
895 return get(resolved_id);
909 inline device_t get(const ::std::string& pci_id_str)
911 auto parsed_pci_id = pci_location_t::parse(pci_id_str);
912 return get(parsed_pci_id);
919 #endif // CUDA_API_WRAPPERS_DEVICE_HPP_ int attribute_value_t
All CUDA device attributes (cuda::device::attribute_t) have a value of this type. ...
Definition: types.hpp:860
void ensure_driver_is_initialized()
A mechanism for ensuring a cuInit() call has been made, to use before making any other driver API cal...
Definition: miscellany.hpp:40
Proxy class for a CUDA stream.
Definition: stream.hpp:246
cuda::context::handle_t handle_t
Raw CUDA driver handle for a device's primary context.
Definition: types.hpp:946
void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: device.hpp:567
void dont_keep_larger_local_mem_after_resize()
Instructs the (primary context of) the device to discard allocations of larger amounts of global devi...
Definition: device.hpp:699
CUsharedconfig shared_memory_bank_size_t
Choice of the number of bytes in each bank of the shared memory.
Definition: context.hpp:44
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:246
If the CUDA runtime has not been set to a specific device, this is the ID of the device it defaults t...
Definition: constants.hpp:53
properties_t properties() const
Obtains the (mostly) non-numeric properties for this device.
Definition: device.hpp:324
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
void keep_larger_local_mem_after_resize(bool keep=true)
Instructs the (primary context of) the device to keep larger amounts of global device memory allocate...
Definition: device.hpp:689
Wrapper class for a CUDA event.
Definition: event.hpp:133
A class for holding the primary context of a CUDA device.
Definition: primary_context.hpp:112
attribute_value_t get_attribute(attribute_t attribute, const device_t &first, const device_t &second)
Get one of the numeric attributes for a(n ordered) pair of devices, relating to their interaction...
Definition: device.hpp:113
A class to create a faux member in a context_t, in lieu of an in-class namespace (which C++ does not ...
Definition: context.hpp:262
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312
A range of priorities supported by a CUDA context; ranges from the higher numeric value to the lower...
Definition: context.hpp:50
context::limit_value_t get_limit(context::limit_t limit_id) const
Get one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:530
void enable_access_to(const device_t &peer) const
Enable access by this device to the global memory of another device.
Definition: device.hpp:247
bool keeping_larger_local_mem_after_resize() const
Definition: device.hpp:681
CUuuid uuid_t
The CUDA-driver-specific representation of a UUID value; see also {device_t::uuid()}.
Definition: types.hpp:971
context::stream_priority_range_t stream_priority_range() const
Get the range of priority values one can set for streams in this context.
Definition: context.hpp:518
::std::string name() const
Obtains this device's human-readable name, e.g.
Definition: device.hpp:342
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
CUlimit limit_t
Features of contexts which can be configured individually during a context's lifetime.
Definition: context.hpp:37
device::limit_value_t get_limit(device::limit_t limit) const
Obtains the upper limit on the amount of a certain kind of resource this device offers.
Definition: device.hpp:469
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394
void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const
Set one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:653
device_t default_()
Obtains (a proxy for) the default CUDA device, being the device with the default CUDA device id...
Definition: device.hpp:851
void reset() const
Invalidates all memory allocations and resets all state regarding this CUDA device on the current ope...
Definition: device.hpp:522
void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: context.hpp:632
CUdevice_attribute attribute_t
CUDA devices have both "attributes" and "properties".
Definition: types.hpp:856
bool can_access(const device_t &peer) const
Determine whether this device can access the global memory of another CUDA device.
Definition: device.hpp:231
host_thread_sync_scheduling_policy_t
Scheduling policies the CUDA driver may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:884
device_t cpu()
A named constructor idiom for a "dummy" CUDA device representing the CPU.
Definition: device.hpp:863
void disable_access_to(const device_t &peer) const
Disable access by this device to the global memory of another device.
Definition: device.hpp:257
size_t limit_value_t
Type for the actual values for context (see limit_t for the possible kinds of limits whose value can ...
Definition: context.hpp:41
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:968
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing on this dev...
Definition: device.hpp:555
device::primary_context_t primary_context(bool hold_pc_refcount_unit=false) const
Produce a proxy for the device's primary context - the one used by runtime API calls.
Definition: device.hpp:152
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
context_t::global_memory_type memory() const
Definition: device.hpp:217
device::pci_location_t pci_id() const
Obtains this device's location on the PCI express bus in terms of domain, bus and device id...
Definition: device.hpp:373
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension)
Definition: array.hpp:34
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing within this...
Definition: context.hpp:419
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435
Wrappers for getting and setting CUDA's choice of which device is 'current'.
device::compute_architecture_t architecture() const
Obtains the device's hardware architecture generation numeric designator; see cuda::device::compute_ar...
Definition: device.hpp:406
bool supports_concurrent_managed_access() const
Determine whether this device can coherently access managed memory concurrently with the CPU...
Definition: device.hpp:426
Location "coordinates" for a CUDA device on a PCIe bus.
Definition: pci_id.hpp:24
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
device::compute_capability_t compute_capability() const
Obtains the device's compute capability; see cuda::device::compute_capability_t.
Definition: device.hpp:415
const device_t & synchronize() const
Waits for all previously-scheduled tasks on all streams (= queues) on this device to conclude...
Definition: device.hpp:492
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
A numeric designator of an architectural generation of CUDA devices.
Definition: device_properties.hpp:45
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing on this device...
Definition: device.hpp:546
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:249
A structure holding a collection various properties of a device.
Definition: device_properties.hpp:149
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG
Returns a wrapper for the CUDA device with a given id.
Definition: device.hpp:825
Can only be used by the process which created it.
Definition: constants.hpp:95
void set_limit(device::limit_t limit, device::limit_value_t new_value) const
Set the upper limit of one of the named numeric resources on this device.
Definition: device.hpp:478
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing within this conte...
Definition: context.hpp:645
int multiprocessor_count_t
Type of the number of multiprocessors within a single GPU.
Definition: device_properties.hpp:37
context::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: context.hpp:500
device::stream_priority_range_t stream_priority_range() const
Determines the range of possible priorities for streams on this device.
Definition: device.hpp:661
Wrapper class for a CUDA device.
Definition: device.hpp:135
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device.
Definition: device.hpp:356
Fundamental CUDA-related type definitions.
Freestanding wrapper functions for working with CUDA's various kinds of memory spaces, arranged into a relevant namespace hierarchy.
device::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: device.hpp:578
Classes representing specific and overall properties of CUDA devices.
Definition of a wrapper class for CUDA PCI device ID information.