eyalroz/cuda-api-wrappers/device_8hpp_source.html

 #pragma once
 #ifndef CUDA_API_WRAPPERS_DEVICE_HPP_
 #define CUDA_API_WRAPPERS_DEVICE_HPP_

 #include "types.hpp"
 #include "current_device.hpp"
 #include "device_properties.hpp"
 #include "memory.hpp"
 #include "pci_id.hpp"
 #include "primary_context.hpp"
 #include "error.hpp"

 #include <cuda_runtime_api.h>

 #include <string>
 #include <cstring>
 #include <type_traits>

 namespace cuda {

 // Forward declarations of classes which are referred to (by name only) in the
 // declarations below.
 class event_t;
 class stream_t;
 class device_t;
 namespace memory {
 class pool_t;
 } // namespace memory

 // Declared ahead of class device_t, whose synchronize() methods call it.
 void synchronize(const device_t& device);

 namespace device {

 class primary_context_t;

 // Device-level aliases for the corresponding context-level types: the device's
 // limit and shared-memory-bank-size methods all forward to its primary context.
 using limit_t = context::limit_t;
 using limit_value_t = context::limit_value_t;
 using shared_memory_bank_size_t = context::shared_memory_bank_size_t;

 namespace detail_ {

 // Constructs a device_t proxy for the device with the given ID, optionally
 // passing along an already-known primary context handle and whether the new
 // proxy is to hold a reference-count unit for that primary context. The
 // defaults mean "no known handle, no refcount unit held".
 device_t wrap(
     id_t id,
     primary_context::handle_t primary_context_handle = context::detail_::none,
     bool holds_primary_context_refcount_unit = false) NOEXCEPT_IF_NDEBUG;

 } // namespace detail_

 // Returns a wrapper for the CUDA device with a given id.
 device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG;

 // Device-level alias for the context-level range of supported stream priorities
 using stream_priority_range_t = context::stream_priority_range_t;

 namespace detail_ {

 /**
  * Obtains the name of the CUDA device with a given ID.
  *
  * The name is first fetched into a small on-stack buffer. Only if the result
  * appears to have filled that buffer (so that it may have been truncated) is
  * a second attempt made, with a larger buffer.
  *
  * @param id ID of the device whose name is to be obtained
  * @return the device's name
  *
  * @throws whatever throw_if_error_lazy() throws when cuDeviceGetName() fails
  * @throws ::std::runtime_error if the name appears not to fit even in the
  * larger buffer
  */
 inline ::std::string get_name(id_t id)
 {
     using size_type = int; // Yes, an int, that's what cuDeviceGetName takes
     static constexpr const size_type initial_size_reservation { 100 };
     static constexpr const size_type larger_size { 1000 }; // Just in case

     // Fetches the name into `buffer`, whose capacity is `buffer_size_` chars,
     // forces NUL-termination at the buffer's last char, and returns the length
     // of the resulting C string.
     auto try_getting_name = [&](char* buffer, size_type buffer_size_) -> size_type {
         // Note: it is the capacity of the buffer actually passed in which must
         // be used here, not that of any other buffer
         auto status = cuDeviceGetName(buffer, buffer_size_ - 1, id);
         throw_if_error_lazy(status, "Failed obtaining the CUDA device name of device " + ::std::to_string(id));
         buffer[buffer_size_ - 1] = '\0';
         return static_cast<size_type>(::std::strlen(buffer));
     };

     char stack_buffer[initial_size_reservation];
     auto prospective_name_length = try_getting_name(stack_buffer, initial_size_reservation);
     if (prospective_name_length < initial_size_reservation - 1) {
         return { stack_buffer, static_cast<::std::string::size_type>(prospective_name_length) };
     }

     // The name may have been truncated; retry with the larger buffer. The string
     // must actually be _sized_ (not merely have capacity reserved) for writing
     // through &result[0] to be valid. We can't use result.data() for writing,
     // since it is const until C++17.
     ::std::string result(static_cast<::std::string::size_type>(larger_size), '\0');
     prospective_name_length = try_getting_name(&result[0], larger_size);
     if (prospective_name_length >= larger_size - 1) {
         throw ::std::runtime_error("CUDA device name longer than expected maximum size " + ::std::to_string(larger_size));
     }
     // Drop the unused tail, so that the string's size is the name's actual length
     result.resize(static_cast<::std::string::size_type>(prospective_name_length));
     return result;
 }

 } // namespace detail_

 } // namespace device

 /**
  * Wrapper class for a CUDA device.
  *
  * Instances are light-weight proxies, holding only: The device's ID; a
  * lazily-obtained (cached) handle of the device's primary context; and a flag
  * indicating whether this proxy holds a reference-count unit for that primary
  * context, which it would then release on destruction.
  */
 class device_t {
 protected: // types
     using flags_type = device::flags_t;

 public: // types
     using properties_t = device::properties_t;
     using attribute_value_t = device::attribute_value_t;

 #if CUDA_VERSION >= 11040
     /**
      * An extension of the context-level global-memory faux-namespace class with
      * device-level queries regarding memory used for execution graphs.
      */
     class global_memory_type : public context_t::global_memory_type {
     public:
         /**
          * Obtains the amount of memory associated with execution graphs on
          * this device.
          *
          * @param reserved if true, query the `CU_GRAPH_MEM_ATTR_RESERVED_MEM_*`
          * attribute; otherwise, the `CU_GRAPH_MEM_ATTR_USED_MEM_*` one
          * @param high_watermark if true, query the `*_HIGH` attribute rather
          * than the `*_CURRENT` one
          *
          * @note This overload has no default arguments, so that it can never be
          * ambiguous with the single-parameter overload below.
          */
         size_t amount_used_for_graphs(
             bool reserved,
             bool high_watermark) const
         {
             auto attribute = reserved ?
                 (high_watermark ? CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH : CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT) :
                 (high_watermark ? CU_GRAPH_MEM_ATTR_USED_MEM_HIGH : CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT);
             size_t result;
             auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result);
             throw_if_error_lazy(status,
                 "Obtaining the current amount of memory used for execution graphs on "
                 + device::detail_::identify(device_id_));
             return result;
         }

         /**
          * Asks the driver to trim the device's execution graph memory
          * (using `cuDeviceGraphMemTrim()`).
          */
         void free_unused_graph_memory() const
         {
             auto status = cuDeviceGraphMemTrim(device_id_);
             throw_if_error_lazy(status, "Freeing unused execution graph memory on "
                 + device::detail_::identify(device_id_));
         }

         /**
          * Obtains the amount of memory used (as opposed to reserved) for
          * execution graphs on this device.
          *
          * @param high_watermark if true, obtain the high-watermark value rather
          * than the current one
          */
         size_t amount_used_for_graphs(bool high_watermark = false) const
         {
             constexpr const bool not_reserved { false };
             return amount_used_for_graphs(not_reserved, high_watermark);
         }

     };
 #endif // CUDA_VERSION >= 11040

     /// Obtains the global-memory faux-namespace object of the device's primary context
     context_t::global_memory_type memory() const {
         return primary_context().memory();
     }

 public:
     /**
      * Determine whether this device can access the global memory
      * of another CUDA device.
      *
      * @param peer the device whose memory is to be accessed
      * @return true iff `cuDeviceCanAccessPeer()` reports 1 for this pair
      */
     bool can_access(const device_t& peer) const
     {
         CAW_SET_SCOPE_CONTEXT(primary_context_handle());
         int result;
         auto status = cuDeviceCanAccessPeer(&result, id(), peer.id());
         throw_if_error_lazy(status, "Failed determining whether "
             + device::detail_::identify(id_) + " can access "
             + device::detail_::identify(peer.id_));
         return (result == 1);
     }

     /**
      * Enable access by this device to the global memory of another device
      * (by forwarding to the two devices' primary contexts).
      */
     void enable_access_to(const device_t& peer) const
     {
         primary_context().enable_access_to(peer.primary_context());
     }

     /**
      * Disable access by this device to the global memory of another device
      * (by forwarding to the two devices' primary contexts).
      */
     void disable_access_to(const device_t& peer) const
     {
         primary_context().disable_access_to(peer.primary_context());
     }


 #if CUDA_VERSION >= 9020
     /// Obtains the device's UUID, in the CUDA driver's representation
     uuid_t uuid () const {
         uuid_t result;
         auto status = cuDeviceGetUuid(&result, id_);
         throw_if_error_lazy(status, "Failed obtaining UUID for " + device::detail_::identify(id_));
         return result;
     }
 #endif // CUDA_VERSION >= 9020

 protected:
     /**
      * If no primary context handle has been cached yet, obtains one - increasing
      * the primary context's reference count - and records the fact that this
      * proxy now holds a refcount unit.
      */
     void cache_and_ensure_primary_context_activation() const {
         if (primary_context_handle_ == context::detail_::none) {
             primary_context_handle_ = device::primary_context::detail_::obtain_and_increase_refcount(id_);
             holds_pc_refcount_unit_ = true;
         }
     }

     /// Obtains the primary context's handle, caching it on first use
     context::handle_t primary_context_handle() const
     {
         cache_and_ensure_primary_context_activation();
         return primary_context_handle_;
     }

     /// Sets the flags of the device's primary context (except for `CU_CTX_MAP_HOST`)
     void set_flags(flags_type new_flags) const
     {
         new_flags &= ~CU_CTX_MAP_HOST;
         // CU_CTX_MAP_HOST is (mostly) ignored since CUDA 3.2, and has been officially
         // deprecated in CUDA 11. Moreover, in CUDA 11 (and possibly other versions),
         // the flags you get with cuDevicePrimaryCtxGetState() and cuCtxGetFlag()
         // differ on this particular flag - and cuDevicePrimaryCtxSetFlags() doesn't
         // like seeing it.
         auto status = cuDevicePrimaryCtxSetFlags(id(), new_flags);
         throw_if_error_lazy(status, "Failed setting (primary context) flags for device " + device::detail_::identify(id_));
     }

     /// Obtains the flags of the device's primary context
     context::flags_t flags() const
     {
         return device::primary_context::detail_::flags(id_);
     }

 public:
     /**
      * Produce a proxy for the device's primary context - the one used
      * by runtime API calls.
      *
      * @param hold_pc_refcount_unit presumably, whether the returned proxy is to
      * hold its own refcount unit for the primary context - the definition is
      * not part of this class body; TODO confirm
      */
     device::primary_context_t primary_context(bool hold_pc_refcount_unit = false) const;

 #if CUDA_VERSION >= 11020
     memory::pool_t default_memory_pool() const;
 #endif
 public:

     /**
      * Obtains the (mostly) non-numeric properties for this device
      * (using `cudaGetDeviceProperties()`).
      */
     properties_t properties() const
     {
         properties_t properties;
         auto status = cudaGetDeviceProperties(&properties, id());
         throw_if_error_lazy(status, "Failed obtaining device properties for " + device::detail_::identify(id_));
         return properties;
     }

     /**
      * Has the CUDA runtime choose (using `cudaChooseDevice()`) the device best
      * matching a set of properties.
      *
      * @param properties the properties to match
      * @return a proxy for the chosen device
      */
     static device_t choose_best_match(const properties_t& properties) {
         device::id_t id;
         auto status = cudaChooseDevice(&id, &properties);
         throw_if_error_lazy(status, "Failed choosing a best matching device by a property set.");
         return device::wrap(id);
     }

     /// Obtains this device's human-readable name
     ::std::string name() const
     {
         // If I were lazy, I would just write:
         // return properties().name;
         // and let you wait for all of that to get populated. But not me!
         return cuda::device::detail_::get_name(id_);
     }

     /**
      * Obtain a numeric-value attribute of the device.
      *
      * @param attribute identifier of the attribute whose value is sought
      * @return the attribute's value, as reported by `cuDeviceGetAttribute()`
      */
     attribute_value_t get_attribute(device::attribute_t attribute) const
     {
         attribute_value_t attribute_value;
         auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_);
         throw_if_error_lazy(status, "Failed obtaining attribute "
             + ::std::to_string(static_cast<int>(attribute)) + " of "
             + device::detail_::identify(id_));
         return attribute_value;
     }

     /// The value of the device's `CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK` attribute
     grid::block_dimension_t maximum_threads_per_block() const
     {
         return get_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
     }

     /**
      * Obtains this device's location on the PCI express bus in terms of
      * domain, bus and device id.
      */
     device::pci_location_t pci_id() const
     {
         auto pci_domain_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID);
         auto pci_bus_id    = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID);
         auto pci_device_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID);
         // The fourth field of the location is left value-initialized
         return {pci_domain_id, pci_bus_id, pci_device_id, {}};
     }

     /// The value of the device's `CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT` attribute
     device::multiprocessor_count_t multiprocessor_count() const
     {
         return get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
     }

 #if CUDA_VERSION >= 10020

     /// True if the device reports support for virtual memory management
     bool supports_virtual_memory_management() const
     {
         // The attribute's enumerator has a different name before CUDA 11.3
 #if CUDA_VERSION >= 11030
         return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED);
 #else
         return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED);
 #endif // CUDA_VERSION >= 11030
     }
 #endif // CUDA_VERSION >= 10020

     /**
      * Obtains the device's hardware architecture generation numeric designator,
      * i.e. the major number of its compute capability.
      */
     device::compute_architecture_t architecture() const
     {
         unsigned major = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
         return { major };
     }

     /// Obtains the device's compute capability (architecture and minor number)
     device::compute_capability_t compute_capability() const
     {
         auto major = architecture();
         unsigned minor = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
         return {major, minor};
     }

     /**
      * Determine whether this device can coherently access managed memory
      * concurrently with the CPU.
      */
     bool supports_concurrent_managed_access() const
     {
         return (get_attribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) != 0);
     }

     /// True if the device's `CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH` attribute is non-zero
     bool supports_block_cooperation() const
     {
         return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
     }

 #if CUDA_VERSION >= 12000

     /// True if the device's `CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH` attribute is non-zero
     bool supports_block_clustering() const
     {
         return get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
     }
 #endif

 #if CUDA_VERSION >= 11020

     /// True if the device's `CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED` attribute is non-zero
     bool supports_memory_pools() const
     {
         return get_attribute(CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED);
     }

 #endif // CUDA_VERSION >= 11020

     /**
      * Obtains the upper limit on the amount of a certain kind of resource this
      * device offers (as set for its primary context).
      */
     device::limit_value_t get_limit(device::limit_t limit) const
     {
         return primary_context().get_limit(limit);
     }

     /**
      * Set the upper limit of one of the named numeric resources on this device
      * (via its primary context).
      */
     void set_limit(device::limit_t limit, device::limit_value_t new_value) const
     {
         primary_context().set_limit(limit, new_value);
     }

     /**
      * Waits for all previously-scheduled tasks on all streams (= queues)
      * on this device to conclude.
      *
      * @return this device proxy, to allow for call chaining
      */
     const device_t& synchronize() const
     {
         cuda::synchronize(*this);
         return *this;
     }

     /// @copydoc synchronize() const
     device_t& synchronize()
     {
         cuda::synchronize(*this);
         return *this;
     }

     /**
      * Makes this device the current one (see `device::current::set()`).
      *
      * @return this device proxy, to allow for call chaining
      */
     const device_t& make_current() const
     {
         device::current::set(*this);
         return *this;
     }

     /// @copydoc make_current() const
     device_t& make_current()
     {
         device::current::set(*this);
         return *this;
     }

     /**
      * Resets the device for the current process (using `cudaDeviceReset()`),
      * invalidating all memory allocations and resetting all state regarding it.
      */
     void reset() const
     {
         // Notes:
         //
         // 1. We _cannot_ use cuDevicePrimaryCtxReset() - because that one only affects
         // the device's primary context, while cudaDeviceReset() destroys _all_ contexts for
         // the device.
         // 2. We don't need the primary context to be active here, so not using the usual
         //    primary_context_handle() getter mechanism.
         //
         // NOTE(review): when no handle has been cached, the refcount unit taken by
         // obtain_and_increase_refcount() below is neither recorded in
         // holds_pc_refcount_unit_ nor released in this method; also, a cached handle
         // is kept as-is after the reset. Confirm both are intended.

         auto pc_handle = (primary_context_handle_ == context::detail_::none) ?
             device::primary_context::detail_::obtain_and_increase_refcount(id_) :
             primary_context_handle_;
         CAW_SET_SCOPE_CONTEXT(pc_handle);
         auto status = cudaDeviceReset();
         throw_if_error_lazy(status, "Resetting " + device::detail_::identify(id_));
     }

     /**
      * Controls the balance between L1 space and shared memory space for
      * kernels executing on this device (via its primary context).
      */
     void set_cache_preference(multiprocessor_cache_preference_t preference) const
     {
         primary_context().set_cache_preference(preference);
     }

     /**
      * Determines the balance between L1 space and shared memory space set
      * for kernels executing on this device (via its primary context).
      */
     multiprocessor_cache_preference_t cache_preference() const
     {
         return primary_context().cache_preference();
     }

 #if CUDA_VERSION < 12030

     /// Sets the shared memory bank size (via the device's primary context)
     void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const
     {
         primary_context().set_shared_memory_bank_size(new_bank_size);
     }

     /// Returns the shared memory bank size (via the device's primary context)
     device::shared_memory_bank_size_t shared_memory_bank_size() const
     {
         return primary_context().shared_memory_bank_size();
     }
 #endif // CUDA_VERSION < 12030

     // For some reason, there is no cudaFuncGetCacheConfig. Weird.
     //
     // template <typename KernelFunction>
     // inline multiprocessor_cache_preference_t kernel_cache_preference(
     //  const KernelFunction* kernel, multiprocessor_cache_preference_t preference);

     /// Return the proxied device's ID.
     device::id_t id() const noexcept
     {
         return id_;
     }

     stream_t default_stream(bool hold_primary_context_refcount_unit = false) const;

     stream_t create_stream(
         bool                will_synchronize_with_default_stream,
         stream::priority_t  priority = cuda::stream::default_priority) const;

     event_t create_event(
         bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default
         bool records_timing     = event::do_record_timings,
         bool interprocess       = event::not_interprocess);

     context_t create_context(
         context::host_thread_sync_scheduling_policy_t   sync_scheduling_policy = context::heuristic,
         bool                                            keep_larger_local_mem_after_resize = false) const;

 #if CUDA_VERSION >= 11020

     template <memory::pool::shared_handle_kind_t Kind = memory::pool::shared_handle_kind_t::no_export>
     memory::pool_t create_memory_pool() const;

 #endif

     template<typename Kernel, typename ... KernelParameters>
     void launch(
         Kernel                  kernel,
         launch_configuration_t  launch_configuration,
         KernelParameters...     arguments) const;

     /**
      * Determines the range of possible priorities for streams on this device
      * (via its primary context).
      */
     device::stream_priority_range_t stream_priority_range() const
     {
         return primary_context().stream_priority_range();
     }

 public:
     /// The scheduling-policy part (`CU_CTX_SCHED_MASK`) of the primary context's flags
     context::host_thread_sync_scheduling_policy_t sync_scheduling_policy() const
     {
         return context::host_thread_sync_scheduling_policy_t(flags() & CU_CTX_SCHED_MASK);
     }

     /// Replaces the scheduling-policy part of the primary context's flags, keeping all others
     void set_sync_scheduling_policy(context::host_thread_sync_scheduling_policy_t new_policy)
     {
         auto other_flags = flags() & ~CU_CTX_SCHED_MASK;
         set_flags(other_flags | static_cast<flags_type>(new_policy));
     }

     /// True if the `CU_CTX_LMEM_RESIZE_TO_MAX` flag is set for the primary context
     bool keeping_larger_local_mem_after_resize() const
     {
         return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
     }

     /**
      * Sets or clears the `CU_CTX_LMEM_RESIZE_TO_MAX` flag of the device's
      * primary context, keeping all other flags.
      *
      * @param keep true to set the flag, false to clear it
      */
     void keep_larger_local_mem_after_resize(bool keep = true)
     {
         auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX;
         flags_type new_flags = other_flags | (keep ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
         set_flags(new_flags);
     }

     /// Same as calling `keep_larger_local_mem_after_resize(false)`
     void dont_keep_larger_local_mem_after_resize()
     {
         keep_larger_local_mem_after_resize(false);
     }

 protected:
     /// Releases this proxy's primary context refcount unit - if it holds one
     void maybe_decrease_primary_context_refcount() const
     {
         if (holds_pc_refcount_unit_) {
             device::primary_context::detail_::decrease_refcount(id_);
         }
     }

 public:     // constructors and destructor

     /// Exchanges all state (ID, cached handle, refcount-unit ownership) of two proxies
     friend void swap(device_t& lhs, device_t& rhs) noexcept
     {
         ::std::swap(lhs.id_, rhs.id_);
         ::std::swap(lhs.primary_context_handle_, rhs.primary_context_handle_);
         ::std::swap(lhs.holds_pc_refcount_unit_, rhs.holds_pc_refcount_unit_);
     }

     /// Releases the primary context refcount unit, if this proxy holds one
     ~device_t() NOEXCEPT_IF_NDEBUG
     {
 #ifndef NDEBUG
         maybe_decrease_primary_context_refcount();
 #else
         if (holds_pc_refcount_unit_)  {
             device::primary_context::detail_::decrease_refcount_nothrow(id_);
                 // Swallow any error to avoid termination on throwing from a dtor
         }
 #endif
     }

     /// Takes over `other`'s cached handle and refcount unit (if any), leaving it with neither
     device_t(device_t&& other) noexcept : id_(other.id_)
     {
         swap(*this, other);
     }

     /// Copies only the device ID; the copy starts with no cached handle and no refcount unit
     device_t(const device_t& other) noexcept : id_(other.id_) { }
         // Device proxies are not owning - as devices aren't allocated nor de-allocated.
         // Also, the proxies don't hold any state (except for one bit regarding whether
         // or not the device proxy has increased the primary context refcount); it's
         // the devices _themselves_ which have state; so there's no problem copying
         // the proxies around. This is unlike events and streams, which get created
         // and destroyed.

     /**
      * Makes this proxy refer to the same device as `other`: Any refcount unit
      * held is released, and `other`'s cached handle is copied - but without
      * taking a refcount unit for it.
      *
      * NOTE(review): this is declared noexcept, but calls the potentially-throwing
      * maybe_decrease_primary_context_refcount(); confirm this is intended.
      */
     device_t& operator=(const device_t& other) noexcept
     {
         if (this == &other) {
             // Self-assignment must not make us give up our own refcount unit
             // while still keeping the cached handle.
             return *this;
         }
         maybe_decrease_primary_context_refcount();
         id_ = other.id_;
         primary_context_handle_ = other.primary_context_handle_;
         holds_pc_refcount_unit_ = false;
         return *this;
     }

     /// Exchanges state with `other`, which thus becomes responsible for our previous refcount unit
     device_t& operator=(device_t&& other) noexcept
     {
         swap(*this, other);
         return *this;
     }

 protected: // constructors

     /**
      * @param device_id ID of the device to be proxied
      * @param primary_context_handle an already-known handle of the device's
      * primary context, if any
      * @param hold_primary_context_refcount_unit whether the constructed proxy
      * is to consider itself the holder of a primary context refcount unit
      *
      * @throws ::std::invalid_argument for a negative device ID - in debug
      * builds only. NOTE(review): this also rejects any special negative device
      * ID values; confirm against `device::cpu()`.
      */
     explicit device_t(
         device::id_t device_id,
         device::primary_context::handle_t primary_context_handle = context::detail_::none,
         bool hold_primary_context_refcount_unit = false) NOEXCEPT_IF_NDEBUG
     :
         id_(device_id),
         primary_context_handle_(primary_context_handle),
         holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
     {
 #ifndef NDEBUG
         if (id_ < 0) {
             throw ::std::invalid_argument("Attempt to construct a CUDA device object for a negative device ID of " + ::std::to_string(id_));
         }
 #endif
     }

 public: // friends
     friend device_t device::detail_::wrap(
         device::id_t,
         device::primary_context::handle_t handle,
         bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG;

 protected: // data members
     /// ID of the proxied device
     device::id_t id_;
     /// Cached handle of the device's primary context; `none` until first obtained
     mutable device::primary_context::handle_t primary_context_handle_ { context::detail_::none };
     /// Whether this proxy holds a primary context refcount unit, to be released on destruction
     mutable bool holds_pc_refcount_unit_ {false };
 };

 /// Two device proxies are equal iff they refer to the same device ID
 inline bool operator==(const device_t& lhs, const device_t& rhs)
 {
     const auto lhs_id = lhs.id();
     const auto rhs_id = rhs.id();
     return lhs_id == rhs_id;
 }

 /// Two device proxies are unequal iff they refer to different device IDs
 inline bool operator!=(const device_t& lhs, const device_t& rhs)
 {
     const auto lhs_id = lhs.id();
     const auto rhs_id = rhs.id();
     return lhs_id != rhs_id;
 }

 namespace device {

 namespace detail_ {

 /**
  * Constructs a device proxy from its raw constituents.
  *
  * @param id ID of the device to be proxied
  * @param primary_context_handle a handle of the device's primary context,
  * to be cached by the proxy
  * @param hold_primary_context_refcount_unit whether the proxy is to consider
  * itself the holder of a primary context refcount unit
  */
 inline device_t wrap(
     id_t id,
     primary_context::handle_t primary_context_handle,
     bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG
 {
     return device_t(
         id,
         primary_context_handle,
         hold_primary_context_refcount_unit);
 }

 } // namespace detail_

 /**
  * Returns a wrapper for the CUDA device with a given id - with no cached
  * primary context handle, and holding no primary context refcount unit.
  */
 inline device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG
 {
     // Spelling out what are also the default argument values of detail_::wrap()
     return detail_::wrap(id, context::detail_::none, false);
 }

 /**
  * Obtains a proxy for the device with a given ID, after ensuring the CUDA
  * driver has been initialized.
  *
  * @throws ::std::invalid_argument for a negative ID - in debug builds only
  */
 inline device_t get(id_t id)
 {
 #ifndef NDEBUG
     const bool id_is_negative = (id < 0);
     if (id_is_negative) {
         throw ::std::invalid_argument(
             "Attempt to obtain a CUDA device with a negative device ID " + ::std::to_string(id));
     }
 #endif
     // The device_t class mostly assumes the driver has been initialized
     ensure_driver_is_initialized();
     return wrap(id);
 }

 /// Obtains (a proxy for) the default CUDA device, being the device with the default CUDA device id
 inline device_t default_()
 {
     const id_t id_of_default_device = cuda::device::default_device_id;
     return get(id_of_default_device);
 }

 // A named constructor idiom for a "dummy" CUDA device representing the CPU.
 //
 // NOTE(review): get(id_t) throws for negative IDs in debug builds; if
 // CU_DEVICE_CPU is a negative value (as it appears to be in the CUDA driver
 // headers), this function cannot succeed in such builds - confirm.
 inline device_t cpu() { return get(CU_DEVICE_CPU); }

 namespace current {

 /**
  * Obtains a proxy for the current device.
  *
  * The primary context of that device is obtained, with its reference count
  * increased; the returned proxy takes responsibility for that refcount unit,
  * and will release it on destruction.
  */
 inline device_t get()
 {
     ensure_driver_is_initialized();
     auto id = detail_::get_id();
     auto pc_handle = primary_context::detail_::obtain_and_increase_refcount(id);
     // We've just taken a refcount unit, so the proxy must be told it holds one -
     // otherwise that unit would never be released.
     constexpr const bool holds_primary_context_refcount_unit { true };
     return device::detail_::wrap(id, pc_handle, holds_primary_context_refcount_unit);
 }

 /// Makes a device current, by making its primary context the current context
 inline void set(const device_t& device)
 {
     // The primary context proxy lives until the end of the full-expression,
     // i.e. until after the call to set() has returned
     context::current::detail_::set(device.primary_context().handle());
 }

 } // namespace current

 /// Obtains a proxy for the device at a given PCI location, by resolving its device ID
 inline device_t get(pci_location_t pci_id)
 {
     return get(device::detail_::resolve_id(pci_id));
 }

 /// Obtains a proxy for the device whose PCI location is given in string form
 inline device_t get(const ::std::string& pci_id_str)
 {
     return get(pci_location_t::parse(pci_id_str));
 }

 } // namespace device

 } // namespace cuda

 #endif // CUDA_API_WRAPPERS_DEVICE_HPP_
cuda::device::attribute_value_t
int attribute_value_t
All CUDA device attributes (cuda::device::attribute_t) have a value of this type. ...
Definition: types.hpp:860

cuda::ensure_driver_is_initialized
void ensure_driver_is_initialized()
A mechanism for ensuring a cuInit() call has been made, to use before making any other driver API cal...
Definition: miscellany.hpp:40

cuda::stream_t
Proxy class for a CUDA stream.
Definition: stream.hpp:246

cuda::device::primary_context::handle_t
cuda::context::handle_t handle_t
Raw CUDA driver handle for a device's primary context.
Definition: types.hpp:946

cuda::device_t::set_shared_memory_bank_size
void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: device.hpp:567

cuda::device_t::dont_keep_larger_local_mem_after_resize
void dont_keep_larger_local_mem_after_resize()
Instructs the (primary context of) the device to discard allocations of larger amounts of global devi...
Definition: device.hpp:699

cuda::context::shared_memory_bank_size_t
CUsharedconfig shared_memory_bank_size_t
Choice of the number of bytes in each bank of the shared memory.
Definition: context.hpp:44

cuda::context_t
Wrapper class for a CUDA context.
Definition: context.hpp:244

cuda
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22

cuda::stream::priority_t
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:246

cuda::device::default_device_id
If the CUDA runtime has not been set to a specific device, this is the ID of the device it defaults t...
Definition: constants.hpp:53

cuda::device_t::properties
properties_t properties() const
Obtains the (mostly) non-numeric properties for this device.
Definition: device.hpp:324

cuda::launch_configuration_t
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69

cuda::context::handle_t
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878

cuda::device_t::keep_larger_local_mem_after_resize
void keep_larger_local_mem_after_resize(bool keep=true)
Instructs the (primary context of) the device to keep larger amounts of global device memory allocate...
Definition: device.hpp:689

cuda::event_t
Wrapper class for a CUDA event.
Definition: event.hpp:133

cuda::device::primary_context_t
A class for holding the primary context of a CUDA device.
Definition: primary_context.hpp:112

cuda::device::peer_to_peer::get_attribute
attribute_value_t get_attribute(attribute_t attribute, const device_t &first, const device_t &second)
Get one of the numeric attributes for a(n ordered) pair of devices, relating to their interaction...
Definition: device.hpp:113

cuda::context_t::global_memory_type
A class to create a faux member in a context_t, in lieu of an in-class namespace (which C++ does not ...
Definition: context.hpp:262

cuda::grid::block_dimension_t
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:312

cuda::context::stream_priority_range_t
A range of priorities supported by a CUDA context; ranges from the higher numeric value to the lower...
Definition: context.hpp:50

primary_context.hpp

cuda::context_t::get_limit
context::limit_value_t get_limit(context::limit_t limit_id) const
Get one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:530

cuda::device_t::enable_access_to
void enable_access_to(const device_t &peer) const
Enable access by this device to the global memory of another device.
Definition: device.hpp:247

cuda::device_t::keeping_larger_local_mem_after_resize
bool keeping_larger_local_mem_after_resize() const
Definition: device.hpp:681

cuda::uuid_t
CUuuid uuid_t
The CUDA-driver-specific representation of a UUID value; see also {device_t::uuid()}.
Definition: types.hpp:971

cuda::context_t::stream_priority_range
context::stream_priority_range_t stream_priority_range() const
Get the range of priority values one can set for streams in this context.
Definition: context.hpp:518

cuda::device_t::name
::std::string name() const
Obtains this device's human-readable name, e.g.
Definition: device.hpp:342

cuda::device::id_t
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850

cuda::device::compute_capability_t
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75

cuda::device_t::id
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594

cuda::context::limit_t
CUlimit limit_t
Features of contexts which can be configured individually during a context's lifetime.
Definition: context.hpp:37

cuda::device_t::get_limit
device::limit_value_t get_limit(device::limit_t limit) const
Obtains the upper limit on the amount of a certain kind of resource this device offers.
Definition: device.hpp:469

cuda::launch
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394

cuda::context_t::set_limit
void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const
Set one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:653

cuda::device::default_
device_t default_()
Obtains (a proxy for) the default CUDA device, being the device with the default CUDA device id...
Definition: device.hpp:851

cuda::device_t::reset
void reset() const
Invalidates all memory allocations and resets all state regarding this CUDA device on the current ope...
Definition: device.hpp:522

cuda::context_t::set_shared_memory_bank_size
void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: context.hpp:632

cuda::device::attribute_t
CUdevice_attribute attribute_t
CUDA devices have both "attributes" and "properties".
Definition: types.hpp:856

cuda::device_t::can_access
bool can_access(const device_t &peer) const
Determine whether this device can access the global memory of another CUDA device.
Definition: device.hpp:231

cuda::context::host_thread_sync_scheduling_policy_t
host_thread_sync_scheduling_policy_t
Scheduling policies the CUDA driver may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:884

cuda::device::cpu
device_t cpu()
A named constructor idiom for a "dummy" CUDA device representing the CPU.
Definition: device.hpp:863

cuda::device_t::disable_access_to
void disable_access_to(const device_t &peer) const
Disable access by this device to the global memory of another device.
Definition: device.hpp:257

cuda::context::limit_value_t
size_t limit_value_t
Type for the actual values for context (see limit_t for the possible kinds of limits whose value can ...
Definition: context.hpp:41

cuda::synchronize
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:968

cuda::device_t::cache_preference
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing on this dev...
Definition: device.hpp:555

cuda::device_t::primary_context
device::primary_context_t primary_context(bool hold_pc_refcount_unit=false) const
Produce a proxy for the device's primary context - the one used by runtime API calls.
Definition: device.hpp:152

cuda::multiprocessor_cache_preference_t
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804

cuda::device_t::memory
context_t::global_memory_type memory() const
Definition: device.hpp:217

cuda::device_t::pci_id
device::pci_location_t pci_id() const
Obtains this device's location on the PCI express bus in terms of domain, bus and device id...
Definition: device.hpp:373

throw_if_error_lazy
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316

cuda::operator==
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762

cuda::array::handle_t
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension)
Definition: array.hpp:34

cuda::context_t::cache_preference
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing within this...
Definition: context.hpp:419

cuda::device_t::supports_block_cooperation
bool supports_block_cooperation() const
True if this device supports executing kernels in which blocks can directly cooperate beyond the use ...
Definition: device.hpp:435

current_device.hpp
Wrappers for getting and setting CUDA's choice of which device is 'current'.

cuda::device_t::architecture
device::compute_architecture_t architecture() const
Obtains the device's hardware architecture generation numeric designator; see cuda::device::compute_ar...
Definition: device.hpp:406

cuda::device_t::supports_concurrent_managed_access
bool supports_concurrent_managed_access() const
Determine whether this device can coherently access managed memory concurrently with the CPU...
Definition: device.hpp:426

cuda::device::pci_location_t
Location "coordinates" for a CUDA device on a PCIe bus.
Definition: pci_id.hpp:24

error.hpp
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...

cuda::device_t::compute_capability
device::compute_capability_t compute_capability() const
Obtains the device's compute capability; see cuda::device::compute_capability_t.
Definition: device.hpp:415

cuda::device_t::synchronize
const device_t & synchronize() const
Waits for all previously-scheduled tasks on all streams (= queues) on this device to conclude...
Definition: device.hpp:492

cuda::event::interprocess
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96

cuda::device::compute_architecture_t
A numeric designator of an architectural generation of CUDA devices.
Definition: device_properties.hpp:45

cuda::device_t::set_cache_preference
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing on this device...
Definition: device.hpp:546

cuda::stream::default_priority
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:249

cuda::device::properties_t
A structure holding a collection various properties of a device.
Definition: device_properties.hpp:149

cuda::event::sync_by_busy_waiting
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70

cuda::device::wrap
device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG
Returns a wrapper for the CUDA device with a given id.
Definition: device.hpp:825

cuda::event::not_interprocess
Can only be used by the process which created it.
Definition: constants.hpp:95

cuda::device_t::set_limit
void set_limit(device::limit_t limit, device::limit_value_t new_value) const
Set the upper limit of one of the named numeric resources on this device.
Definition: device.hpp:478

cuda::context_t::set_cache_preference
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing within this conte...
Definition: context.hpp:645

cuda::device::multiprocessor_count_t
int multiprocessor_count_t
Type of the number of multiprocessors within a single GPU.
Definition: device_properties.hpp:37

cuda::context_t::shared_memory_bank_size
context::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: context.hpp:500

cuda::device_t::stream_priority_range
device::stream_priority_range_t stream_priority_range() const
Determines the range of possible priorities for streams on this device.
Definition: device.hpp:661

cuda::device_t
Wrapper class for a CUDA device.
Definition: device.hpp:135

cuda::device_t::get_attribute
attribute_value_t get_attribute(device::attribute_t attribute) const
Obtain a numeric-value attribute of the device.
Definition: device.hpp:356

types.hpp
Fundamental CUDA-related type definitions.

memory.hpp
Freestanding wrapper functions for working with CUDA's various kinds of memory spaces, arranged into a relevant namespace hierarchy.

cuda::device_t::shared_memory_bank_size
device::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: device.hpp:578

device_properties.hpp
Classes representing specific and overall properties of CUDA devices.

pci_id.hpp
Definition of a wrapper class for CUDA PCI device ID information.