cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
device.hpp
1 
8 #pragma once
9 #ifndef CUDA_API_WRAPPERS_DEVICE_HPP_
10 #define CUDA_API_WRAPPERS_DEVICE_HPP_
11 
12 #include "types.hpp"
13 #include "current_device.hpp"
14 #include "device_properties.hpp"
15 #include "memory.hpp"
16 #include "pci_id.hpp"
17 #include "primary_context.hpp"
18 #include "error.hpp"
19 
20 #include <cuda_runtime_api.h>
21 
22 #include <string>
23 #include <cstring>
24 #include <type_traits>
25 
26 namespace cuda {
27 
29 class event_t;
30 class stream_t;
31 class device_t;
32 namespace memory {
33 class pool_t;
34 } // namespace memory
36 
46 void synchronize(const device_t& device);
47 
48 namespace device {
49 
51 class primary_context_t;
53 
54 using limit_t = context::limit_t;
55 using limit_value_t = context::limit_value_t;
56 using shared_memory_bank_size_t = context::shared_memory_bank_size_t;
57 
58 namespace detail_ {
59 
70 device_t wrap(
71  id_t id,
72  primary_context::handle_t primary_context_handle = context::detail_::none,
73  bool holds_primary_context_refcount_unit = false) NOEXCEPT_IF_NDEBUG;
74 
75 } // namespace detail_
76 
84 device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG;
85 
86 using stream_priority_range_t = context::stream_priority_range_t;
87 
88 namespace detail_ {
89 
90 inline ::std::string get_name(id_t id)
91 {
92  using size_type = int; // Yes, an int, that's what cuDeviceGetName takes
93  static constexpr const size_type initial_size_reservation { 100 };
94  static constexpr const size_type larger_size { 1000 }; // Just in case
95  char stack_buffer[initial_size_reservation];
96  auto buffer_size = static_cast<size_type>((sizeof(stack_buffer) / sizeof(char)));
97  auto try_getting_name = [&](char* buffer, size_type buffer_size_) -> size_type {
98  auto status = cuDeviceGetName(buffer, buffer_size_ - 1, id);
99  throw_if_error_lazy(status, "Failed obtaining the CUDA device name of device " + ::std::to_string(id));
100  buffer[buffer_size_-1] = '\0';
101  return static_cast<size_type>(::std::strlen(buffer));
102  };
103  auto prospective_name_length = try_getting_name(stack_buffer, initial_size_reservation);
104  if (prospective_name_length < buffer_size - 1) {
105  return { stack_buffer, static_cast<::std::string::size_type>(prospective_name_length) };
106  }
107  ::std::string result;
108  result.resize(larger_size);
109  // Note: we write through &result[0] rather than result.data(), since data() only gains a non-const overload in C++17
110  prospective_name_length = try_getting_name(&result[0], larger_size);
111  if (prospective_name_length >= larger_size - 1) {
112  throw ::std::runtime_error("CUDA device name longer than expected maximum size " + ::std::to_string(larger_size));
113  }
114  return { result.data(), static_cast<::std::string::size_type>(prospective_name_length) };
115 }
116 
117 } // namespace detail_
118 
119 } // namespace device
120 
135 class device_t {
136 protected: // types
137  using flags_type = device::flags_t;
138 
139 public: // types
141  using attribute_value_t = device::attribute_value_t;
142 
143 #if CUDA_VERSION >= 11040
144  class global_memory_type : public context_t::global_memory_type {
145 
159  size_t amount_used_for_graphs(
160  bool reserved = false,
161  bool high_watermark = false) const
162  {
163  auto attribute = reserved ?
164  (high_watermark ? CU_GRAPH_MEM_ATTR_RESERVED_MEM_HIGH : CU_GRAPH_MEM_ATTR_RESERVED_MEM_CURRENT) :
165  (high_watermark ? CU_GRAPH_MEM_ATTR_USED_MEM_HIGH : CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT);
166  size_t result;
167  auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result);
168  throw_if_error_lazy(status,
169  "Obtaining the amount of memory used for execution graphs on "
170  + device::detail_::identify(device_id_));
171  return result;
172  }
173 
177  void free_unused_graph_memory() const
178  {
179  auto status = cuDeviceGraphMemTrim(device_id_);
180  throw_if_error_lazy(status, "Freeing unused execution graph memory on "
181  + device::detail_::identify(device_id_));
182  }
183 
193  size_t amount_used_for_graphs(bool high_watermark = false) const
194  {
195  size_t result;
196  auto status = cuDeviceGetGraphMemAttribute(
197  device_id_,
198  high_watermark ?
199  CU_GRAPH_MEM_ATTR_USED_MEM_HIGH :
200  CU_GRAPH_MEM_ATTR_USED_MEM_CURRENT,
201  &result);
202  throw_if_error_lazy(status,
203  "Obtaining the current amount of memory used for execution graphs on "
204  + device::detail_::identify(device_id_));
205  return result;
206  }
207 
208  };
209 #endif // CUDA_VERSION >= 11040
210 
216  context_t::global_memory_type memory() const
217  {
218  return primary_context().memory();
219  }
220 
221 protected: // types
222 
223 public:
231  bool can_access(const device_t& peer) const
232  {
233  CAW_SET_SCOPE_CONTEXT(primary_context_handle());
234  int result;
235  auto status = cuDeviceCanAccessPeer(&result, id(), peer.id());
236  throw_if_error_lazy(status, "Failed determining whether "
237  + device::detail_::identify(id_) + " can access "
238  + device::detail_::identify(peer.id_));
239  return (result == 1);
240  }
241 
247  void enable_access_to(const device_t& peer) const
248  {
249  primary_context().enable_access_to(peer.primary_context());
250  }
251 
257  void disable_access_to(const device_t& peer) const
258  {
259  primary_context().disable_access_to(peer.primary_context());
260  }
261 
262 
263 #if CUDA_VERSION >= 9020
264  uuid_t uuid () const {
265  uuid_t result;
266  auto status = cuDeviceGetUuid(&result, id_);
267  throw_if_error_lazy(status, "Failed obtaining UUID for " + device::detail_::identify(id_));
268  return result;
269  }
270 #endif // CUDA_VERSION >= 9020
271 
272 protected:
273  void cache_and_ensure_primary_context_activation() const {
274  if (primary_context_handle_ == context::detail_::none) {
275  primary_context_handle_ = device::primary_context::detail_::obtain_and_increase_refcount(id_);
276  holds_pc_refcount_unit_ = true;
277  }
278  }
279 
280  context::handle_t primary_context_handle() const
281  {
282  cache_and_ensure_primary_context_activation();
283  return primary_context_handle_;
284  }
285 
286  void set_flags(flags_type new_flags) const
287  {
288  new_flags &= ~CU_CTX_MAP_HOST;
289  // CU_CTX_MAP_HOST is (mostly) ignored since CUDA 3.2, and has been officially
290  // deprecated in CUDA 11. Moreover, in CUDA 11 (and possibly other versions),
291  // the flags you get with cuDevicePrimaryCtxGetState() and cuCtxGetFlags()
292  // differ on this particular flag - and cuDevicePrimaryCtxSetFlags() doesn't
293  // like seeing it.
294  auto status = cuDevicePrimaryCtxSetFlags(id(), new_flags);
295  throw_if_error_lazy(status, "Failed setting (primary context) flags for device " + device::detail_::identify(id_));
296  }
297 
298  context::flags_t flags() const
299  {
300  return device::primary_context::detail_::flags(id_);
301  }
302 
303 public:
312  device::primary_context_t primary_context(bool hold_pc_refcount_unit = false) const;
313 
314 #if CUDA_VERSION >= 11020
315  memory::pool_t default_memory_pool() const;
316 #endif
317 public:
318 
324  properties_t properties() const
325  {
326  properties_t properties;
327  auto status = cudaGetDeviceProperties(&properties, id());
328  throw_if_error_lazy(status, "Failed obtaining device properties for " + device::detail_::identify(id_));
329  return properties;
330  }
331 
332  static device_t choose_best_match(const properties_t& properties) {
333  device::id_t id;
334  auto status = cudaChooseDevice(&id, &properties);
335  throw_if_error_lazy(status, "Failed choosing a best matching device by a property set.");
336  return device::wrap(id);
337  }
338 
342  ::std::string name() const
343  {
344  // If I were lazy, I would just write:
345  // return properties().name;
346  // and let you wait for all of that to get populated. But not me!
347  return cuda::device::detail_::get_name(id_);
348  }
349 
356  attribute_value_t get_attribute(device::attribute_t attribute) const
357  {
358  attribute_value_t attribute_value;
359  auto status = cuDeviceGetAttribute(&attribute_value, attribute, id_);
360  throw_if_error_lazy(status, "Failed obtaining a device attribute value for " + device::detail_::identify(id_));
361  return attribute_value;
362  }
363 
364  grid::block_dimension_t maximum_threads_per_block() const
365  {
366  return get_attribute(CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK);
367  }
368 
373  device::pci_location_t pci_id() const
374  {
375  auto pci_domain_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID);
376  auto pci_bus_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_BUS_ID);
377  auto pci_device_id = get_attribute(CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID);
378  return {pci_domain_id, pci_bus_id, pci_device_id, {}};
379  }
380 
381  device::multiprocessor_count_t multiprocessor_count() const
382  {
383  return get_attribute(CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT);
384  }
385 
386 #if CUDA_VERSION >= 10020
387 
392  bool supports_virtual_memory_management() const
393  {
394 #if CUDA_VERSION >= 11030
395  return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED);
396 #else
397  return get_attribute(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED);
398 #endif // CUDA_VERSION >= 11030
399  }
400 #endif // CUDA_VERSION >= 10020
401 
406  device::compute_architecture_t architecture() const
407  {
408  unsigned major = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR);
409  return { major };
410  }
411 
415  device::compute_capability_t compute_capability() const
416  {
417  auto major = architecture();
418  unsigned minor = get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR);
419  return {major, minor};
420  }
421 
426  bool supports_concurrent_managed_access() const
427  {
428  return (get_attribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) != 0);
429  }
430 
435  bool supports_block_cooperation() const
436  {
437  return get_attribute(CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH);
438  }
439 
440 #if CUDA_VERSION >= 12000
441 
445  bool supports_block_clustering() const
446  {
447  return get_attribute(CU_DEVICE_ATTRIBUTE_CLUSTER_LAUNCH);
448  }
449 #endif
450 
451 #if CUDA_VERSION >= 11020
452 
456  bool supports_memory_pools() const
457  {
458  return get_attribute(CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED);
459  }
460 
461 #endif // CUDA_VERSION >= 11020
462 
469  device::limit_value_t get_limit(device::limit_t limit) const
470  {
471  return primary_context().get_limit(limit);
472  }
473 
478  void set_limit(device::limit_t limit, device::limit_value_t new_value) const
479  {
480  primary_context().set_limit(limit, new_value);
481  }
482 
492  const device_t& synchronize() const
493  {
494  cuda::synchronize(*this);
495  return *this;
496  }
497 
498  device_t& synchronize()
499  {
500  cuda::synchronize(*this);
501  return *this;
502  }
503 
504  const device_t& make_current() const
505  {
506  device::current::set(*this);
507  return *this;
508  }
509 
510  device_t& make_current()
511  {
512  device::current::set(*this);
513  return *this;
514  }
515 
522  void reset() const
523  {
524  // Notes:
525  //
526  // 1. We _cannot_ use cuDevicePrimaryCtxReset() - because that one only affects
527  // the device's primary context, while cudaDeviceReset() destroys _all_ contexts for
528  // the device.
529  // 2. We don't need the primary context to be active here, so not using the usual
530  // primary_context_handle() getter mechanism.
531 
532  auto pc_handle = (primary_context_handle_ == context::detail_::none) ?
533  device::primary_context::detail_::obtain_and_increase_refcount(id_) :
534  primary_context_handle_;
535  CAW_SET_SCOPE_CONTEXT(pc_handle);
536  auto status = cudaDeviceReset();
537  throw_if_error_lazy(status, "Resetting " + device::detail_::identify(id_));
538  }
539 
546  void set_cache_preference(multiprocessor_cache_preference_t preference) const
547  {
548  primary_context().set_cache_preference(preference);
549  }
550 
555  multiprocessor_cache_preference_t cache_preference() const
556  {
557  return primary_context().cache_preference();
558  }
559 
560 #if CUDA_VERSION < 12030
561 
567  void set_shared_memory_bank_size(device::shared_memory_bank_size_t new_bank_size) const
568  {
569  primary_context().set_shared_memory_bank_size(new_bank_size);
570  }
571 
578  device::shared_memory_bank_size_t shared_memory_bank_size() const
579  {
580  return primary_context().shared_memory_bank_size();
581  }
582 #endif // CUDA_VERSION < 12030
583 
584  // For some reason, there is no cudaFuncGetCacheConfig. Weird.
585  //
586  // template <typename KernelFunction>
587  // inline multiprocessor_cache_preference_t kernel_cache_preference(
588  // const KernelFunction* kernel, multiprocessor_cache_preference_t preference);
589 
594  device::id_t id() const noexcept
595  {
596  return id_;
597  }
598 
607  stream_t default_stream(bool hold_primary_context_refcount_unit = false) const;
608 
610  stream_t create_stream(
611  bool will_synchronize_with_default_stream,
612  stream::priority_t priority = stream::default_priority) const;
613 
615  event_t create_event(
616  bool uses_blocking_sync = event::sync_by_busy_waiting, // Yes, that's the runtime default
617  bool records_timing = event::do_record_timings,
618  bool interprocess = event::not_interprocess) const;
619 
621  context_t create_context(
622  context::host_thread_sync_scheduling_policy_t sync_scheduling_policy = context::heuristic,
623  bool keep_larger_local_mem_after_resize = false) const;
624 
625 #if CUDA_VERSION >= 11020
626 
628  template <memory::pool::shared_handle_kind_t Kind = memory::pool::shared_handle_kind_t::no_export>
629  memory::pool_t create_memory_pool() const;
630 
631 #endif
632 
648  template<typename Kernel, typename ... KernelParameters>
649  void launch(
650  Kernel kernel,
651  launch_configuration_t launch_configuration,
652  KernelParameters... arguments) const;
653 
661  device::stream_priority_range_t stream_priority_range() const
662  {
663  return primary_context().stream_priority_range();
664  }
665 
666 public:
667  context::host_thread_sync_scheduling_policy_t sync_scheduling_policy() const
668  {
669  return context::host_thread_sync_scheduling_policy_t(flags() & CU_CTX_SCHED_MASK);
670  }
671 
672  void set_sync_scheduling_policy(context::host_thread_sync_scheduling_policy_t new_policy)
673  {
674  auto other_flags = flags() & ~CU_CTX_SCHED_MASK;
675  set_flags(other_flags | static_cast<flags_type>(new_policy));
676  }
677 
681  bool keeping_larger_local_mem_after_resize() const
682  {
683  return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
684  }
685 
689  void keep_larger_local_mem_after_resize(bool keep = true)
690  {
691  auto other_flags = flags() & ~CU_CTX_LMEM_RESIZE_TO_MAX;
692  flags_type new_flags = other_flags | (keep ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
693  set_flags(new_flags);
694  }
695 
699  void dont_keep_larger_local_mem_after_resize()
700  {
701  keep_larger_local_mem_after_resize(false);
702  }
703 
704 protected:
705  void maybe_decrease_primary_context_refcount() const
706  {
707  if (holds_pc_refcount_unit_) {
708  device::primary_context::detail_::decrease_refcount(id_);
709  }
710  }
711 
712 public: // constructors and destructor
713 
714  friend void swap(device_t& lhs, device_t& rhs) noexcept
715  {
716  ::std::swap(lhs.id_, rhs.id_);
717  ::std::swap(lhs.primary_context_handle_, rhs.primary_context_handle_);
718  ::std::swap(lhs.holds_pc_refcount_unit_, rhs.holds_pc_refcount_unit_);
719  }
720 
721  ~device_t() NOEXCEPT_IF_NDEBUG
722  {
723 #ifndef NDEBUG
724  maybe_decrease_primary_context_refcount();
725 #else
726  if (holds_pc_refcount_unit_) {
727  device::primary_context::detail_::decrease_refcount_nothrow(id_);
728  // Swallow any error to avoid termination on throwing from a dtor
729  }
730 #endif
731  }
732 
733  device_t(device_t&& other) noexcept : id_(other.id_)
734  {
735  swap(*this, other);
736  }
737 
738  device_t(const device_t& other) noexcept : id_(other.id_) { }
739  // Device proxies are not owning - as devices aren't allocated nor de-allocated.
740  // Also, the proxies don't hold any state (except for one bit regarding whether
741  // or not the device proxy has increased the primary context refcount); it's
742  // the devices _themselves_ which have state; so there's no problem copying
743  // the proxies around. This is unlike events and streams, which get created
744  // and destroyed.
745 
746  device_t& operator=(const device_t& other) noexcept
747  {
748  maybe_decrease_primary_context_refcount();
749  id_ = other.id_;
750  primary_context_handle_ = other.primary_context_handle_;
751  holds_pc_refcount_unit_ = false;
752  return *this;
753  }
754 
755  device_t& operator=(device_t&& other) noexcept
756  {
757  swap(*this, other);
758  return *this;
759  }
760 
761 protected: // constructors
762 
767  explicit device_t(
768  device::id_t device_id,
769  device::primary_context::handle_t primary_context_handle = context::detail_::none,
770  bool hold_primary_context_refcount_unit = false) NOEXCEPT_IF_NDEBUG
771  :
772  id_(device_id),
773  primary_context_handle_(primary_context_handle),
774  holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
775  {
776 #ifndef NDEBUG
777  if (id_ < 0) {
778  throw ::std::invalid_argument("Attempt to construct a CUDA device object for a negative device ID of " + ::std::to_string(id_));
779  }
780 #endif
781  }
782 
783 public: // friends
784  friend device_t device::detail_::wrap(
785  device::id_t,
786  device::primary_context::handle_t primary_context_handle,
787  bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG;
788 
789 protected: // data members
790  device::id_t id_;
791  mutable device::primary_context::handle_t primary_context_handle_ { context::detail_::none };
794  mutable bool holds_pc_refcount_unit_ {false };
797 };
798 
800 inline bool operator==(const device_t& lhs, const device_t& rhs)
801 {
802  return lhs.id() == rhs.id();
803 }
804 
805 inline bool operator!=(const device_t& lhs, const device_t& rhs)
806 {
807  return lhs.id() != rhs.id();
808 }
810 
811 namespace device {
812 
813 namespace detail_ {
814 
815 inline device_t wrap(
816  id_t id,
817  primary_context::handle_t primary_context_handle,
818  bool hold_primary_context_refcount_unit) NOEXCEPT_IF_NDEBUG
819 {
820  return device_t{ id, primary_context_handle, hold_primary_context_refcount_unit };
821 }
822 
823 } // namespace detail_
824 
825 inline device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG
826 {
827  return detail_::wrap(id);
828 }
829 
837 inline device_t get(id_t id)
838 {
839 #ifndef NDEBUG
840  if (id < 0) {
841  throw ::std::invalid_argument("Attempt to obtain a CUDA device with a negative device ID " + ::std::to_string(id));
842  }
843 #endif
844  ensure_driver_is_initialized(); // The device_t class mostly assumes the driver has been initialized
845  return wrap(id);
846 }
847 
851 inline device_t default_()
852 {
853  return get(default_device_id);
854 }
855 
863 inline device_t cpu() { return get(CU_DEVICE_CPU); }
864 
865 namespace current {
866 
870 inline device_t get()
871 {
873  auto id = detail_::get_id();
874  auto pc_handle = primary_context::detail_::obtain_and_increase_refcount(id);
875  return device::detail_::wrap(id, pc_handle);
876 }
877 
878 inline void set(const device_t& device)
879 {
880  auto pc = device.primary_context();
881  context::current::detail_::set(pc.handle());
882 }
883 
884 } // namespace current
885 
892 inline device_t get(pci_location_t pci_id)
893 {
894  auto resolved_id = device::detail_::resolve_id(pci_id);
895  return get(resolved_id);
896 }
897 
909 inline device_t get(const ::std::string& pci_id_str)
910 {
911  auto parsed_pci_id = pci_location_t::parse(pci_id_str);
912  return get(parsed_pci_id);
913 }
914 
915 } // namespace device
916 
917 } // namespace cuda
918 
919 #endif // CUDA_API_WRAPPERS_DEVICE_HPP_
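
To illustrate the device_t proxy API listed above, here is a minimal usage sketch. It assumes the library's umbrella header is available as <cuda/api.hpp>, that at least one CUDA device is present, and that printing to std::cout is acceptable; the chosen fields are arbitrary.

#include <cuda/api.hpp>
#include <iostream>

int main()
{
	// Obtain a proxy for device 0 (this also ensures the driver is initialized)
	auto device = cuda::device::get(0);

	std::cout
		<< "Device " << device.id() << ": " << device.name() << '\n'
		<< "  Multiprocessors:   " << device.multiprocessor_count() << '\n'
		<< "  Max threads/block: " << device.maximum_threads_per_block() << '\n'
		<< "  Compute capability: "
		<< device.get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR) << '.'
		<< device.get_attribute(CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR) << '\n';

	device.make_current();  // make this device current for the calling host thread
	device.synchronize();   // wait for all work previously scheduled on the device
}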
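
A sketch of peer-access management via can_access() and enable_access_to(), assuming a machine with at least two devices numbered 0 and 1.

#include <cuda/api.hpp>

void enable_bidirectional_peer_access()
{
	auto gpu_0 = cuda::device::get(0);
	auto gpu_1 = cuda::device::get(1);

	// Peer access is directional, so check and enable each direction separately
	if (gpu_0.can_access(gpu_1) and gpu_1.can_access(gpu_0)) {
		gpu_0.enable_access_to(gpu_1);
		gpu_1.enable_access_to(gpu_0);
	}
}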
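
The namespace-level getters near the end of the listing can be exercised as in the following sketch; the PCIe location string is only an example value, and current::set() switches the calling thread's current device.

#include <cuda/api.hpp>

void choose_devices_example()
{
	auto by_id      = cuda::device::get(0);              // by numeric ID
	auto implicit   = cuda::device::default_();          // the default device ID
	auto by_pci_str = cuda::device::get("0000:01:00.0"); // by PCIe location (example value)

	cuda::device::current::set(by_pci_str);              // make it current for this thread
	auto current = cuda::device::current::get();

	(void) by_id; (void) implicit; (void) current;
}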
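
Several device_t methods forward to the device's primary context (flags, limits, cache preference). Below is a small configuration sketch; the choice of CU_LIMIT_STACK_SIZE and the doubling of its value are illustrative assumptions, not recommendations.

#include <cuda/api.hpp>

void configure_device_example()
{
	auto device = cuda::device::get(0);

	// The host-thread synchronization policy is stored in the primary context's flags
	device.set_sync_scheduling_policy(cuda::context::heuristic);

	// Resource limits are likewise forwarded to the primary context
	auto stack_size = device.get_limit(CU_LIMIT_STACK_SIZE);
	device.set_limit(CU_LIMIT_STACK_SIZE, stack_size * 2);
}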