cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_MEMORY_HPP_
#define MULTI_WRAPPER_IMPLS_MEMORY_HPP_

#include "context.hpp"
#include "ipc.hpp"

#include <cuda_runtime_api.h>

#include "../memory.hpp"
#include "../array.hpp"
#include "../device.hpp"
#include "../event.hpp"
#include "../pointer.hpp"
#include "../stream.hpp"
#include "../primary_context.hpp"
#include "../kernel.hpp"
#include "../virtual_memory.hpp"
#include "../memory_pool.hpp"
#include "../current_device.hpp"

namespace cuda {

namespace memory {

template <typename T, dimensionality_t NumDimensions>
inline void copy(array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream)
{
    if (not stream) {
        memory::copy<T, NumDimensions>(destination, source);
        return;
    }
#ifndef NDEBUG
    if (source.size() != destination.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy " + ::std::to_string(source.size()) +
            " elements into an array of " + ::std::to_string(destination.size()) + " elements");
    }
#endif
    detail_::copy<T, NumDimensions>(destination, source.data(), stream->handle());
}

// Note: Assumes the destination, source and stream are all usable in the same context
template <typename T, dimensionality_t NumDimensions>
inline void copy(T* destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream)
{
    if (not stream) {
        memory::copy(context_of(destination), destination, source);
        return;
    }
    if (stream->context_handle() != source.context_handle()) {
        throw ::std::invalid_argument("Attempt to copy an array in "
            + context::detail_::identify(source.context_handle()) + " via "
            + stream::detail_::identify(*stream));
    }
    detail_::copy<T, NumDimensions>(destination, source, stream->handle());
}

template<dimensionality_t NumDimensions>
void copy(copy_parameters_t<NumDimensions> params, optional_ref<const stream_t> stream)
{
    stream::handle_t stream_handle = stream ? stream->handle() : nullptr;
    status_t status = detail_::multidim_copy(params, stream_handle);
    throw_if_error_lazy(status, "Copying using a general copy parameters structure");
}

template <typename T>
void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream)
{
    memory::copy(destination, source, sizeof(T), stream);
}

// Note: Assumes the source pointer is valid in the stream's context
template <typename T, dimensionality_t NumDimensions>
inline void copy(array_t<T, NumDimensions>& destination, const T* source, optional_ref<const stream_t> stream)
{
    if (not stream) {
        memory::copy(destination, context_of(source), source);
        return;
    }
    detail_::copy<T, NumDimensions>(destination, source, stream->handle());
}

inline void copy(void *destination, const void *source, size_t num_bytes, optional_ref<const stream_t> stream)
{
    if (not stream) {
        context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
        auto result = cuMemcpy(device::address(destination), device::address(source), num_bytes);
        // TODO: Determine whether it was from host to device, device to host etc. and
        // add this information to the error string
        throw_if_error_lazy(result, "Synchronously copying data");
        return;
    }
    detail_::copy(destination, source, num_bytes, stream->handle());
}
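
// Usage sketch (illustrative only, not part of this header): copying a buffer to and from
// device memory with the untyped copy() overload above. The variable names (`dev`, `stream`,
// `n`) are hypothetical; the stream-taking form only enqueues the copy rather than completing it.
//
//     ::std::vector<float> host_buf(n);
//     auto dev_buf = cuda::memory::device::allocate(dev, n * sizeof(float));
//     cuda::memory::copy(dev_buf.start(), host_buf.data(), n * sizeof(float));          // synchronous
//     cuda::memory::copy(host_buf.data(), dev_buf.start(), n * sizeof(float), stream);  // enqueued on `stream`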

namespace device {

inline region_t allocate(const context_t& context, size_t size_in_bytes)
{
    return detail_::allocate(context.handle(), size_in_bytes);
}

inline region_t allocate(const device_t& device, size_t size_in_bytes)
{
    auto pc = device.primary_context();
    return allocate(pc, size_in_bytes);
}

#if CUDA_VERSION >= 11020
inline region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream = {})
{
    return stream ?
        detail_::allocate(stream->context().handle(), size_in_bytes, stream->handle()) :
        detail_::allocate_in_current_context(size_in_bytes);
}
#endif // CUDA_VERSION >= 11020

#if CUDA_VERSION >= 11020
inline void free(void* region_start, optional_ref<const stream_t> stream)
#else
inline void free(void* region_start)
#endif // CUDA_VERSION >= 11020
{
#if CUDA_VERSION >= 11020
    if (stream) {
        detail_::free_on_stream(region_start, stream->handle());
        return;
    }
#endif
    context::current::detail_::scoped_existence_ensurer_t ensurer;
    detail_::free_in_current_context(ensurer.context_handle, region_start);
}
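
// Usage sketch (illustrative only): pairing allocate() and free(). The device and stream names
// are hypothetical, and the no-stream free() call assumes the declaration defaults its stream
// argument. With CUDA 11.2 and later, passing a stream makes allocation and deallocation
// stream-ordered (asynchronous).
//
//     auto region = cuda::memory::device::allocate(dev, 4096);        // on dev's primary context
//     cuda::memory::device::free(region.start());                     // synchronous free
//
//     auto async_region = cuda::memory::device::allocate(4096, stream);
//     cuda::memory::device::free(async_region.start(), stream);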

} // namespace device

namespace inter_context {

inline void copy(
    void * destination,
    const context_t& destination_context,
    const void * source,
    const context_t& source_context,
    size_t num_bytes,
    optional_ref<const stream_t> stream = {})
{
    auto status = stream ?
        cuMemcpyPeerAsync(
            device::address(destination),
            destination_context.handle(),
            device::address(source),
            source_context.handle(),
            num_bytes,
            stream->handle()) :
        cuMemcpyPeer(
            device::address(destination),
            destination_context.handle(),
            device::address(source),
            source_context.handle(),
            num_bytes);

    // TODO: Determine whether it was from host to device, device to host etc. and
    // add this information to the error string
    throw_if_error_lazy(status,
        ::std::string("Failed copying data between devices: From address ")
        + cuda::detail_::ptr_as_hex(source) + " in "
        + context::detail_::identify(source_context.handle()) + " to address "
        + cuda::detail_::ptr_as_hex(destination) + " in "
        + context::detail_::identify(destination_context.handle()) +
        (stream ? " on " + stream::detail_::identify(*stream) : ""));
}
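
// Usage sketch (illustrative only): copying between allocations living in two different
// contexts - here, the primary contexts of two hypothetical devices `dev_a` and `dev_b`.
//
//     auto ctx_a = dev_a.primary_context();
//     auto ctx_b = dev_b.primary_context();
//     cuda::memory::inter_context::copy(dst_ptr, ctx_b, src_ptr, ctx_a, num_bytes);          // synchronous
//     cuda::memory::inter_context::copy(dst_ptr, ctx_b, src_ptr, ctx_a, num_bytes, stream);  // enqueued variant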

} // namespace inter_context

namespace managed {

namespace detail_ {

template <typename GenericRegion>
inline device_t region_helper<GenericRegion>::preferred_location() const
{
    auto device_id = range::detail_::get_scalar_attribute<cuda::device::id_t>(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION);
    return cuda::device::get(device_id);
}

template <typename GenericRegion>
inline void region_helper<GenericRegion>::set_preferred_location(device_t& device) const
{
    range::detail_::set_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, device.id());
}

template <typename GenericRange>
inline void region_helper<GenericRange>::clear_preferred_location() const
{
    range::detail_::unset_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION);
}

} // namespace detail_

inline void advise_expected_access_by(const_region_t region, device_t& device)
{
    range::detail_::advise(region, CU_MEM_ADVISE_SET_ACCESSED_BY, device.id());
}

inline void advise_no_access_expected_by(const_region_t region, device_t& device)
{
    range::detail_::advise(region, CU_MEM_ADVISE_UNSET_ACCESSED_BY, device.id());
}

template <typename Allocator>
::std::vector<device_t, Allocator> expected_accessors(const_region_t region, const Allocator& allocator)
{
    auto num_devices = cuda::device::count();
    ::std::vector<device_t, Allocator> devices(num_devices, allocator);
    auto device_ids = reinterpret_cast<cuda::device::id_t *>(devices.data());

    auto status = cuMemRangeGetAttribute(
        device_ids, sizeof(device_t) * devices.size(),
        CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, device::address(region.start()), region.size());
    throw_if_error_lazy(status, "Obtaining the IDs of devices with access to the managed memory range at "
        + cuda::detail_::ptr_as_hex(region.start()));
    auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId);
    // We may have gotten fewer results than the number of devices in the system, so whittle the vector down
    if (first_invalid_element - device_ids != num_devices) {
        devices.resize(first_invalid_element - device_ids);
    }

    return devices;
}

inline void prefetch(
    const_region_t region,
    const cuda::device_t& destination,
    const stream_t& stream)
{
    detail_::prefetch(region, destination.id(), stream.handle());
}

inline void prefetch_to_host(const_region_t region, const stream_t& stream)
{
    detail_::prefetch(region, CU_DEVICE_CPU, stream.handle());
}

inline region_t allocate(
    const context_t& context,
    size_t num_bytes,
    initial_visibility_t initial_visibility)
{
    return detail_::allocate(context.handle(), num_bytes, initial_visibility);
}

inline region_t allocate(
    const device_t& device,
    size_t num_bytes,
    initial_visibility_t initial_visibility)
{
    auto pc = device.primary_context();
    return allocate(pc, num_bytes, initial_visibility);
}

inline region_t allocate(size_t num_bytes)
{
    auto context_handle = context::current::detail_::get_with_fallback_push();
    return detail_::allocate(context_handle, num_bytes, initial_visibility_t::to_all_devices);
}
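
// Usage sketch (illustrative only): allocating managed (unified) memory and prefetching it
// to a hypothetical device `dev` before work runs there, then back to the host. The stream
// name is also hypothetical.
//
//     auto region = cuda::memory::managed::allocate(1024 * 1024);
//     cuda::memory::managed::prefetch(region, dev, stream);
//     // ... enqueue work on `stream` that reads/writes the region on `dev` ...
//     cuda::memory::managed::prefetch_to_host(region, stream);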

} // namespace managed

namespace mapped {

inline region_pair_t allocate(
    cuda::device_t& device,
    size_t size_in_bytes,
    allocation_options options)
{
    auto pc = device.primary_context();
    return cuda::memory::mapped::detail_::allocate(pc.handle(), size_in_bytes, options);
}

inline region_pair_t allocate(
    cuda::context_t& context,
    size_t size_in_bytes,
    allocation_options options)
{
    return cuda::memory::mapped::detail_::allocate(context.handle(), size_in_bytes, options);
}

} // namespace mapped

namespace host {

namespace detail_ {

inline region_t allocate_in_current_context(
    size_t size_in_bytes,
    allocation_options options)
{
    void* allocated = nullptr;
    auto flags = memory::detail_::make_cuda_host_alloc_flags(options);
    auto result = cuMemHostAlloc(&allocated, size_in_bytes, flags);
    if (is_success(result) && allocated == nullptr) {
        // Can this even happen? Hopefully not
        result = static_cast<status_t>(status::named_t::unknown);
    }
    throw_if_error_lazy(result, "Failed allocating " + ::std::to_string(size_in_bytes) + " bytes of host memory");
    return { allocated, size_in_bytes };
}

inline region_t allocate(
    const context::handle_t context_handle,
    size_t size_in_bytes,
    allocation_options options)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(size_in_bytes, options);
}

} // namespace detail_

inline region_t allocate(
    size_t size_in_bytes,
    allocation_options options)
{
    static constexpr const bool dont_decrease_pc_refcount_on_destruct { false };
    context::current::detail_::scoped_existence_ensurer_t context_ensurer{ dont_decrease_pc_refcount_on_destruct };
    // Note: We allow a PC to leak here, in case no other context existed, so as not to risk
    // the allocation being invalidated by the only CUDA context getting destroyed when
    // leaving this function
    return detail_::allocate_in_current_context(size_in_bytes, options);
}
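
// Usage sketch (illustrative only): allocating pinned host memory. This assumes the declaration
// of allocate() in the main memory.hpp provides a default value for `options`; the size and
// variable names are hypothetical.
//
//     auto pinned = cuda::memory::host::allocate(1 << 20);
//     // ... use pinned.start() / pinned.size() as a staging buffer for copies to and from the device ...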

} // namespace host

namespace pointer {
namespace detail_ {

template<attribute_t attribute>
status_and_attribute_value<attribute> get_attribute_with_status(const void *ptr)
{
    context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context;
    attribute_value_t<attribute> attribute_value;
    auto status = cuPointerGetAttribute(&attribute_value, attribute, device::address(ptr));
    return { status, attribute_value };
}

template<attribute_t attribute>
attribute_value_t<attribute> get_attribute(const void *ptr)
{
    auto status_and_attribute_value = get_attribute_with_status<attribute>(ptr);
    throw_if_error_lazy(status_and_attribute_value.status,
        "Obtaining attribute " + ::std::to_string(static_cast<int>(attribute))
        + " for pointer " + cuda::detail_::ptr_as_hex(ptr));
    return status_and_attribute_value.value;
}

// TODO: Consider switching to a span with C++20
inline void get_attributes(unsigned num_attributes, pointer::attribute_t* attributes, void** value_ptrs, const void* ptr)
{
    context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context;
    auto status = cuPointerGetAttributes(num_attributes, attributes, value_ptrs, device::address(ptr));
    throw_if_error_lazy(status, "Obtaining multiple attributes for pointer " + cuda::detail_::ptr_as_hex(ptr));
}

} // namespace detail_
} // namespace pointer

namespace device {

template <typename T>
inline void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream)
{
    if (stream) {
        detail_::set(start, value, num_elements, stream->handle());
        return;
    }
    context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
    static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
    static_assert(sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
        "Unsupported type size - only sizes 1, 2 and 4 are supported");
    // TODO: Consider checking for alignment when compiling without NDEBUG
    status_t result {CUDA_SUCCESS};
    switch(sizeof(T)) {
    case 1: result = stream ?
        cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
        cuMemsetD8      (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements); break;
    case 2: result = stream ?
        cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
        cuMemsetD16     (address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements); break;
    case 4: result = stream ?
        cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
        cuMemsetD32     (address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements); break;
    }
    throw_if_error_lazy(result, "Setting global device memory bytes");
}
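
// Usage sketch (illustrative only): filling a device-side buffer of floats. Names are
// hypothetical; the stream-taking call only enqueues the fill.
//
//     auto region = cuda::memory::device::allocate(dev, n * sizeof(float));
//     auto buf = static_cast<float*>(region.start());
//     cuda::memory::device::typed_set(buf, 1.0f, n);          // synchronous fill with 1.0f
//     cuda::memory::device::typed_set(buf, 0.0f, n, stream);  // asynchronous fill with 0.0f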

} // namespace device

inline void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream)
{
    switch ( type_of(ptr) ) {
    case device_:
//  case managed_:
    case unified_:
        memory::device::set(ptr, byte_value, num_bytes, stream); break;
//  case unregistered_:
    case host_:
        if (stream) {
            throw ::std::invalid_argument("Asynchronous host-memory set is not currently supported");
        } else { ::std::memset(ptr, byte_value, num_bytes); }
        break;
    default:
        throw runtime_error(
            cuda::status::invalid_value,
            "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail_::ptr_as_hex(ptr));
    }
}

#if CUDA_VERSION >= 11020
namespace pool {

template<shared_handle_kind_t SharedHandleKind>
pool_t create(const cuda::device_t& device)
{
    return detail_::create<SharedHandleKind>(device.id());
}

inline region_t allocate(const pool_t& pool, const stream_t& stream, size_t num_bytes)
{
    CUdeviceptr dptr;
    auto status = cuMemAllocFromPoolAsync(&dptr, num_bytes, pool.handle(), stream.handle());
    throw_if_error_lazy(status, "Failed scheduling an allocation of " + ::std::to_string(num_bytes)
        + " bytes of memory from " + detail_::identify(pool) + ", on " + stream::detail_::identify(stream));
    return { as_pointer(dptr), num_bytes };
}
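
// Usage sketch (illustrative only): stream-ordered allocation from a device's default memory
// pool (see device_t::default_memory_pool() further down in this file). Names are hypothetical.
//
//     auto pool = dev.default_memory_pool();
//     auto region = cuda::memory::pool::allocate(pool, stream, num_bytes);
//     // ... use region.start() in work enqueued on `stream` ...
//     cuda::memory::device::free(region.start(), stream);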

namespace ipc {

template <shared_handle_kind_t Kind>
shared_handle_t<Kind> export_(const pool_t& pool)
{
    shared_handle_t<Kind> result;
    static constexpr const unsigned long long flags { 0 };
    auto status = cuMemPoolExportToShareableHandle(&result, pool.handle(), static_cast<CUmemAllocationHandleType>(Kind), flags);
    throw_if_error_lazy(status, "Exporting " + pool::detail_::identify(pool) + " for inter-process use");
    return result;
}

template <shared_handle_kind_t Kind>
pool_t import(const device_t& device, const shared_handle_t<Kind>& shared_pool_handle)
{
    auto handle = detail_::import<Kind>(shared_pool_handle);
    // TODO: MUST SUPPORT SAYING THIS POOL CAN'T ALLOCATE - NOT AN EXTRA FLAG IN THE POOL CLASS
    return memory::pool::wrap(device.id(), handle, do_not_take_ownership);
}

} // namespace ipc

} // namespace pool

inline region_t pool_t::allocate(const stream_t& stream, size_t num_bytes) const
{
    return pool::allocate(*this, stream, num_bytes);
}

inline cuda::device_t pool_t::device() const noexcept
{
    return cuda::device::wrap(device_id_);
}

inline pool::ipc::imported_ptr_t pool_t::import(const memory::pool::ipc::ptr_handle_t& exported_handle) const
{
    return pool::ipc::import_ptr(*this, exported_handle);
}

inline permissions_t get_permissions(const cuda::device_t& device, const pool_t& pool)
{
    return cuda::memory::detail_::get_permissions(device.id(), pool.handle());
}

inline void set_permissions(const cuda::device_t& device, const pool_t& pool, permissions_t permissions)
{
    if (pool.device_id() == device.id()) {
        throw ::std::invalid_argument("Cannot change the access permissions to a pool of the device "
            "on which the pool's memory is allocated (" + cuda::device::detail_::identify(device.id()) + ')');
    }
    cuda::memory::detail_::set_permissions(device.id(), pool.handle(), permissions);
}

template <typename DeviceRange>
void set_permissions(DeviceRange devices, const pool_t& pool, permissions_t permissions)
{
    // Not depending on unique_span here :-(
    auto device_ids = ::std::unique_ptr<cuda::device::id_t[]>(new cuda::device::id_t[devices.size()]);
    auto device_to_id = [](device_t const& device){ return device.id(); };
    ::std::transform(::std::begin(devices), ::std::end(devices), device_ids.get(), device_to_id);
    cuda::memory::detail_::set_permissions({ device_ids.get(), devices.size() }, pool.handle(), permissions);
}
#endif // CUDA_VERSION >= 11020

} // namespace memory

#if CUDA_VERSION >= 11020

template <memory::pool::shared_handle_kind_t Kind>
memory::pool_t device_t::create_memory_pool() const
{
    return cuda::memory::pool::detail_::create<Kind>(id_);
}

inline memory::region_t stream_t::enqueue_t::allocate(const memory::pool_t& pool, size_t num_bytes)
{
    return memory::pool::allocate(pool, associated_stream, num_bytes);
}

inline memory::pool_t device_t::default_memory_pool() const
{
    memory::pool::handle_t handle;
    auto status = cuDeviceGetDefaultMemPool(&handle, id_);
    throw_if_error_lazy(status, "Failed obtaining the default memory pool for " + device::detail_::identify(id_));
    return memory::pool::wrap(id_, handle, do_not_take_ownership);
}

#endif // CUDA_VERSION >= 11020

} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_MEMORY_HPP_