8 #ifndef MULTI_WRAPPER_IMPLS_MEMORY_HPP_     9 #define MULTI_WRAPPER_IMPLS_MEMORY_HPP_    14 #include <cuda_runtime_api.h>    16 #include "../memory.hpp"    17 #include "../array.hpp"    18 #include "../device.hpp"    19 #include "../event.hpp"    20 #include "../pointer.hpp"    21 #include "../stream.hpp"    22 #include "../primary_context.hpp"    23 #include "../kernel.hpp"    24 #include "../virtual_memory.hpp"    25 #include "../memory_pool.hpp"    26 #include "../current_device.hpp"    32 template <
typename T, dimensionality_t NumDimensions>
    33 inline void copy(array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream)
    36         memory::copy<T, NumDimensions>(destination, source);
    40     if (source.size() != destination.size()) {
    41         throw ::std::invalid_argument(
    42             "Attempt to copy " + ::std::to_string(source.size()) +
    43             " elements into an array of " + ::std::to_string(destination.size()) + 
" elements");
    46     detail_::copy<T, NumDimensions>(destination, source.data(), stream->handle());
    50 template <
typename T, dimensionality_t NumDimensions>
    57     if (stream->context_handle() != source.context_handle()) {
    58         throw ::std::invalid_argument(
"Attempt to copy an array in"    59                                       + context::detail_::identify(source.context_handle()) + 
" via "    60                                       + stream::detail_::identify(*stream));
    62     detail_::copy<T, NumDimensions>(destination, source, stream->handle());
    65 template<dimensionality_t NumDimensions>
    69     status_t status = detail_::multidim_copy(params, stream_handle);
    75 void copy_single(T* destination, 
const T* source, optional_ref<const stream_t> stream)
    81 template <
typename T, dimensionality_t NumDimensions>
    88     detail_::copy<T, NumDimensions>(destination, source, stream->handle());
    91 inline void copy(
void *destination, 
const void *source, 
size_t num_bytes, optional_ref<const stream_t> stream)
    94         context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
   101     detail_::copy(destination, source, num_bytes, stream->handle());
   117 #if CUDA_VERSION >= 11020   118 inline region_t allocate(
size_t size_in_bytes, optional_ref<const stream_t> stream = {})
   122         detail_::allocate_in_current_context(size_in_bytes);
   125 #endif // CUDA_VERSION >= 11020   127 #if CUDA_VERSION >= 11020   128 inline void free(
void* region_start, optional_ref<const stream_t> stream)
   130 inline void free(
void* region_start)
   131 #endif // CUDA_VERSION >= 11020   133 #if CUDA_VERSION >= 11020   135         detail_::free_on_stream(region_start, stream->handle());
   139     context::current::detail_::scoped_existence_ensurer_t ensurer;
   140     detail_::free_in_current_context(ensurer.context_handle,region_start);
   145 namespace inter_context {
   153     optional_ref<const stream_t> stream = {})
   155     auto status = stream ?
   158           destination_context.handle(),
   160           source_context.handle(),
   164           destination_context.handle(),
   166           source_context.handle(),
   173         ::std::string(
"Failed copying data between devices: From address ")
   174         + cuda::detail_::ptr_as_hex(source) + 
" in "   175         + context::detail_::identify(source_context.handle()) + 
" to address "   176         + cuda::detail_::ptr_as_hex(destination) + 
" in "   177         + context::detail_::identify(destination_context.handle()) +
   178         (stream ? 
" on " + stream::detail_::identify(*stream) : 
""));
   187 template <
typename GenericRegion>
   188 inline device_t region_helper<GenericRegion>::preferred_location()
 const   190     auto device_id = range::detail_::get_scalar_attribute<bool>(*
this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION);
   194 template <
typename GenericRegion>
   195 inline void region_helper<GenericRegion>::set_preferred_location(
device_t& device)
 const   197     range::detail_::set_attribute(*
this,CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION, device.
id());
   200 template <
typename GenericRange>
   201 inline void region_helper<GenericRange>::clear_preferred_location()
 const   203     range::detail_::unset_attribute(*
this, CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION);
   210     range::detail_::advise(region, CU_MEM_ADVISE_SET_ACCESSED_BY, device.
id());
   215     range::detail_::advise(region, CU_MEM_ADVISE_UNSET_ACCESSED_BY, device.
id());
   218 template <
typename Allocator>
   222     ::std::vector<device_t, Allocator> 
devices(num_devices, allocator);
   225     auto status = cuMemRangeGetAttribute(
   226     device_ids, 
sizeof(
device_t) * devices.size(),
   227     CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY, 
device::address(region.start()), region.size());
   228     throw_if_error_lazy(status, 
"Obtaining the IDs of devices with access to the managed memory range at "   229                            + cuda::detail_::ptr_as_hex(region.start()));
   230     auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId);
   233     if (first_invalid_element - device_ids != num_devices) {
   234         devices.resize(first_invalid_element - device_ids);
   267     return allocate(pc, num_bytes, initial_visibility);
   272     auto context_handle = context::current::detail_::get_with_fallback_push();
   273     return allocate(context_handle, num_bytes, initial_visibility_t::to_all_devices);
   282     size_t              size_in_bytes,
   292     size_t              size_in_bytes,
   310 inline region_t allocate_in_current_context(
   311     size_t              size_in_bytes,
   314     void* allocated = 
nullptr;
   315     auto flags = memory::detail_::make_cuda_host_alloc_flags(options);
   316     auto result = cuMemHostAlloc(&allocated, size_in_bytes, flags);
   317     if (
is_success(result) && allocated == 
nullptr) {
   319         result = 
static_cast<status_t>(status::named_t::unknown);
   321     throw_if_error_lazy(result, 
"Failed allocating " + ::std::to_string(size_in_bytes) + 
" bytes of host memory");
   322     return { allocated, size_in_bytes };
   327     size_t                   size_in_bytes,
   330     CAW_SET_SCOPE_CONTEXT(context_handle);
   331     return allocate_in_current_context(size_in_bytes, options);
   345     size_t              size_in_bytes,
   348     static constexpr 
const bool dont_decrease_pc_refcount_on_destruct { 
false };
   349     context::current::detail_::scoped_existence_ensurer_t context_ensurer{ dont_decrease_pc_refcount_on_destruct };
   353     return detail_::allocate_in_current_context(size_in_bytes, options);
   361 template<attribute_t attribute>
   362 status_and_attribute_value<attribute> get_attribute_with_status(
const void *ptr)
   364     context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context;
   365     attribute_value_t <attribute> attribute_value;
   366     auto status = cuPointerGetAttribute(&attribute_value, attribute, 
device::address(ptr));
   367     return { status, attribute_value };
   371 template<attribute_t attribute>
   372 attribute_value_t<attribute> get_attribute(
const void *ptr)
   374     auto status_and_attribute_value = get_attribute_with_status<attribute>(ptr);
   376         "Obtaining attribute " + ::std::to_string(static_cast<int>(attribute))
   377         + 
" for pointer " + cuda::detail_::ptr_as_hex(ptr) );
   378     return status_and_attribute_value.value;
   382 inline void get_attributes(
unsigned num_attributes, 
pointer::attribute_t* attributes, 
void** value_ptrs, 
const void* ptr)
   384     context::current::detail_::scoped_existence_ensurer_t ensure_we_have_some_context;
   385     auto status = cuPointerGetAttributes( num_attributes, attributes, value_ptrs, 
device::address(ptr) );
   386     throw_if_error_lazy(status, 
"Obtaining multiple attributes for pointer " + cuda::detail_::ptr_as_hex(ptr));
   394 template <
typename T>
   395 inline void typed_set(T* 
start, 
const T& value, 
size_t num_elements, optional_ref<const stream_t> stream)
   398         detail_::set(start, value, num_elements, stream->handle());
   400     context::current::detail_::scoped_existence_ensurer_t ensure_some_context{};
   401     static_assert(::std::is_trivially_copyable<T>::value, 
"Non-trivially-copyable types cannot be used for setting memory");
   402     static_assert(
sizeof(T) == 1 or 
sizeof(T) == 2 or 
sizeof(T) == 4,
   403         "Unsupported type size - only sizes 1, 2 and 4 are supported");
   407     case 1: result = stream ?
   408         cuMemsetD8Async (
address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream->handle()) :
   409         cuMemsetD8      (
address(start), 
reinterpret_cast<const ::std::uint8_t& 
>(value), num_elements); 
break;
   410     case 2: result = stream ?
   411         cuMemsetD16Async(
address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream->handle()) :
   412         cuMemsetD16     (
address(start), 
reinterpret_cast<const ::std::uint16_t&
>(value), num_elements); 
break;
   413     case 4: result = stream ?
   414         cuMemsetD32Async(
address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream->handle()) :
   415         cuMemsetD32     (
address(start), 
reinterpret_cast<const ::std::uint32_t&
>(value), num_elements); 
break;
   422 inline void set(
void* ptr, 
int byte_value, 
size_t num_bytes, optional_ref<const stream_t> stream)
   432             throw ::std::invalid_argument(
"Asynchronous host-memory set's not currently supported");
   433         } 
else { ::std::memset(ptr, byte_value, num_bytes); }
   437             cuda::status::invalid_value,
   438             "CUDA returned an invalid memory type for the pointer 0x" + cuda::detail_::ptr_as_hex(ptr));
   442 #if CUDA_VERSION >= 11020   445 template<shared_handle_kind_t SharedHandleKind>
   448     return detail_::create<SharedHandleKind>(device.
id());
   455     auto status = cuMemAllocFromPoolAsync(&dptr, num_bytes, pool.handle(), stream.
handle());
   456     throw_if_error_lazy(status, 
"Failed scheduling an allocation of " + ::std::to_string(num_bytes)
   457         + 
" bytes of memory from " + detail_::identify(pool) + 
", on " + stream::detail_::identify(stream));
   463 template <shared_handle_kind_t Kind>
   464 shared_handle_t<Kind> export_(
const pool_t& pool)
   466     shared_handle_t<Kind> result;
   467     static constexpr 
const unsigned long long flags { 0 };
   468     auto status = cuMemPoolExportToShareableHandle(&result, pool.handle(), 
static_cast<CUmemAllocationHandleType
>(Kind), flags);
   469     throw_if_error_lazy(status, 
"Exporting " + pool::detail_::identify(pool) +
" for inter-process use");
   473 template <shared_handle_kind_t Kind>
   474 pool_t 
import(
const device_t& device, 
const shared_handle_t<Kind>& shared_pool_handle)
   476     auto handle = detail_::import<Kind>(shared_pool_handle);
   496 inline pool::ipc::imported_ptr_t pool_t::import(
const memory::pool::ipc::ptr_handle_t& exported_handle)
 const   498     return pool::ipc::import_ptr(*
this, exported_handle);
   501 inline permissions_t get_permissions(
const cuda::device_t& device, 
const pool_t& pool)
   503     return cuda::memory::detail_::get_permissions(device.
id(), pool.handle());
   506 inline void set_permissions(
const cuda::device_t& device, 
const pool_t& pool, permissions_t permissions)
   508     if (pool.device_id() == device.
id()) {
   509         throw ::std::invalid_argument(
"Cannot change the access get_permissions to a pool of the device "   510             "on which the pool's memory is allocated (" + cuda::device::detail_::identify(device.
id()) + 
')');
   512     cuda::memory::detail_::set_permissions(device.
id(), pool.handle(), permissions);
   515 template <
typename DeviceRange>
   516 void set_permissions(DeviceRange 
devices, 
const pool_t& pool, permissions_t permissions)
   519     auto device_ids = ::std::unique_ptr<cuda::device::id_t[]>(
new cuda::device::id_t[devices.size()]);
   520     auto device_to_id = [](
device_t const& device){ 
return device.
id(); };
   521     ::std::transform(::std::begin(devices), ::std::end(devices), device_ids.get(), device_to_id);
   522     cuda::memory::detail_::set_permissions( { device_ids.get(), devices.size() }, pool.handle(), permissions);
   524 #endif // #if CUDA_VERSION >= 11020   528 #if CUDA_VERSION >= 11020   530 template <memory::pool::shared_handle_kind_t Kind>
   531 memory::pool_t device_t::create_memory_pool()
 const   533     return cuda::memory::pool::detail_::create<Kind>(id_);
   541 inline memory::pool_t device_t::default_memory_pool()
 const   544     auto status = cuDeviceGetDefaultMemPool(&handle, id_);
   545     throw_if_error_lazy(status, 
"Failed obtaining the default memory pool for " + device::detail_::identify(id_));
   549 #endif //  CUDA_VERSION >= 11020   552 #endif // MULTI_WRAPPER_IMPLS_MEMORY_HPP_ Proxy class for a CUDA stream. 
Definition: stream.hpp:246
 
void prefetch_to_host(const_region_t region, const stream_t &stream)
Prefetches a region of managed memory into host memory. 
Definition: memory.hpp:248
 
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps. 
Definition: stream.hpp:257
 
Wrapper class for a CUDA context. 
Definition: context.hpp:244
 
Definitions and functionality wrapping CUDA APIs. 
Definition: array.hpp:22
 
device::id_t count()
Get the number of CUDA devices usable on the system (with the current CUDA library and kernel driver)...
Definition: miscellany.hpp:63
 
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality. 
Definition: memory.hpp:1960
 
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}. 
Definition: types.hpp:878
 
region_t allocate(const context_t &context, size_t size_in_bytes)
Allocate device-side memory on a CUDA device context. 
Definition: memory.hpp:106
 
Owning wrapper for CUDA 2D and 3D arrays. 
Definition: array.hpp:29
 
void typed_set(T *start, const T &value, size_t num_elements, optional_ref< const stream_t > stream={})
Sets consecutive elements of a region of memory to a fixed value of some width. 
Definition: memory.hpp:395
 
Implementations of inter-process-communication-related functions and classes requiring the defini...
 
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API. 
Definition: types.hpp:850
 
void advise_expected_access_by(const_region_t region, device_t &device)
Advise the CUDA driver that device is expected to access region. 
Definition: memory.hpp:208
 
device::id_t id() const noexcept
Return the proxied device's ID. 
Definition: device.hpp:594
 
memory::type_t type_of(const void *ptr)
Determine the type of memory at a given address vis-a-vis the CUDA ecosystem: Was it allocated by the...
Definition: pointer.hpp:112
 
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated) 
Definition: memory.hpp:130
 
context_t context_of(const void *ptr)
Obtain (a non-owning wrapper for) the CUDA context with which a memory address is associated (e...
Definition: pointer.hpp:50
 
void start()
Start CUDA profiling for the current process. 
Definition: profiling.hpp:229
 
CUpointer_attribute attribute_t
Raw CUDA driver choice type for attributes of pointers. 
Definition: types.hpp:662
 
Implementations requiring the definitions of multiple CUDA entity proxy classes, and which regard con...
 
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements. 
Definition: memory.hpp:625
 
::std::vector< device_t, Allocator > expected_accessors(const_region_t region, const Allocator &allocator)
Definition: memory.hpp:219
 
Options accepted by CUDA's allocator of memory with a host-side aspect (host-only or managed memory)...
Definition: memory.hpp:91
 
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
 
A pair of memory regions, one in system (=host) memory and one on a CUDA device's memory - mapped to ...
Definition: memory.hpp:158
 
device::primary_context_t primary_context(bool hold_pc_refcount_unit=false) const
Produce a proxy for the device's primary context - the one used by runtime API calls. 
Definition: device.hpp:152
 
void set(void *start, int byte_value, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to a fixed value. 
Definition: memory.hpp:385
 
void set(region_t region, int byte_value)
Definition: memory.hpp:1822
 
device_t get(id_t id)
Returns a proxy for the CUDA device with a given id. 
Definition: device.hpp:837
 
region_pair_t allocate(cuda::device_t &device, size_t size_in_bytes, allocation_options options=allocation_options{})
Allocate a memory region on the host, which is also mapped to a memory region in the global memory of...
Definition: memory.hpp:280
 
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
 
CUarray handle_t
Raw CUDA driver handle for arrays (of any dimension) 
Definition: array.hpp:34
 
array_t< T, NumDimensions > wrap(device::id_t device_id, context::handle_t context_handle, handle_t handle, dimensions_t< NumDimensions > dimensions) noexcept
Wrap an existing CUDA array in an array_t instance. 
Definition: array.hpp:264
 
A builder-ish subclass template around the basic 2D or 3D copy parameters which CUDA's complex copyin...
Definition: copy_parameters.hpp:68
 
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality. 
Definition: memory.hpp:1962
 
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:682
 
CUstream handle_t
The CUDA driver's raw handle for streams. 
Definition: types.hpp:239
 
void * as_pointer(device::address_t address) noexcept
Definition: types.hpp:700
 
void advise_no_access_expected_by(const_region_t region, device_t &device)
Advise the CUDA driver that device is not expected to access region. 
Definition: memory.hpp:213
 
device_t wrap(id_t id) NOEXCEPT_IF_NDEBUG
Returns a wrapper for the CUDA device with a given id. 
Definition: device.hpp:825
 
void copy_single(T *destination, const T *source, optional_ref< const stream_t > stream={})
Synchronously copies a single (typed) value between two memory locations. 
Definition: memory.hpp:75
 
void prefetch(const_region_t region, const cuda::device_t &destination, const stream_t &stream)
Prefetches a region of managed memory to a specific device, so it can later be used there without wai...
Definition: memory.hpp:240
 
detail_::all_devices devices()
Definition: devices.hpp:224
 
Wrapper class for a CUDA device. 
Definition: device.hpp:135
 
initial_visibility_t
The choices of which categories of CUDA devices a managed memory region must be visible to...
Definition: types.hpp:753
 
constexpr bool is_success(status_t status)
Determine whether the API call returning the specified status had succeeded. 
Definition: error.hpp:203
 
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77