#ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
#define CUDA_API_WRAPPERS_MEMORY_HPP_

#include <cuda_runtime.h>

// ...

inline unsigned make_cuda_host_alloc_flags(allocation_options options)
{
    return
        (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
        (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
}
// ...

    // (the mapped host-and-device region pair records its common size)
    size_t size_in_bytes;
inline region_t allocate_in_current_context(size_t num_bytes)
{
    device::address_t allocated = 0;
    auto status = cuMemAlloc(&allocated, num_bytes);
    if (is_success(status) && allocated == 0) {
        // Defend against a successful status paired with a null result
        status = static_cast<status_t>(status::unknown);
    }
    throw_if_error_lazy(status,
        "Failed allocating " + ::std::to_string(num_bytes) +
        " bytes of global memory on the current CUDA device");
    return {as_pointer(allocated), num_bytes};
}

inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(size_in_bytes);
}
#if CUDA_VERSION >= 11020

inline region_t allocate(
    context::handle_t context_handle,
    stream::handle_t stream_handle,
    size_t num_bytes)
{
    device::address_t allocated = 0;
    auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
    if (is_success(status) && allocated == 0) {
        // Defend against a successful status paired with a null result
        status = static_cast<decltype(status)>(status::unknown);
    }
    throw_if_error_lazy(status,
        "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
        " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
    return {as_pointer(allocated), num_bytes};
}
inline void free(void* ptr)
{
    auto result = cuMemFree(address(ptr));
#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
    if (result == status::success) { return; }
#else
    if (result == status::success or result == status::context_is_destroyed) { return; }
#endif
    throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
}
#if CUDA_VERSION >= 11020
inline void free(
    context::handle_t context_handle,
    stream::handle_t stream_handle,
    void* allocated_region_start)
{
    auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
    throw_if_error_lazy(status,
        "Failed scheduling an asynchronous freeing of the global memory region starting at "
        + cuda::detail_::ptr_as_hex(allocated_region_start)
        + " on " + stream::detail_::identify(stream_handle, context_handle) );
}
#endif // CUDA_VERSION >= 11020

// ...

inline void free(const stream_t& stream, region_t region)
{
    free(stream, region.data());
}
inline region_t allocate(const device_t& device, size_t size_in_bytes);

// ...

    // (function-object allocating device memory in the current context)
    void* operator()(size_t num_bytes) const
    {
        return detail_::allocate_in_current_context(num_bytes).start();
    }
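// Usage sketch (illustrative, not part of the original header): allocating and freeing
// global device memory through the wrappers above. Assumes a cuda::device_t named `dev`
// obtained elsewhere, e.g. via cuda::device::current::get().
//
//     auto region = cuda::memory::device::allocate(dev, 4096);   // region_t: start() + size()
//     // ... use region.start() as a device pointer in kernels or copies ...
//     cuda::memory::device::free(region.start());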
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements);

inline void set(void* start, int byte_value, size_t num_bytes)
{
    return typed_set<unsigned char>(
        static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
}

inline void set(region_t region, int byte_value)
{
    set(region.start(), byte_value, region.size());
}

inline void zero(void* start, size_t num_bytes)
{
    set(start, 0, num_bytes);
}

inline void zero(region_t region)
{
    zero(region.start(), region.size());
}

template <typename T>
inline void zero(T* ptr)
{
    zero(ptr, sizeof(T));
}
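// Usage sketch (illustrative): filling device memory. set()/zero() act on raw byte counts
// or whole regions; typed_set() writes a repeated value of a width supported by the CUDA
// memset functions. Assumes these overloads live in cuda::memory::device, as the
// surrounding code suggests, and that `region` is a previously-allocated device region.
//
//     cuda::memory::device::zero(region);                          // all bytes to 0
//     cuda::memory::device::set(region, 0xFF);                     // all bytes to 0xFF
//     auto p = static_cast<float*>(region.start());
//     cuda::memory::device::typed_set(p, 1.0f, region.size() / sizeof(float));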
void copy(void *destination, const void *source, size_t num_bytes);

inline void copy(void* destination, const_region_t source)
{
    return copy(destination, source.start(), source.size());
}

inline void copy(region_t destination, const_region_t source)
{
    if (destination.size() < source.size()) {
        throw ::std::logic_error("Can't copy a large region into a smaller one");
    }
    return copy(destination.start(), source);
}

template <typename T, size_t N>
inline void copy(region_t destination, const T(&source)[N])
{
    if (destination.size() < sizeof(T) * N) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    return copy(destination.start(), source, sizeof(T) * N);
}

template <typename T, size_t N>
inline void copy(T(&destination)[N], const_region_t source)
{
    size_t required_size = N * sizeof(T);
    if (source.size() != required_size) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
    }
    return copy(destination, source.start(), sizeof(T) * N);
}

template <typename T, size_t N>
inline void copy(void* destination, T (&source)[N])
{
    return copy(destination, source, sizeof(T) * N);
}

template <typename T, size_t N>
inline void copy(T(&destination)[N], T* source)
{
    return copy(destination, source, sizeof(T) * N);
}

inline void copy(region_t destination, void* source, size_t num_bytes)
{
    if (destination.size() < num_bytes) {
        throw ::std::logic_error("Number of bytes to copy exceeds destination size");
    }
    return copy(destination.start(), source, num_bytes);
}

inline void copy(region_t destination, void* source)
{
    return copy(destination, source, destination.size());
}
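// Usage sketch (illustrative): the copy() overloads above accept raw pointers,
// region_t/const_region_t, and C arrays, always in (destination, source) order.
// Assumes `dev_region` is a device-memory region of at least sizeof(host_data) bytes.
//
//     float host_data[256];
//     cuda::memory::copy(dev_region, host_data);                               // host array -> device region
//     float host_copy[256];
//     cuda::memory::copy(host_copy, static_cast<float*>(dev_region.start()));  // device -> host array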
void set(void* ptr, int byte_value, size_t num_bytes);

inline void set(region_t region, int byte_value)
{
    return set(region.start(), byte_value, region.size());
}

inline void zero(region_t region)
{
    return set(region, 0);
}

inline void zero(void* ptr, size_t num_bytes)
{
    return set(ptr, 0, num_bytes);
}

template <typename T>
inline void zero(T* ptr)
{
    zero(ptr, sizeof(T));
}
inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params)
{
    auto context_handle = context::current::detail_::get_handle();
    if (context_handle != context::detail_::none) {
        return cuMemcpy2D(&params);
    }
    // No current context; fall back on the current device's primary context
    auto current_device_id = cuda::device::current::detail_::get_id();
    context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
    context::current::detail_::push(context_handle);
    auto status = cuMemcpy2D(&params);
    context::current::detail_::pop();
    cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
    return status;
}

inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params)
{
    if (params.srcContext == params.dstContext) {
        context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
        auto *intra_context_params = reinterpret_cast<base_copy_params<3>::intra_context_type *>(&params);
        return cuMemcpy3D(intra_context_params);
    }
    return cuMemcpy3DPeer(&params);
}

template<dimensionality_t NumDimensions>
status_t multidim_copy(copy_parameters_t<NumDimensions> params)
{
    return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params);
}
template<dimensionality_t NumDimensions>
void copy(copy_parameters_t<NumDimensions> params)
{
    status_t status = detail_::multidim_copy(params);
    throw_if_error_lazy(status, "Copying using a general copy parameters structure");
}
template<typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const context_t& source_context, const T* source)
{
    auto dims = destination.dimensions();
    copy_parameters_t<NumDimensions> params{};
    params.clear_offsets();
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
    params.set_endpoint(endpoint_t::destination, destination);
    // ...
}

template<typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const T* source)
{
    copy(destination, context_of(source), source);
}
template <typename T, dimensionality_t NumDimensions>
void copy(const context_t& context, T* destination, const array_t<T, NumDimensions>& source)
{
    auto dims = source.dimensions();
    copy_parameters_t<NumDimensions> params{};
    params.clear_offset(endpoint_t::source);
    params.clear_offset(endpoint_t::destination);
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, source);
    params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
    params.set_default_pitches();
    // ...
}

template <typename T, dimensionality_t NumDimensions>
void copy(T* destination, const array_t<T, NumDimensions>& source)
{
    copy(context_of(destination), destination, source);
}
template <typename T, dimensionality_t NumDimensions>
void copy(void* destination, const array_t<T, NumDimensions>& source)
{
    auto dims = source.dimensions();
    copy_parameters_t<NumDimensions> params{};
    params.clear_offset(endpoint_t::source);
    params.clear_offset(endpoint_t::destination);
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, source);
    params.set_endpoint(endpoint_t::destination, destination);
    params.set_default_pitches();
    params.clear_rest();
    auto status = detail_::multidim_copy<NumDimensions>(source.context_handle(), params);
    throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region");
}
template <typename T, dimensionality_t NumDimensions>
void copy(region_t destination, const array_t<T, NumDimensions>& source)
{
    if (destination.size() < source.size_bytes()) {
        throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy");
    }
    copy(destination.start(), source);
}

template <typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const_region_t source)
{
    if (destination.size_bytes() < source.size()) {
        throw ::std::logic_error("Attempt to copy into an array from a source region larger than the array's size");
    }
    copy(destination, source.start());
}
template <typename T>
void copy_single(T* destination, const T* source)
{
    copy(destination, source, sizeof(T));
}
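// Usage sketch (illustrative): copy_single() moves exactly one typed value; the direction
// (host-to-device, device-to-host, ...) is inferred by the driver from the two pointers.
// Assumes `d_value` is an int* pointing into device memory.
//
//     int h_value = 42;
//     cuda::memory::copy_single(d_value, &h_value);   // host -> device
//     cuda::memory::copy_single(&h_value, d_value);   // device -> host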
inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
{
    auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);
    throw_if_error_lazy(result,
        "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
}

inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
{
    if (destination.size() < source.size()) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    copy(destination.start(), source.start(), source.size(), stream_handle);
}
inline status_t multidim_copy_in_current_context(
    ::std::integral_constant<dimensionality_t, 2>,
    copy_parameters_t<2> params,
    stream::handle_t stream_handle)
{
    return cuMemcpy2DAsync(&params, stream_handle);
}

inline status_t multidim_copy_in_current_context(
    ::std::integral_constant<dimensionality_t, 3>,
    copy_parameters_t<3> params,
    stream::handle_t stream_handle)
{
    if (params.srcContext == params.dstContext) {
        using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
        auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
        return cuMemcpy3DAsync(intra_context_params, stream_handle);
    }
    return cuMemcpy3DPeerAsync(&params, stream_handle);
}

template<dimensionality_t NumDimensions>
status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle)
{
    return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}

template<dimensionality_t NumDimensions>
status_t multidim_copy(
    context::handle_t context_handle,
    copy_parameters_t<NumDimensions> params,
    stream::handle_t stream_handle)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
template <typename T, dimensionality_t NumDimensions>
void copy(T* destination, const array_t<T, NumDimensions>& source, stream::handle_t stream_handle)
{
    using memory::endpoint_t;
    auto dims = source.dimensions();
    copy_parameters_t<NumDimensions> params{};
    params.clear_offset(endpoint_t::source);
    params.clear_offset(endpoint_t::destination);
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, source);
    params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
    params.set_default_pitches();
    // ...
    auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
    throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
}

template <typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const T* source, stream::handle_t stream_handle)
{
    using memory::endpoint_t;
    auto dims = destination.dimensions();
    copy_parameters_t<NumDimensions> params{};
    params.clear_offset(endpoint_t::source);
    params.clear_offset(endpoint_t::destination);
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
    params.set_endpoint(endpoint_t::destination, destination);
    params.set_default_pitches();
    // ...
    auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
    throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
}
template <typename T>
void copy_single(T& destination, const T& source, stream::handle_t stream_handle)
{
    copy(&destination, &source, sizeof(T), stream_handle);
}
void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream);

inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream)
{
    if (source.size() < num_bytes) {
        throw ::std::logic_error("Attempt to copy more than the source region's size");
    }
    copy(destination, source.start(), num_bytes, stream);
}

inline void copy(region_t destination, const_region_t source, size_t num_bytes, const stream_t& stream)
{
    if (destination.size() < num_bytes) {
        throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
    }
    copy(destination.start(), source.start(), num_bytes, stream);
}

inline void copy(region_t destination, const_region_t source, const stream_t& stream)
{
    copy(destination, source, source.size(), stream);
}

inline void copy(void* destination, const_region_t source, const stream_t& stream)
{
    copy(destination, source, source.size(), stream);
}

inline void copy(region_t destination, void* source, const stream_t& stream)
{
    return copy(destination.start(), source, destination.size(), stream);
}
template <typename T, size_t N>
inline void copy(region_t destination, const T(&source)[N], const stream_t& stream)
{
    if (destination.size() < sizeof(T) * N) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    return copy(destination.start(), source, sizeof(T) * N, stream);
}

inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream)
{
    if (destination.size() < num_bytes) {
        throw ::std::logic_error("Number of bytes to copy exceeds destination size");
    }
    return copy(destination.start(), source, num_bytes, stream);
}
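// Usage sketch (illustrative): the asynchronous copy() overloads take a stream_t last and
// only enqueue the work. Assumes `stream` is a cuda::stream_t, `dev_region` is large
// enough for the host buffer, and the host buffer is pinned for truly asynchronous copies.
//
//     float staging[1024];
//     cuda::memory::async::copy(dev_region, staging, sizeof(staging), stream);
//     stream.synchronize();   // wait before reusing `staging`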
template <typename T, dimensionality_t NumDimensions>
// ... (declaration elided)

template <typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const_region_t source, const stream_t& stream)
{
    size_t required_size = destination.size() * sizeof(T);
    if (source.size() != required_size) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
    }
    copy(destination, source.start(), stream);
}

template <typename T, dimensionality_t NumDimensions>
// ... (declaration elided)

template <typename T, dimensionality_t NumDimensions>
void copy(region_t destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
{
    size_t required_size = source.size() * sizeof(T);
    if (destination.size() < required_size) {
        throw ::std::invalid_argument(
            "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a "
            "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
    }
    copy(destination.start(), source, stream);
}
template <typename T, size_t N>
inline void copy(T(&destination)[N], T* source, const stream_t& stream)
{
    return copy(destination, source, sizeof(T) * N, stream);
}

template <typename T, size_t N>
inline void copy(T(&destination)[N], const_region_t source, const stream_t& stream)
{
    size_t required_size = N * sizeof(T);
    if (source.size() != required_size) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
    }
    return copy(destination, source.start(), sizeof(T) * N, stream);
}
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle);

inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
{
    auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
    throw_if_error_lazy(result, "Asynchronously memsetting an on-device buffer");
}
inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
{
    set(region.start(), byte_value, region.size(), stream_handle);
}

inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
{
    set(start, 0, num_bytes, stream_handle);
}

inline void zero(region_t region, stream::handle_t stream_handle)
{
    zero(region.start(), region.size(), stream_handle);
}
template <typename T>
inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
{
    static_assert(::std::is_trivially_copyable<T>::value,
        "Non-trivially-copyable types cannot be used for setting memory");
    static_assert(
        sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
        "Unsupported type size - only sizes 1, 2 and 4 are supported");
    status_t result {};
    switch (sizeof(T)) {
    case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
    case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
    case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
    }
    throw_if_error_lazy(result, "Setting global device memory bytes");
}
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream);

inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream)
{
    return typed_set<unsigned char>(
        static_cast<unsigned char*>(start),
        static_cast<unsigned char>(byte_value),
        num_bytes,
        stream);
}
void zero(void* start, size_t num_bytes, const stream_t& stream);

template <typename T>
inline void zero(T* ptr, const stream_t& stream)
{
    zero(ptr, sizeof(T), stream);
}
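// Usage sketch (illustrative): the asynchronous set()/zero()/typed_set() overloads in this
// namespace mirror the synchronous ones, but take a stream_t and only schedule the work.
// Assumes `region` refers to device memory and `stream` is a cuda::stream_t with a
// synchronize() method.
//
//     zero(region.start(), region.size(), stream);        // schedule zeroing
//     set(region.start(), 0xAB, region.size(), stream);   // schedule a memset to 0xAB
//     stream.synchronize();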
namespace inter_context {

// ...

inline void copy(
    void * destination_address,
    context::handle_t destination_context,
    const void * source_address,
    context::handle_t source_context,
    size_t num_bytes)
{
    auto status = cuMemcpyPeer(
        reinterpret_cast<device::address_t>(destination_address),
        destination_context,
        reinterpret_cast<device::address_t>(source_address),
        source_context, num_bytes);
    throw_if_error_lazy(status,
        ::std::string("Failed copying data between devices: From address ")
        + cuda::detail_::ptr_as_hex(source_address) + " in "
        + context::detail_::identify(source_context) + " to address "
        + cuda::detail_::ptr_as_hex(destination_address) + " in "
        + context::detail_::identify(destination_context) );
}
// ...
    const void * source_address,
// ...

// (pointer-destination overload taking a region source - signature elided)
{
    copy(destination, destination_context, source.start(), source_context, source.size());
}

// (region-to-region overload - signature elided)
{
    if (destination.size() < source.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
    }
    copy(destination.start(), destination_context, source, source_context);
}
template <typename T, dimensionality_t NumDimensions>
// (inter-context array-to-array copy - signature elided)
{
    // ...
    return memory::copy(destination, source);
}
inline void copy(
    void * destination_address,
    context::handle_t destination_context_handle,
    const void * source_address,
    context::handle_t source_context_handle,
    size_t num_bytes,
    stream::handle_t stream_handle)
{
    auto result = cuMemcpyPeerAsync(
        device::address(destination_address),
        destination_context_handle,
        device::address(source_address),
        source_context_handle,
        num_bytes, stream_handle);
    throw_if_error_lazy(result,
        "Scheduling an inter-context memory copy from "
        + context::detail_::identify(source_context_handle) + " to "
        + context::detail_::identify(destination_context_handle) + " on "
        + stream::detail_::identify(stream_handle));
}
inline void copy(
    region_t destination,
    context::handle_t destination_context_handle,
    const_region_t source,
    context::handle_t source_context_handle,
    stream::handle_t stream_handle)
{
    if (destination.size() < source.size()) {
        throw ::std::logic_error("Can't copy a large region into a smaller one");
    }
    copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(),
        stream_handle);
}
// ...
    void * destination_address,
// ...
    const void * source_address,
// ...

template <typename T, dimensionality_t NumDimensions>
// (asynchronous inter-context array-to-array copy - signature elided)
{
    // ...
    return memory::async::copy(destination, source, stream);
}
// ...
    size_t size_in_bytes,
// ...
    size_t size_in_bytes,
// ...

inline void free(void* host_ptr)
{
    auto result = cuMemFreeHost(host_ptr);
#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
    if (result == status::success) { return; }
#else
    if (result == status::success or result == status::context_is_destroyed) { return; }
#endif
    throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
}

inline void free(region_t region) { return free(region.data()); }
    // (host-memory allocator function-object)
    void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes).data(); }

// ...

    // (host-memory deleter function-object)
    void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
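// Usage sketch (illustrative): allocating page-locked ("pinned") host memory and releasing
// it again, with default allocation options.
//
//     auto pinned = cuda::memory::host::allocate(1 << 20);   // 1 MiB; returns a region
//     // ... fill pinned.data(), use it as the host side of asynchronous copies ...
//     cuda::memory::host::free(pinned);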
inline void register_(const void *ptr, size_t size, unsigned flags)
{
    auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
    throw_if_error_lazy(result,
        "Could not register and page-lock the region of " + ::std::to_string(size) +
        " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
        " with flags " + cuda::detail_::as_hex(flags));
}

inline void register_(const_region_t region, unsigned flags)
{
    register_(region.start(), region.size(), flags);
}
// enum mapped_io_space:
    is_mapped_io_space = true,
    is_not_mapped_io_space = false

// ...

// enum map_into_device_memory (partially elided):
    do_not_map_into_device_memory = false
false 1500 inline void register_(
const void *ptr,
size_t size,
1501 bool register_mapped_io_space,
1502 bool map_into_device_space,
1503 bool make_device_side_accesible_to_all
1504 #
if CUDA_VERSION >= 11010
1505 ,
bool considered_read_only_by_device
1511 (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1512 | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1513 | (make_device_side_accesible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1514 #
if CUDA_VERSION >= 11010
1515 | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
inline void register_(
    const_region_t region,
    bool register_mapped_io_space,
    bool map_into_device_space,
    bool make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
    , bool considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
    )
{
    register_(
        region.start(), region.size(),
        register_mapped_io_space,
        map_into_device_space,
        make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
        , considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
    );
}

inline void register_(void const *ptr, size_t size)
{
    unsigned no_flags_set { 0 };
    detail_::register_(ptr, size, no_flags_set);
}

inline void register_(const_region_t region)
{
    register_(region.start(), region.size());
}
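// Usage sketch (illustrative): page-locking memory that was allocated by ordinary means
// (new/malloc) so the driver can DMA to and from it directly. Assumes `buf` and `size`
// describe such a host allocation.
//
//     cuda::memory::host::register_(buf, size);    // pin, with no special flags
//     // ... buf may now be used like pinned memory ...
//     cuda::memory::host::deregister(buf);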
inline void deregister(const void *ptr)
{
    auto result = cuMemHostUnregister(const_cast<void *>(ptr));
    throw_if_error_lazy(result,
        "Could not unregister the memory segment starting at address "
        + cuda::detail_::ptr_as_hex(ptr));
}

inline void deregister(const_region_t region)
{
    deregister(region.start());
}
inline void set(void* start, int byte_value, size_t num_bytes)
{
    ::std::memset(start, byte_value, num_bytes);
}

inline void zero(void* start, size_t num_bytes)
{
    set(start, 0, num_bytes);
}

template <typename T>
inline void zero(T* ptr)
{
    zero(ptr, sizeof(T));
}
using advice_t = CUmem_advise;

// ... (forward declarations of the range-attribute getters/setters elided)

template <typename T>
struct base_region_t : public memory::detail_::base_region_t<T> {
    using parent = memory::detail_::base_region_t<T>;
    using parent::parent;

    bool is_read_mostly() const
    {
        return get_scalar_range_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    void designate_read_mostly() const
    {
        set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    void undesignate_read_mostly() const
    {
        unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    device_t preferred_location() const;
    void set_preferred_location(device_t& device) const;
    void clear_preferred_location() const;
};

struct region_t : public detail_::base_region_t<void> {
    using base_region_t<void>::base_region_t;
    // ...
};

struct const_region_t : public detail_::base_region_t<void const> {
    using base_region_t<void const>::base_region_t;
    // ...
};
template <typename Allocator = ::std::allocator<cuda::device_t> >
typename ::std::vector<device_t, Allocator> accessors(
    managed::const_region_t region,
    const Allocator& allocator = Allocator() );
template <typename T>
inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute)
{
    uint32_t attribute_value { 0 };
    auto result = cuMemRangeGetAttribute(
        &attribute_value, sizeof(attribute_value), attribute,
        device::address(region.start()), region.size());
    throw_if_error_lazy(result,
        "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
    return static_cast<T>(attribute_value);
}
inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id)
{
    auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id);
    throw_if_error_lazy(result,
        "Setting an attribute for a managed memory range at "
        + cuda::detail_::ptr_as_hex(region.start()));
}
inline advice_t as_advice(range_attribute_t attribute, bool set)
{
    switch (attribute) {
    case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
        return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
    case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
        return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
    case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
        return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
    default:
        throw ::std::invalid_argument(
            "CUDA memory range attribute does not correspond to any range advice value");
    }
}
// (set_range_attribute - signature elided)
{
    static constexpr const bool set { true };
    advise(region, as_advice(settable_attribute, set), device_id);
}

// (unset_range_attribute - signature elided)
{
    static constexpr const bool unset { false };
    // ... (declaration of dummy_device_id elided)
    advise(region, as_advice(settable_attribute, unset), dummy_device_id);
}
enum class attachment_t : unsigned {
    global = CU_MEM_ATTACH_GLOBAL,
    host = CU_MEM_ATTACH_HOST,
    single_stream = CU_MEM_ATTACH_SINGLE,
};
inline region_t allocate_in_current_context(
    size_t num_bytes,
    initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
{
    device::address_t allocated = 0;
    auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
        attachment_t::global : attachment_t::host;
    // ...
    auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
    if (is_success(status) && allocated == 0) {
        // Defend against a successful status paired with a null result
        status = static_cast<status_t>(status::unknown);
    }
    throw_if_error_lazy(status,
        "Failed allocating " + ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
    return {as_pointer(allocated), num_bytes};
}
inline void free(void* ptr)
{
    auto result = cuMemFree(device::address(ptr));
    throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
}

inline void free(region_t region)
{
    free(region.start());
}

template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
// (managed-memory allocator function-object)
    void* operator()(size_t num_bytes) const
    {
        return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
    }
inline region_t allocate(
    context::handle_t context_handle,
    size_t num_bytes,
    initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(num_bytes, initial_visibility);
}

// ...

region_t allocate(
    const context_t& context,
    size_t num_bytes,
    initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);

region_t allocate(
    const device_t& device,
    size_t num_bytes,
    initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
region_t allocate(size_t num_bytes);
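// Usage sketch (illustrative): managed ("unified") memory is addressable from both host
// and device code. Assumes a cuda::device_t named `dev`; initial visibility defaults to
// all devices.
//
//     auto region = cuda::memory::managed::allocate(dev, 4096);
//     auto p = static_cast<int*>(region.start());
//     p[0] = 123;                       // touch from the host
//     // ... launch kernels reading/writing p ...
//     cuda::memory::managed::free(region);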
inline void free(void* managed_ptr)
{
    auto result = cuMemFree(device::address(managed_ptr));
    throw_if_error_lazy(result,
        "Freeing managed memory (host and device regions) at address "
        + cuda::detail_::ptr_as_hex(managed_ptr));
}

inline void free(region_t region)
{
    free(region.start());
}
// (advice kinds)
    read_mostly = CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
    preferred_location = CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
    accessor = CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,

// ...

inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id)
{
    auto result = cuMemAdvise(device::address(region.start()), region.size(),
        static_cast<managed::detail_::advice_t>(advice), device_id);
    throw_if_error_lazy(result,
        "Setting advice on a (managed) memory region at "
        + cuda::detail_::ptr_as_hex(region.start()) +
        " w.r.t. " + cuda::device::detail_::identify(device_id));
}

void set(const_region_t region, kind_t advice, const device_t& device);
inline void prefetch(
    const_region_t region,
    cuda::device::id_t destination,
    stream::handle_t source_stream_handle)
{
    auto result = cuMemPrefetchAsync(
        device::address(region.start()), region.size(), destination, source_stream_handle);
    throw_if_error_lazy(result,
        "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
        + cuda::detail_::ptr_as_hex(region.start()) + " to "
        + ((destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) );
}

// ... (public prefetch declaration, including:)
    const cuda::device_t& destination,
// ...

void prefetch_to_host(
    // ...
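// Usage sketch (illustrative): hinting the driver to migrate a managed region before it is
// needed. The exact public signatures are elided in this extract; the sketch assumes they
// take (region, device, stream) and (region, stream) respectively, with `dev` a
// cuda::device_t and `stream` a cuda::stream_t.
//
//     cuda::memory::managed::prefetch(region, dev, stream);       // towards the device
//     cuda::memory::managed::prefetch_to_host(region, stream);    // back towards the host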
template <typename T>
T* device_side_pointer_for(T* host_memory_ptr)
{
    device::address_t device_side_ptr;
    auto get_device_pointer_flags = 0u; // no flags are currently defined for this driver call
    auto status = cuMemHostGetDevicePointer(
        &device_side_ptr,
        host_memory_ptr,
        get_device_pointer_flags);
    throw_if_error_lazy(status,
        "Failed obtaining the device-side pointer for host-memory pointer "
        + cuda::detail_::ptr_as_hex(host_memory_ptr)
        + " supposedly mapped to device memory");
    return as_pointer(device_side_ptr);
}
inline region_pair_t allocate_in_current_context(
    context::handle_t current_context_handle,
    size_t size_in_bytes,
    allocation_options options)
{
    region_pair_t allocated;
    // ...
    allocated.size_in_bytes = size_in_bytes;
    auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
    auto status = cuMemHostAlloc(&allocated.host_side, size_in_bytes, flags);
    if (is_success(status) && (allocated.host_side == nullptr)) {
        // Defend against a successful status paired with a null host-side pointer
        status = static_cast<status_t>(status::named_t::unknown);
    }
    throw_if_error_lazy(status,
        "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
        + " bytes of global memory in " + context::detail_::identify(current_context_handle));
    // ...
}

inline region_pair_t allocate(
    context::handle_t context_handle,
    size_t size_in_bytes,
    allocation_options options)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
}
inline void free(void* host_side_pair)
{
    auto result = cuMemFreeHost(host_side_pair);
    throw_if_error_lazy(result,
        "Freeing a mapped memory region pair with host-side address "
        + cuda::detail_::ptr_as_hex(host_side_pair));
}

// ...
    size_t size_in_bytes,
// ...
    cuda::device_t& device,
    size_t size_in_bytes,
// ...

// (freeing an allocated mapped region pair - signature elided)
{
    detail_::free(pair.host_side);
}
inline void free_region_pair_of(void* ptr)
{
    void* host_side_ptr;
    auto status = cuPointerGetAttribute(&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER,
        memory::device::address(ptr));
    throw_if_error_lazy(status,
        "Failed obtaining the host-side address of supposedly-device-side pointer "
        + cuda::detail_::ptr_as_hex(ptr));
    detail_::free(host_side_ptr);
}
template <typename T>
memory::region_t locate(T&& symbol)
{
    void* start;
    size_t symbol_size;
    auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
    throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
    api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
    throw_if_error_lazy(api_call_result,
        "Could not determine the size of the symbol at address "
        + cuda::detail_::ptr_as_hex(start));
    return { start, symbol_size };
}
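// Usage sketch (illustrative): locate() finds the device-memory region backing a
// __device__ or __constant__ symbol via the runtime API's symbol lookup. Assumes a symbol
// such as the following, declared in device code:
//
//     __device__ float coefficients[64];
//
//     float host_coefficients[64] = { /* ... */ };
//     auto region = cuda::memory::locate(coefficients);   // region.size() == sizeof(coefficients)
//     cuda::memory::copy(region, host_coefficients);      // initialize the symbol from the host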
#endif // CUDA_API_WRAPPERS_MEMORY_HPP_