#ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
#define CUDA_API_WRAPPERS_MEMORY_HPP_

// ...
#include "detail/unique_span.hpp"
// ...
#include <cuda_runtime.h>

// ...
    isnt_portable = false,
// ...
template <typename T, bool CheckConstructibility = false>
inline void check_allocation_type() noexcept
{
    static_assert(::std::is_trivially_constructible<T>::value,
        "Attempt to create a typed buffer of a non-trivially-constructible type");
    static_assert(not CheckConstructibility or ::std::is_trivially_destructible<T>::value,
        "Attempt to create a typed buffer of a non-trivially-destructible type "
        "without allowing for its destruction");
    static_assert(::std::is_trivially_copyable<T>::value,
        "Attempt to create a typed buffer of a non-trivially-copyable type");
}
// ...
        (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
        (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
// ...

template <typename T>
// ...
    constexpr operator ::std::pair<span<T>, span<T>>() const { return { host_side, device_side }; }
    constexpr operator ::std::pair<region_t, region_t>() const { return { host_side, device_side }; }
// ...

template <typename T>
// ...
    return { host_side.as_span<T>(), device_side.as_span<T>() };
#if CUDA_VERSION >= 11020
// ...
    size_t num_bytes, optional<stream::handle_t> stream_handle = {})
// ...
#if CUDA_VERSION >= 11020
    // ...
    auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
    // ...
        status = static_cast<decltype(status)>(status::unknown);
    // ...
        "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
        " bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()));
#endif
    // ...
    auto status = cuMemAlloc(&allocated, num_bytes);
    // ...
        status = static_cast<status_t>(status::unknown);
    // ...
        " bytes of global memory on the current CUDA device");
// ...

#if CUDA_VERSION >= 11020
// ...
    size_t size_in_bytes,
    optional<stream::handle_t> stream_handle = {})
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(size_in_bytes, stream_handle);
}
// ...
    size_t size_in_bytes)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(size_in_bytes);
}
#if CUDA_VERSION >= 11020
inline void free_on_stream(
    void* allocated_region_start,
    // ...
    auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
    // ...
        "Failed scheduling an asynchronous freeing of the global memory region starting at "
        + cuda::detail_::ptr_as_hex(allocated_region_start)
        + " on " + stream::detail_::identify(stream_handle));
// ...
#endif // CUDA_VERSION >= 11020

inline void free_in_current_context(
    // ...
    void* allocated_region_start)
{
    auto result = cuMemFree(address(allocated_region_start));
    if (result == status::success) { return; }
#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
    if (result == status::context_is_destroyed) { return; }
#endif
    // ...
        + cuda::detail_::ptr_as_hex(allocated_region_start)
        + " in " + context::detail_::identify(current_context_handle));
}
#if CUDA_VERSION >= 11020
inline void free(void* region_start, optional_ref<const stream_t> stream = {});
// ...
inline void free(void* ptr);
// ...

#if CUDA_VERSION >= 11020
inline void free(region_t region, optional_ref<const stream_t> stream = {})
{
    // ...
    free(region.start(), stream);
}
// ...
    free(region.start());
// ...

#if CUDA_VERSION >= 11020
// ...
region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
// ...

    void* operator()(size_t num_bytes) const
    {
        return detail_::allocate_in_current_context(num_bytes).start();
    }
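// Illustrative usage sketch (not part of the original header): allocating and
// freeing global device memory through the public API declared above. The
// device-taking allocate() overload and cuda::device::current::get() are
// assumed from elsewhere in the library; exact namespaces may differ.
//
//     auto device = cuda::device::current::get();
//     auto region = cuda::memory::device::allocate(device, 1024);  // 1 KiB of global memory
//     // ... use region.start() / region.size(), e.g. as a kernel argument ...
//     cuda::memory::device::free(region);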
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});

// ...

inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
    return typed_set<unsigned char>(
        static_cast<unsigned char*>(start),
        static_cast<unsigned char>(byte_value),
        num_bytes,
        stream);
}

// ...

inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
{
    set(region.start(), byte_value, region.size(), stream);
}

// ...

inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
    set(start, 0, num_bytes, stream);
}

// ...

inline void zero(region_t region, optional_ref<const stream_t> stream = {})
{
    zero(region.start(), region.size(), stream);
}

// ...

template <typename T>
inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
{
    zero(ptr, sizeof(T), stream);
}
inline void copy(
    void* destination,
    const void* source,
    size_t num_bytes,
    stream::handle_t stream_handle)
{
    // ...
    throw_if_error_lazy(result,
        "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
}

// ...
    if (destination.size() < source.size()) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    // ...
    copy(destination.start(), source.start(), source.size(), stream_handle);
// ...

inline status_t multidim_copy_in_current_context(
    ::std::integral_constant<dimensionality_t, 2>,
    // ...
    optional<stream::handle_t> stream_handle)
{
    // ...
    return stream_handle ?
        cuMemcpy2DAsync(&params, *stream_handle) :
        // ...
}

inline status_t multidim_copy_in_current_context(
    ::std::integral_constant<dimensionality_t, 3>,
    // ...
    optional<stream::handle_t> stream_handle)
{
    if (params.srcContext == params.dstContext) {
        // ...
        using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
        auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
        return stream_handle ?
            cuMemcpy3DAsync(intra_context_params, *stream_handle) :
            cuMemcpy3D(intra_context_params);
    }
    return stream_handle ?
        cuMemcpy3DPeerAsync(&params, *stream_handle) :
        cuMemcpy3DPeer(&params);
}

template <dimensionality_t NumDimensions>
// ...
    return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
// ...

template <dimensionality_t NumDimensions>
// ...
    optional<stream::handle_t> stream_handle)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
template <typename T, dimensionality_t NumDimensions>
// ...
    auto dims = source.dimensions();
    // ...
    params.template set_extent<T>(dims);
    // ...
    params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
    params.set_default_pitches();
    // ...
    auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
    throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    auto dims = destination.dimensions();
    // ...
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
    params.set_endpoint(endpoint_t::destination, destination);
    params.set_default_pitches();
    // ...
    auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
    throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
// ...

template <typename T>
void copy_single(T* destination, const T* source, optional<stream::handle_t> stream_handle)
{
    copy(destination, source, sizeof(T), stream_handle);
}
template <typename T, size_t N>
inline void copy(span<T> destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
    // ...
    if (destination.size() < N) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    // ...
    return copy(destination.data(), source, sizeof(T) * N, stream);
}

// ...

template <typename T, size_t N>
void copy(c_array<T,N>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
{
    // ...
    if (source.size() > N) {
        throw ::std::invalid_argument(
            "Attempt to copy a span of " + ::std::to_string(source.size()) +
            " elements into an array of " + ::std::to_string(N) + " elements");
    }
    // ...
    return copy(destination, source.start(), sizeof(T) * N, stream);
}

// ...

template <typename T, size_t N>
inline void copy(void* destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
    // ...
    return copy(destination, source, sizeof(T) * N, stream);
}

// ...

template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, T* source, optional_ref<const stream_t> stream = {})
{
    // ...
    return copy(destination, source, sizeof(T) * N, stream);
}
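// Illustrative usage sketch (not part of the original header): round-tripping a
// fixed-size C array through device memory using the c_array copy overloads
// above; the allocation calls and their namespaces are assumed from the device
// memory API earlier in this header.
//
//     const float squares[4] = { 1.f, 4.f, 9.f, 16.f };
//     auto device = cuda::device::current::get();
//     auto buffer = cuda::memory::device::allocate(device, sizeof(squares));
//     cuda::memory::copy(buffer, squares);   // region_t <- c_array
//     float back[4];
//     cuda::memory::copy(back, buffer);      // c_array <- region; sizes must match exactly
//     cuda::memory::device::free(buffer);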
void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {});

// ...

inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
{
    return set(region.start(), byte_value, region.size(), stream);
}

// ...

inline void zero(region_t region, optional_ref<const stream_t> stream = {})
{
    return set(region, 0, stream);
}

// ...

inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
    return set(ptr, 0, num_bytes, stream);
}

// ...

template <typename T>
// ...
    zero(ptr, sizeof(T));
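// Illustrative usage sketch (not part of the original header): filling a device
// region with the set() and zero() overloads above. The allocation calls and
// exact namespaces are assumptions based on the rest of this header.
//
//     auto device = cuda::device::current::get();
//     auto region = cuda::memory::device::allocate(device, 4096);
//     cuda::memory::zero(region);        // every byte becomes 0
//     cuda::memory::set(region, 0xFF);   // every byte becomes 0xFF
//     cuda::memory::device::free(region);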
inline status_t multidim_copy(
    ::std::integral_constant<dimensionality_t, 2> two,
    copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
{
    // ...
    auto context_handle = context::current::detail_::get_handle();
    if (context_handle != context::detail_::none) {
        return detail_::multidim_copy_in_current_context(two, params, stream_handle);
    }
    auto current_device_id = cuda::device::current::detail_::get_id();
    context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
    context::current::detail_::push(context_handle);
    // ...
    auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle);
    context::current::detail_::pop();
    cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
    // ...
}

// ...
    context::current::detail_::scoped_override_t context_for_this_scope(context_handle);
    return multidim_copy(::std::integral_constant<dimensionality_t, 2>{}, params, stream_handle);
// ...

inline status_t multidim_copy(
    ::std::integral_constant<dimensionality_t, 3>,
    copy_parameters_t<3> params, optional<stream::handle_t> stream_handle)
{
    if (params.srcContext == params.dstContext) {
        context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
        return detail_::multidim_copy_in_current_context(params, stream_handle);
    }
    return stream_handle ?
        cuMemcpy3DPeerAsync(&params, *stream_handle) :
        cuMemcpy3DPeer(&params);
}

template <dimensionality_t NumDimensions>
// ...
    return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
// ...

template <dimensionality_t NumDimensions>
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    auto dims = destination.dimensions();
    // ...
    params.template set_extent<T>(dims);
    params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
    params.set_endpoint(endpoint_t::destination, destination);
    // ...
    copy(params, stream);
template <typename T, dimensionality_t NumDimensions>
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    if (destination.size() < source.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy a span of " + ::std::to_string(source.size()) +
            " elements into a CUDA array of " + ::std::to_string(destination.size()) +
            " elements");
    }
    // ...
    copy(destination, source.data(), stream);
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    auto dims = source.dimensions();
    // ...
    params.template set_extent<T>(dims);
    // ...
    params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
    params.set_default_pitches();
    // ...
    copy(params, stream);
template <typename T, dimensionality_t NumDimensions>
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    if (destination.size() < source.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy a CUDA array of " + ::std::to_string(source.size()) +
            " elements into a span of " + ::std::to_string(destination.size()) +
            " elements");
    }
    // ...
    copy(destination.data(), source, stream);
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    auto dims = source.dimensions();
    // ...
    params.template set_extent<T>(dims);
    // ...
    params.set_endpoint(endpoint_t::destination, destination);
    params.set_default_pitches();
    // ...
    detail_::multidim_copy<NumDimensions>(source.context_handle(), params, stream);
template <typename T, dimensionality_t NumDimensions>
// ...
    if (destination.size() < source.size_bytes()) {
        throw ::std::invalid_argument(
            "Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a "
            "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
    }
    // ...
    copy(destination.start(), source, stream);
// ...

template <typename T, dimensionality_t NumDimensions>
// ...
    if (destination.size_bytes() < source.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into an array of size " + ::std::to_string(destination.size_bytes()) +
            " bytes");
    }
    // ...
    copy(destination, static_cast<T const*>(source.start()), stream);
template <typename T>
void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream = {});

// ...

void copy(void* destination, void const* source, size_t num_bytes, optional_ref<const stream_t> stream = {});

// ...

template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
{
    // ...
    size_t required_size = N * sizeof(T);
    if (source.size() != required_size) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
    }
    // ...
    return copy(&(destination[0]), source.start(), sizeof(T) * N, stream);
}
template <typename T, size_t N>
inline void copy(region_t destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
    // ...
    if (destination.size() < N * sizeof(T)) {
        throw ::std::logic_error("Source size exceeds destination size");
    }
    // ...
    return copy(destination.start(), source, sizeof(T) * N, stream);
}
// ...
    if (destination.size() < num_bytes) {
        throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
    }
    // ...
    copy(destination.start(), source.start(), num_bytes, stream);
// ...
    copy(destination, source, source.size(), stream);
// ...

inline void copy(region_t destination, void* source, optional_ref<const stream_t> stream = {})
{
    // ...
    return copy(destination.start(), source, destination.size(), stream);
}

// ...

inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
    // ...
    if (destination.size() < num_bytes) {
        throw ::std::logic_error("Number of bytes to copy exceeds destination size");
    }
    // ...
    return copy(destination.start(), source, num_bytes, stream);
}

// ...

inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
    // ...
    if (source.size() < num_bytes) {
        throw ::std::logic_error("Attempt to copy more than the source region's size");
    }
    // ...
    copy(destination, source.start(), num_bytes, stream);
}
// ...
    copy(destination, source, source.size(), stream);
// ...

    auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
// ...
    set(region.start(), byte_value, region.size(), stream_handle);
// ...
    set(start, 0, num_bytes, stream_handle);
// ...
    zero(region.start(), region.size(), stream_handle);
template <typename T>
inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
{
    static_assert(::std::is_trivially_copyable<T>::value,
        "Non-trivially-copyable types cannot be used for setting memory");
    static_assert(
        sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
        "Unsupported type size - only sizes 1, 2 and 4 are supported");
    // ...
    switch (sizeof(T)) {
    case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
    case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
    case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
    // ...
template <typename T>
void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);

// ...

void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
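// Illustrative usage sketch (not part of the original header): typed_set(),
// declared above, writes a repeating 1-, 2- or 4-byte pattern, so unlike the
// byte-wise set() it can fill a buffer of floats with a non-zero value. The
// exact namespaces of typed_set() and of the allocation calls are assumptions.
//
//     auto device = cuda::device::current::get();
//     auto region = cuda::memory::device::allocate(device, 1024 * sizeof(float));
//     auto elements = static_cast<float*>(region.start());
//     cuda::memory::device::typed_set(elements, 1.0f, 1024);  // every element becomes 1.0f
//     cuda::memory::device::free(region);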
namespace inter_context {

// ...
    const void * source_address,
    // ...
    optional_ref<const stream_t> stream);
// ...
    void * destination_address,
    // ...
    const void * source_address,
    // ...
    optional_ref<const stream_t> stream);
// ...
    optional_ref<const stream_t> stream)
{
    // ...
    copy(destination, destination_context, source.start(), source_context, source.size(), stream);
}
// ...
    optional_ref<const stream_t> stream)
{
    // ...
    copy(destination.start(), destination_context, source, source_context, destination.size(), stream);
}
// ...
    optional_ref<const stream_t> stream)
{
    // ...
    if (destination.size() < source.size()) {
        throw ::std::invalid_argument(
            "Attempt to copy a region of " + ::std::to_string(source.size()) +
            " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
    }
    // ...
    copy(destination.start(), destination_context, source, source_context, stream);
}
template <typename T, dimensionality_t NumDimensions>
// ...
    optional_ref<const stream_t> stream)
// ...

    size_t size_in_bytes,
// ...
    size_t size_in_bytes,
// ...
    auto result = cuMemFreeHost(host_ptr);
#ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
    if (result == status::success) { return; }
#else
    if (result == status::success or result == status::context_is_destroyed) { return; }
#endif
    throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
inline void register_(const void *ptr, size_t size, unsigned flags)
{
    auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
    // ...
        "Could not register and page-lock the region of " + ::std::to_string(size) +
        " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
        " with flags " + cuda::detail_::as_hex(flags));
}
// ...
    register_(region.start(), region.size(), flags);
// ...

    is_mapped_io_space = true,
    is_not_mapped_io_space = false
// ...
    do_not_map_into_device_memory = false
// ...

    bool register_mapped_io_space,
    bool map_into_device_space,
    bool make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
    , bool considered_read_only_by_device
#endif
// ...
        (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
        | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
        | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
#if CUDA_VERSION >= 11010
        | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
#endif
    bool register_mapped_io_space,
    bool map_into_device_space,
    bool make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
    , bool considered_read_only_by_device
#endif
// ...
        register_mapped_io_space,
        map_into_device_space,
        make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
        , considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
// ...
    unsigned no_flags_set { 0 };
// ...
    register_(region.start(), region.size());
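// Illustrative usage sketch (not part of the original header): page-locking an
// existing host buffer with register_() so asynchronous copies can use it, then
// releasing it with deregister(). The cuda::memory::host namespace and the
// region_t aggregate construction are assumptions based on the rest of this header.
//
//     std::vector<double> samples(1 << 20);
//     cuda::memory::region_t buffer { samples.data(), samples.size() * sizeof(double) };
//     cuda::memory::host::register_(buffer);
//     // ... enqueue asynchronous copies involving `buffer` ...
//     cuda::memory::host::deregister(buffer);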
    auto result = cuMemHostUnregister(const_cast<void *>(ptr));
    // ...
        "Could not unregister the memory segment starting at " + cuda::detail_::ptr_as_hex(ptr));
inline void set(void* start, int byte_value, size_t num_bytes)
{
    ::std::memset(start, byte_value, num_bytes);
}
// ...
    memory::set(region.start(), byte_value, region.size(), nullopt);
// ...

inline void zero(void* start, size_t num_bytes)
{
    set(start, 0, num_bytes);
}
// ...

template <typename T>
// ...
    zero(ptr, sizeof(T));
using attribute_t = CUmem_range_attribute;
using advice_t = CUmem_advise;

template <typename T>
inline T get_scalar_attribute(const_region_t region, attribute_t attribute)
{
    // ...
    uint32_t attribute_value { 0 };
    auto result = cuMemRangeGetAttribute(
        &attribute_value, sizeof(attribute_value), attribute,
        device::address(region.start()), region.size());
    // ...
        "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
    return static_cast<T>(attribute_value);
}

// ...
    auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id);
    // ...
        + cuda::detail_::ptr_as_hex(region.start()));
// ...

inline advice_t as_advice(attribute_t attribute, bool set)
{
    switch (attribute) {
    case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
        return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
    case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
        return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
    case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
        return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
    default:
        throw ::std::invalid_argument(
            "CUDA memory range attribute does not correspond to any range advice value");
    }
}
    static constexpr const bool set { true };
    advise(region, as_advice(settable_attribute, set), device_id);
// ...

inline void set_attribute(const_region_t region, attribute_t settable_attribute)
{
    static constexpr const bool set { true };
    // ...
    advise(region, as_advice(settable_attribute, set), dummy_device_id);
}

inline void unset_attribute(const_region_t region, attribute_t settable_attribute)
{
    static constexpr const bool unset { false };
    // ...
    advise(region, as_advice(settable_attribute, unset), dummy_device_id);
}
template <typename GenericRegion>
struct region_helper : public GenericRegion {
    using GenericRegion::GenericRegion;

    bool is_read_mostly() const
    {
        return range::detail_::get_scalar_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    void designate_read_mostly() const
    {
        range::detail_::set_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    void undesignate_read_mostly() const
    {
        range::detail_::unset_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
    }

    device_t preferred_location() const;
    void set_preferred_location(device_t& device) const;
    void clear_preferred_location() const;
    // ...
};

// ...
using region_t = detail_::region_helper<memory::region_t>;
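// Illustrative usage sketch (not part of the original header): the managed
// region_t alias above exposes the read-mostly designation. The allocation and
// free calls, and their exact overloads in the cuda::memory::managed namespace,
// are assumptions based on the allocation code later in this header.
//
//     auto device = cuda::device::current::get();
//     auto region = cuda::memory::managed::allocate(device, 4096);
//     region.designate_read_mostly();    // hint: read by many devices, rarely written
//     // ... read the region from host and device code ...
//     region.undesignate_read_mostly();
//     cuda::memory::managed::free(region);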
template <typename Allocator = ::std::allocator<cuda::device_t> >
// ...

    global        = CU_MEM_ATTACH_GLOBAL,
    host          = CU_MEM_ATTACH_HOST,
    single_stream = CU_MEM_ATTACH_SINGLE,
// ...

    auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
        attachment_t::global : attachment_t::host;
    // ...
    auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
    // ...
        status = static_cast<status_t>(status::unknown);
    // ...
        + ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
// ...

inline void free(void* ptr)
{
    // ...
    throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
}
// ...
    free(region.start());
// ...

template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
// ...
    void* operator()(size_t num_bytes) const
    {
        return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
    }
// ...
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return allocate_in_current_context(num_bytes, initial_visibility);
inline void free(void* managed_ptr)
{
    // ...
        "Freeing managed memory (host and device regions) at address "
        + cuda::detail_::ptr_as_hex(managed_ptr));
}
// ...
    free(region.start());
// ...

    auto result = cuMemPrefetchAsync(
        device::address(region.start()), region.size(), destination, source_stream_handle);
    // ...
        "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
        + cuda::detail_::ptr_as_hex(region.start()) + " to " +
        ((destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)));
// ...

template <typename T>
// ...
    auto unconsted_host_mem_ptr = const_cast<typename ::std::remove_const<T>::type *>(host_memory_ptr);
    // ...
    auto get_device_pointer_flags = 0u;
    auto status = cuMemHostGetDevicePointer(
        // ...
        unconsted_host_mem_ptr,
        get_device_pointer_flags);
    // ...
        "Failed obtaining the device-side pointer for host-memory pointer "
        + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
    size_t size_in_bytes,
    // ...
    auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
    void* allocated_ptr;
    auto status = cuMemHostAlloc(&allocated_ptr, size_in_bytes, flags);
    if (is_success(status) && (allocated_ptr == nullptr)) {
        // ...
        status = static_cast<status_t>(status::named_t::unknown);
    }
    // ...
        "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
        + " bytes of global memory in " + context::detail_::identify(current_context_handle));
    allocated.host_side = { allocated_ptr, size_in_bytes };
// ...

    size_t size_in_bytes,
    // ...
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
// ...

inline void free(void* host_side_pair)
{
    auto result = cuMemFreeHost(host_side_pair);
    // ...
        + cuda::detail_::ptr_as_hex(host_side_pair));
}
    size_t size_in_bytes,
// ...
    size_t size_in_bytes,
// ...
    void* host_side_ptr;
    auto status = cuPointerGetAttribute(&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER,
        memory::device::address(ptr));
    throw_if_error_lazy(status,
        "Failed obtaining the host-side address of supposedly-device-side pointer "
        + cuda::detail_::ptr_as_hex(ptr));
template <typename T, typename RawDeleter, typename RegionAllocator>
unique_span<T> make_convenient_type_unique_span(size_t size, RegionAllocator allocator)
{
    // ...
    memory::detail_::check_allocation_type<T>();
    auto deleter = [](span<T> sp) {
        return RawDeleter{}(sp.data());
    };
    // ...
    region_t allocated_region = allocator(size * sizeof(T));
    return unique_span<T>(
        allocated_region.as_span<T>(),
        // ...
}

// ...

template <typename T>
// ...
    auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
    CAW_SET_SCOPE_CONTEXT(context_handle);
    return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocate_in_current_context_);
template <typename T>
// ...
template <typename T>
// ...
template <typename T>
// ...

template <typename T>
// ...
    return device::make_unique_span<T>(context, size);
// ...

template <typename T>
// ...
    return device::make_unique_span<T>(device, size);
template <typename T>
// ...
    auto allocator = [](size_t size) { return allocate(size); };
    return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
// ...

template <typename T, initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
// ...
    CAW_SET_SCOPE_CONTEXT(context_handle);
    auto allocator = [](size_t size) {
        return allocate_in_current_context(size, InitialVisibility);
    };
    return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
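// Illustrative usage sketch (not part of the original header): RAII-style
// allocation with make_unique_span(), which releases the memory when the span
// goes out of scope. The device-taking overload declared above is used; the
// namespace of the zero() call is an assumption.
//
//     auto device = cuda::device::current::get();
//     auto values = cuda::memory::device::make_unique_span<float>(device, 1024);
//     cuda::memory::zero(values.data(), values.size() * sizeof(float));
//     // `values` frees the device allocation when it goes out of scope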
template <typename T>
// ...
template <typename T>
// ...
template <typename T>
// ...

template <typename T>
// ...
    auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
    throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
    api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
    throw_if_error_lazy(api_call_result,
        "Could not obtain the size of the symbol at device memory address "
        + cuda::detail_::ptr_as_hex(start));
    return { start, symbol_size };
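// Illustrative usage sketch (not part of the original header): locating a
// __device__ symbol and overwriting it via the generic copy() API; the
// __device__ variable and the enclosing function are hypothetical.
//
//     __device__ float scale_factor;   // defined in device code
//
//     void update_scale(float new_value)
//     {
//         auto region = cuda::memory::locate(scale_factor);
//         cuda::memory::copy(region, &new_value, sizeof(float));
//     }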
#endif // CUDA_API_WRAPPERS_MEMORY_HPP_