26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_ 27 #define CUDA_API_WRAPPERS_MEMORY_HPP_ 36 #include "detail/unique_span.hpp" 39 #include <cuda_runtime.h> 65 isnt_portable =
false,
/// Compile-time validation of an element type for a typed memory allocation
/// (e.g. a typed buffer or unique_span of elements of type @tparam T).
///
/// @tparam T the element type of the prospective allocation
/// @tparam CheckConstructibility when true, additionally require that elements
///     be trivially destructible, so the buffer may safely be destroyed
///     NOTE(review): despite its name, this flag gates the *destructibility*
///     check — confirm the intended naming.
///
/// All checks are static_assert's; this function performs no run-time work.
template <typename T, bool CheckConstructibility = false>
inline void check_allocation_type() noexcept
{
	// Fixed message typo: "constructive" -> "constructible"
	static_assert(::std::is_trivially_constructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-constructible type");
	static_assert(not CheckConstructibility or ::std::is_trivially_destructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-destructible type "
		"without allowing for its destruction");
	static_assert(::std::is_trivially_copyable<T>::value,
		"Attempt to create a typed buffer of a non-trivially-copyable type");
}
118 (options.
portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
119 (options.
write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
141 template <
typename T>
148 constexpr operator ::std::pair<span<T>, span<T>>()
const {
return { host_side, device_side }; }
149 constexpr operator ::std::pair<region_t, region_t>()
const {
return { host_side, device_side }; }
166 template <
typename T>
169 return { host_side.as_span<T>(), device_side.as_span<T>() };
185 #if CUDA_VERSION >= 11020 187 size_t num_bytes, optional<stream::handle_t> stream_handle = {})
192 #if CUDA_VERSION >= 11020 197 auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
200 status =
static_cast<decltype(status)
>(status::unknown);
203 "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
204 " bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
209 auto status = cuMemAlloc(&allocated, num_bytes);
212 status =
static_cast<status_t>(status::unknown);
215 " bytes of global memory on the current CUDA device");
219 #if CUDA_VERSION >= 11020 222 size_t size_in_bytes,
223 optional<stream::handle_t> stream_handle = {})
225 CAW_SET_SCOPE_CONTEXT(context_handle);
226 return allocate_in_current_context(size_in_bytes, stream_handle);
231 size_t size_in_bytes)
233 CAW_SET_SCOPE_CONTEXT(context_handle);
234 return allocate_in_current_context(size_in_bytes);
238 #if CUDA_VERSION >= 11020 239 inline void free_on_stream(
240 void* allocated_region_start,
243 auto status = cuMemFreeAsync(
device::address(allocated_region_start), stream_handle);
245 "Failed scheduling an asynchronous freeing of the global memory region starting at " 246 + cuda::detail_::ptr_as_hex(allocated_region_start) +
" on " 247 + stream::detail_::identify(stream_handle));
249 #endif // CUDA_VERSION >= 11020 251 inline void free_in_current_context(
253 void* allocated_region_start)
255 auto result = cuMemFree(
address(allocated_region_start));
256 if (result == status::success) {
return; }
257 #ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT 258 if (result == status::context_is_destroyed) {
return; }
261 + cuda::detail_::ptr_as_hex(allocated_region_start)
262 +
" in " + context::detail_::identify(current_context_handle));
268 #if CUDA_VERSION >= 11020 269 inline void free(
void* region_start, optional_ref<const stream_t> stream = {});
271 inline void free(
void* ptr);
274 #if CUDA_VERSION >= 11020 275 inline void free(
region_t region, optional_ref<const stream_t> stream = {})
278 free(region.start(), stream);
284 free(region.start());
288 #if CUDA_VERSION >= 11020 302 region_t allocate(
size_t size_in_bytes, optional_ref<const stream_t> stream);
339 void* operator()(
size_t num_bytes)
const {
340 return detail_::allocate_in_current_context(num_bytes).start();
362 template <
typename T>
363 void typed_set(T*
start,
const T& value,
size_t num_elements, optional_ref<const stream_t> stream = {});
387 inline void set(
void*
start,
int byte_value,
size_t num_bytes, optional_ref<const stream_t> stream = {})
389 return typed_set<unsigned char>(
390 static_cast<unsigned char*
>(
start),
391 static_cast<unsigned char>(byte_value),
405 inline void set(
region_t region,
int byte_value, optional_ref<const stream_t> stream = {})
407 set(region.start(), byte_value, region.size(), stream);
418 inline void zero(
void* start,
size_t num_bytes, optional_ref<const stream_t> stream = {})
420 set(
start, 0, num_bytes, stream);
430 inline void zero(
region_t region, optional_ref<const stream_t> stream = {})
432 zero(region.start(), region.size(), stream);
442 template <
typename T>
443 inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
445 zero(ptr,
sizeof(T), stream);
466 inline void copy(
void* destination,
const void* source,
size_t num_bytes,
stream::handle_t stream_handle)
472 throw_if_error_lazy(result,
"Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
485 if (destination.size() < source.size()) {
486 throw ::std::logic_error(
"Source size exceeds destination size");
489 copy(destination.start(), source.start(), source.size(), stream_handle);
495 inline status_t multidim_copy_in_current_context(
496 ::std::integral_constant<dimensionality_t, 2>,
498 optional<stream::handle_t> stream_handle)
506 return stream_handle ?
507 cuMemcpy2DAsync(¶ms, *stream_handle) :
511 inline status_t multidim_copy_in_current_context(
512 ::std::integral_constant<dimensionality_t, 3>,
514 optional<stream::handle_t> stream_handle)
516 if (params.srcContext == params.dstContext) {
518 using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
519 auto* intra_context_params =
reinterpret_cast<intra_context_type *
>(¶ms);
520 return stream_handle ?
521 cuMemcpy3DAsync(intra_context_params, *stream_handle) :
522 cuMemcpy3D(intra_context_params);
524 return stream_handle ?
525 cuMemcpy3DPeerAsync(¶ms, *stream_handle) :
526 cuMemcpy3DPeer(¶ms);
529 template<dimensionality_t NumDimensions>
531 return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
535 template<dimensionality_t NumDimensions>
539 optional<stream::handle_t> stream_handle)
541 CAW_SET_SCOPE_CONTEXT(context_handle);
542 return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
547 template <
typename T, dimensionality_t NumDimensions>
551 auto dims = source.dimensions();
556 params.template set_extent<T>(dims);
558 params.
set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
559 params.set_default_pitches();
561 auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
562 throw_if_error(status,
"Scheduling an asynchronous copy from an array into a regular memory region");
566 template <
typename T, dimensionality_t NumDimensions>
570 auto dims = destination.dimensions();
575 params.template set_extent<T>(dims);
576 params.
set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
577 params.
set_endpoint(endpoint_t::destination, destination);
578 params.set_default_pitches();
580 auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
581 throw_if_error(status,
"Scheduling an asynchronous copy from regular memory into an array");
598 template <
typename T>
599 void copy_single(T* destination,
const T* source, optional<stream::handle_t> stream_handle)
601 copy(destination, source,
sizeof(T), stream_handle);
626 template <
typename T,
size_t N>
627 inline void copy(span<T> destination, c_array<const T,N>
const& source, optional_ref<const stream_t> stream = {})
630 if (destination.size() < N) {
631 throw ::std::logic_error(
"Source size exceeds destination size");
634 return copy(destination.data(), source,
sizeof(T) * N, stream);
646 template <
typename T,
size_t N>
647 void copy(c_array<T,N>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
650 if (source.size() > N) {
651 throw ::std::invalid_argument(
652 "Attempt to copy a span of " + ::std::to_string(source.size()) +
653 " elements into an array of " + ::std::to_string(N) +
" elements");
656 return copy(destination, source.start(),
sizeof(T) * N, stream);
668 template <
typename T,
size_t N>
669 inline void copy(
void* destination, c_array<const T,N>
const& source, optional_ref<const stream_t> stream = {})
671 return copy(destination, source,
sizeof(T) * N, stream);
691 template <
typename T,
size_t N>
692 inline void copy(c_array<T,N>& destination, T* source, optional_ref<const stream_t> stream = {})
694 return copy(destination, source,
sizeof(T) * N, stream);
711 void set(
void* ptr,
int byte_value,
size_t num_bytes, optional_ref<const stream_t> stream = {});
724 inline void set(
region_t region,
int byte_value, optional_ref<const stream_t> stream = {})
726 return set(region.start(), byte_value, region.size(), stream);
736 inline void zero(
region_t region, optional_ref<const stream_t> stream = {})
738 return set(region, 0, stream);
749 inline void zero(
void* ptr,
size_t num_bytes, optional_ref<const stream_t> stream = {})
751 return set(ptr, 0, num_bytes, stream);
761 template <
typename T>
764 zero(ptr,
sizeof(T));
769 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2> two,
copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
772 auto context_handle = context::current::detail_::get_handle();
773 if (context_handle != context::detail_::none) {
774 return detail_::multidim_copy_in_current_context(two, params, stream_handle);
776 auto current_device_id = cuda::device::current::detail_::get_id();
777 context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
778 context::current::detail_::push(context_handle);
781 auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle);
782 context::current::detail_::pop();
783 cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
789 context::current::detail_::scoped_override_t context_for_this_scope(context_handle);
790 return multidim_copy(::std::integral_constant<dimensionality_t, 2>{}, params, stream_handle);
793 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>,
copy_parameters_t<3> params, optional<stream::handle_t> stream_handle)
795 if (params.srcContext == params.dstContext) {
796 context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
797 return detail_::multidim_copy_in_current_context(params, stream_handle);
799 return stream_handle ?
800 cuMemcpy3DPeerAsync(¶ms, *stream_handle) :
801 cuMemcpy3DPeer(¶ms);
804 template<dimensionality_t NumDimensions>
807 return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
823 template<dimensionality_t NumDimensions>
838 template<
typename T, dimensionality_t NumDimensions>
841 auto dims = destination.dimensions();
844 params.template set_extent<T>(dims);
845 params.
set_endpoint(endpoint_t::source, source_context.handle(),
const_cast<T*
>(source), dims);
846 params.
set_endpoint(endpoint_t::destination, destination);
848 copy(params, stream);
869 template <
typename T, dimensionality_t NumDimensions>
880 template<
typename T, dimensionality_t NumDimensions>
884 if (destination.
size() < source.size()) {
885 throw ::std::invalid_argument(
886 "Attempt to copy a span of " + ::std::to_string(source.size()) +
887 " elements into a CUDA array of " + ::std::to_string(destination.
size()) +
" elements");
890 copy(destination, source.data(), stream);
903 template <
typename T, dimensionality_t NumDimensions>
906 auto dims = source.dimensions();
910 params.template set_extent<T>(dims);
912 params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
913 params.set_default_pitches();
915 copy(params, stream);
936 template <
typename T, dimensionality_t NumDimensions>
947 template <
typename T, dimensionality_t NumDimensions>
951 if (destination.size() < source.
size()) {
952 throw ::std::invalid_argument(
953 "Attempt to copy a CUDA array of " + ::std::to_string(source.
size()) +
954 " elements into a span of " + ::std::to_string(destination.size()) +
" elements");
957 copy(destination.data(), source, stream);
967 template <
typename T, dimensionality_t NumDimensions>
970 auto dims = source.dimensions();
974 params.template set_extent<T>(dims);
976 params.
set_endpoint(endpoint_t::destination, destination);
977 params.set_default_pitches();
980 detail_::multidim_copy<NumDimensions>(source.context_handle(), params, stream);
1001 template <
typename T, dimensionality_t NumDimensions>
1005 if (destination.size() < source.
size_bytes()) {
1006 throw ::std::invalid_argument(
1007 "Attempt to copy " + ::std::to_string(source.
size_bytes()) +
" bytes from an array into a " 1008 "region of smaller size (" + ::std::to_string(destination.size()) +
" bytes)");
1011 copy(destination.start(), source, stream);
1027 template <
typename T, dimensionality_t NumDimensions>
1031 if (destination.
size_bytes() < source.size()) {
1032 throw ::std::invalid_argument(
1033 "Attempt to copy a region of " + ::std::to_string(source.size()) +
1034 " bytes into an array of size " + ::std::to_string(destination.
size_bytes()) +
" bytes");
1037 copy(destination, static_cast<T const*>(source.start()), stream);
1056 template <
typename T>
1057 void copy_single(T* destination,
const T* source, optional_ref<const stream_t> stream = {});
1077 void copy(
void* destination,
void const* source,
size_t num_bytes, optional_ref<const stream_t> stream = {});
1099 template <
typename T,
size_t N>
1100 inline void copy(c_array<T,N>& destination,
const_region_t source, optional_ref<const stream_t> stream = {})
1103 size_t required_size = N *
sizeof(T);
1104 if (source.size() != required_size) {
1105 throw ::std::invalid_argument(
1106 "Attempt to copy a region of " + ::std::to_string(source.size()) +
1107 " bytes into an array of size " + ::std::to_string(required_size) +
" bytes");
1110 return copy(&(destination[0]), source.start(),
sizeof(T) * N, stream);
1136 template <
typename T,
size_t N>
1137 inline void copy(
region_t destination, c_array<const T,N>
const& source, optional_ref<const stream_t> stream = {})
1140 if (destination.size() < N) {
1141 throw ::std::logic_error(
"Source size exceeds destination size");
1144 return copy(destination.start(), source,
sizeof(T) * N, stream);
1162 if (destination.size() < num_bytes) {
1163 throw ::std::logic_error(
"Attempt to copy beyond the end of the destination region");
1166 copy(destination.start(), source.start(), num_bytes, stream);
1188 copy(destination, source, source.size(), stream);
1209 inline void copy(
region_t destination,
void* source, optional_ref<const stream_t> stream = {})
1211 return copy(destination.start(), source, destination.size(), stream);
1231 inline void copy(
region_t destination,
void* source,
size_t num_bytes, optional_ref<const stream_t> stream = {})
1234 if (destination.size() < num_bytes) {
1235 throw ::std::logic_error(
"Number of bytes to copy exceeds destination size");
1238 return copy(destination.start(), source, num_bytes, stream);
1260 inline void copy(
void* destination,
const_region_t source,
size_t num_bytes, optional_ref<const stream_t> stream = {})
1263 if (source.size() < num_bytes) {
1264 throw ::std::logic_error(
"Attempt to copy more than the source region's size");
1267 copy(destination, source.start(), num_bytes, stream);
1286 copy(destination, source, source.size(), stream);
1296 auto result = cuMemsetD8Async(
address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
1303 set(region.start(), byte_value, region.size(), stream_handle);
1308 set(
start, 0, num_bytes, stream_handle);
1313 zero(region.start(), region.size(), stream_handle);
1317 template <
typename T>
1318 inline void typed_set(T* start,
const T& value,
size_t num_elements,
stream::handle_t stream_handle)
1320 static_assert(::std::is_trivially_copyable<T>::value,
"Non-trivially-copyable types cannot be used for setting memory");
1322 sizeof(T) == 1 or
sizeof(T) == 2 or
1323 sizeof(T) == 4 or
sizeof(T) == 8,
1324 "Unsupported type size - only sizes 1, 2 and 4 are supported");
1328 case(1): result = cuMemsetD8Async (
address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle);
break;
1329 case(2): result = cuMemsetD16Async(
address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle);
break;
1330 case(4): result = cuMemsetD32Async(
address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle);
break;
1349 template <
typename T>
1350 void typed_set(T* start,
const T& value,
size_t num_elements, optional_ref<const stream_t> stream);
1360 void zero(
void* start,
size_t num_bytes, optional_ref<const stream_t> stream);
1364 namespace inter_context {
1369 const void * source_address,
1372 optional_ref<const stream_t> stream);
1436 void * destination_address,
1438 const void * source_address,
1441 optional_ref<const stream_t> stream);
1449 optional_ref<const stream_t> stream)
1451 copy(destination, destination_context, source.start(), source_context, source.size(), stream);
1460 optional_ref<const stream_t> stream)
1462 copy(destination.start(), destination_context, source, source_context, destination.size(), stream);
1471 optional_ref<const stream_t> stream)
1474 if (destination.size() < destination.size()) {
1475 throw ::std::invalid_argument(
1476 "Attempt to copy a region of " + ::std::to_string(source.size()) +
1477 " bytes into a region of size " + ::std::to_string(destination.size()) +
" bytes");
1480 copy(destination.start(), destination_context, source, source_context, stream);
1484 template <
typename T, dimensionality_t NumDimensions>
1488 optional_ref<const stream_t> stream)
1507 size_t size_in_bytes,
1543 size_t size_in_bytes,
1565 auto result = cuMemFreeHost(host_ptr);
1566 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT 1567 if (result == status::success) {
return; }
1569 if (result == status::success or result == status::context_is_destroyed) {
return; }
1571 throw runtime_error(result,
"Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1601 inline void register_(
const void *ptr,
size_t size,
unsigned flags)
1603 auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
1605 "Could not register and page-lock the region of " + ::std::to_string(size) +
1606 " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
1607 " with flags " + cuda::detail_::as_hex(flags));
1612 register_(region.start(), region.size(), flags);
1624 is_mapped_io_space =
true,
1625 is_not_mapped_io_space =
false 1636 do_not_map_into_device_memory =
false 1676 bool register_mapped_io_space,
1677 bool map_into_device_space,
1678 bool make_device_side_accessible_to_all
1679 #
if CUDA_VERSION >= 11010
1680 ,
bool considered_read_only_by_device
1686 (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1687 | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1688 | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1689 #
if CUDA_VERSION >= 11010
1690 | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
1722 bool register_mapped_io_space,
1723 bool map_into_device_space,
1724 bool make_device_side_accessible_to_all
1725 #
if CUDA_VERSION >= 11010
1726 ,
bool considered_read_only_by_device
1733 register_mapped_io_space,
1734 map_into_device_space,
1735 make_device_side_accessible_to_all
1736 #if CUDA_VERSION >= 11010 1737 , considered_read_only_by_device
1738 #endif // CUDA_VERSION >= 11010 1758 unsigned no_flags_set { 0 };
1777 register_(region.start(), region.size());
1789 auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1791 "Could not unregister the memory segment starting at address *a");
/// Sets all bytes in a stretch of host-side memory to a single value.
inline void set(void* start, int byte_value, size_t num_bytes)
{
	// Byte-wise fill, equivalent to ::std::memset — which likewise converts
	// its fill value to unsigned char before writing
	auto* bytes = static_cast<unsigned char*>(start);
	const auto fill_byte = static_cast<unsigned char>(byte_value);
	for (size_t i = 0; i < num_bytes; ++i) {
		bytes[i] = fill_byte;
	}
}
1824 memory::set(region.start(), byte_value, region.size(), nullopt);
1833 inline void zero(
void* start,
size_t num_bytes)
1835 set(
start, 0, num_bytes);
1854 template <
typename T>
1857 zero(ptr,
sizeof(T));
1869 using attribute_t = CUmem_range_attribute;
1870 using advice_t = CUmem_advise;
1872 template <
typename T>
1873 inline T get_scalar_attribute(
const_region_t region, attribute_t attribute)
1875 uint32_t attribute_value { 0 };
1876 auto result = cuMemRangeGetAttribute(
1877 &attribute_value,
sizeof(attribute_value), attribute,
device::address(region.start()), region.size());
1879 "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
1880 return static_cast<T
>(attribute_value);
1888 #if CUDA_VERSION >= 13000 1889 auto result = cuMemAdvise(
address, region.size(), advice, location);
1891 if (location.type != CU_MEM_LOCATION_TYPE_DEVICE) {
1893 "Advising on memory other than on CUDA devices is not supported before CUDA 13.0");
1895 auto result = cuMemAdvise(
address, region.size(), advice, location.id);
1898 + cuda::detail_::ptr_as_hex(region.start()) +
" in " + cuda::memory::detail_::identify(location));
1903 advise(region, advice, pool::detail_::create_mem_location(device_id));
1906 inline advice_t as_advice(attribute_t attribute,
bool set)
1908 switch (attribute) {
1909 case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1910 return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1911 case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1912 return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1913 case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1914 return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1916 throw ::std::invalid_argument(
1917 "CUDA memory range attribute does not correspond to any range advice value");
1923 static constexpr
const bool set {
true };
1924 advise(region, as_advice(settable_attribute,
set), device_id);
1927 inline void set_attribute(
const_region_t region, attribute_t settable_attribute)
1929 static constexpr
const bool set {
true };
1931 advise(region, as_advice(settable_attribute,
set), dummy_device_id);
1934 inline void unset_attribute(
const_region_t region, attribute_t settable_attribute)
1936 static constexpr
const bool unset {
false };
1938 advise(region, as_advice(settable_attribute, unset), dummy_device_id);
1947 template <
typename GenericRegion>
1948 struct region_helper :
public GenericRegion {
1949 using GenericRegion::GenericRegion;
1951 bool is_read_mostly()
const 1953 return range::detail_::get_scalar_attribute<bool>(*
this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1956 void designate_read_mostly()
const 1958 range::detail_::set_attribute(*
this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1961 void undesignate_read_mostly()
const 1963 range::detail_::unset_attribute(*
this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1966 device_t preferred_location()
const;
1967 void set_preferred_location(
device_t& device)
const;
1968 void clear_preferred_location()
const;
1974 using region_t = detail_::region_helper<memory::region_t>;
1985 template <
typename Allocator = ::std::allocator<cuda::device_t> >
1990 global = CU_MEM_ATTACH_GLOBAL,
1991 host = CU_MEM_ATTACH_HOST,
1992 single_stream = CU_MEM_ATTACH_SINGLE,
2002 auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
2003 attachment_t::global : attachment_t::host;
2013 auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
2016 status =
static_cast<status_t>(status::unknown);
2019 + ::std::to_string(num_bytes) +
" bytes of managed CUDA memory");
2028 inline void free(
void* ptr)
2032 throw_if_error_lazy(result,
"Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
2038 free(region.start());
2041 template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
2044 void* operator()(
size_t num_bytes)
const 2046 return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
2059 CAW_SET_SCOPE_CONTEXT(context_handle);
2060 return allocate_in_current_context(num_bytes, initial_visibility);
2117 inline void free(
void* managed_ptr)
2121 "Freeing managed memory (host and device regions) at address " 2122 + cuda::detail_::ptr_as_hex(managed_ptr));
2128 free(region.start());
2139 #if CUDA_VERSION >= 13000 2140 static constexpr
unsigned flags { 0 };
2141 auto result = cuMemPrefetchAsync(
address, region.size(), destination, flags, source_stream_handle);
2143 if (destination.type == CU_MEM_LOCATION_TYPE_HOST) {
2144 destination = { CU_MEM_LOCATION_TYPE_DEVICE, CU_DEVICE_CPU };
2146 if (destination.type != CU_MEM_LOCATION_TYPE_DEVICE) {
2148 "Prefetching to destination types other than CUDA devices is not supported before CUDA 13.0");
2150 auto result = cuMemPrefetchAsync(
address, region.size(), destination.id, source_stream_handle);
2153 "Prefetching " + ::std::to_string(region.size()) +
" bytes of managed memory at address " 2154 + cuda::detail_::ptr_as_hex(region.start()) +
" to " + cuda::memory::detail_::identify(destination));
2163 prefetch(region, pool::detail_::create_mem_location(destination), source_stream_handle);
2197 template <
typename T>
2200 auto unconsted_host_mem_ptr =
const_cast<typename ::std::remove_const<T>::type *
>(host_memory_ptr);
2202 auto get_device_pointer_flags = 0u;
2203 auto status = cuMemHostGetDevicePointer(
2205 unconsted_host_mem_ptr,
2206 get_device_pointer_flags);
2208 "Failed obtaining the device-side pointer for host-memory pointer " 2209 + cuda::detail_::ptr_as_hex(host_memory_ptr) +
" supposedly mapped to device memory");
2242 size_t size_in_bytes,
2247 auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
2248 void* allocated_ptr;
2249 auto status = cuMemHostAlloc(&allocated_ptr, size_in_bytes, flags);
2250 if (
is_success(status) && (allocated_ptr ==
nullptr)) {
2252 status =
static_cast<status_t>(status::named_t::unknown);
2255 "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
2256 +
" bytes of global memory in " + context::detail_::identify(current_context_handle));
2257 allocated.host_side = { allocated_ptr, size_in_bytes };
2264 size_t size_in_bytes,
2267 CAW_SET_SCOPE_CONTEXT(context_handle);
2268 return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
2271 inline void free(
void* host_side_pair)
2273 auto result = cuMemFreeHost(host_side_pair);
2275 + cuda::detail_::ptr_as_hex(host_side_pair));
2291 size_t size_in_bytes,
2304 size_t size_in_bytes,
2328 void* host_side_ptr;
2329 auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER,
memory::device::address(ptr));
2330 throw_if_error_lazy(status,
"Failed obtaining the host-side address of supposedly-device-side pointer " 2331 + cuda::detail_::ptr_as_hex(ptr));
2369 template <
typename T,
typename RawDeleter,
typename RegionAllocator>
2370 unique_span<T> make_convenient_type_unique_span(
size_t size, RegionAllocator allocator)
2372 memory::detail_::check_allocation_type<T>();
2373 auto deleter = [](span<T> sp) {
2374 return RawDeleter{}(sp.data());
2376 region_t allocated_region = allocator(size *
sizeof(T));
2377 return unique_span<T>(
2378 allocated_region.as_span<T>(),
2390 template <
typename T>
2393 auto allocate_in_current_context_ = [](
size_t size) {
return allocate_in_current_context(size); };
2394 CAW_SET_SCOPE_CONTEXT(context_handle);
2395 return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocate_in_current_context_);
2417 template <
typename T>
2425 template <
typename T>
2434 template <
typename T>
2440 template <
typename T>
2443 return device::make_unique_span<T>(context, size);
2447 template <
typename T>
2450 return device::make_unique_span<T>(device, size);
2475 template <
typename T>
2479 auto allocator = [](
size_t size) {
return allocate(size); };
2480 return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
2489 template <
typename T, initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
2494 CAW_SET_SCOPE_CONTEXT(context_handle);
2495 auto allocator = [](
size_t size) {
2496 return allocate_in_current_context(size, InitialVisibility);
2498 return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
2525 template <
typename T>
2536 template <
typename T>
2548 template <
typename T>
2566 template <
typename T>
2571 auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2572 throw_if_error_lazy(api_call_result,
"Could not locate the device memory address for a symbol");
2573 api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
2574 throw_if_error_lazy(api_call_result,
"Could not locate the device memory address for the symbol at address" 2575 + cuda::detail_::ptr_as_hex(start));
2576 return {
start, symbol_size };
2583 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_ void register_(const_region_t region)
Register a memory region with the CUDA driver.
Definition: memory.hpp:1775
void free_region_pair_of(void *ptr)
Free a pair of mapped memory regions using just one of them.
Definition: memory.hpp:2324
Proxy class for a CUDA stream.
Definition: stream.hpp:258
endpoint_t
Type for choosing between endpoints of copy operations.
Definition: copy_parameters.hpp:19
void prefetch_to_host(const_region_t region, const stream_t &stream)
Prefetches a region of managed memory into host memory.
Definition: memory.hpp:244
cpu_write_combining write_combining
whether or not the GPU can batch multiple writes to this area and propagate them at its convenience...
Definition: memory.hpp:98
The cuda::memory::pool_t proxy class for memory pools, and related code for creating, manipulating and allocating using memory pools.
::std::vector< device_t, Allocator > expected_accessors(const_region_t region, const Allocator &allocator=Allocator())
Definition: memory.hpp:215
unique_span< T > make_unique_span(const context_t &context, size_t size)
See device::make_unique_span(const context_t& context, size_t size)
Definition: memory.hpp:2441
Wrapper class for a CUDA context.
Definition: context.hpp:249
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
If the CUDA runtime has not been set to a specific device, this is the ID of the device it defaults to.
Definition: constants.hpp:53
is_not_accessible_on_all_devices
Definition: memory.hpp:1646
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
this_type & clear_offsets() noexcept
Clear the offsets into both the source and the destination endpoint regions.
Definition: copy_parameters.hpp:284
Owning wrapper for CUDA 2D and 3D arrays.
Definition: array.hpp:29
void typed_set(T *start, const T &value, size_t num_elements, optional_ref< const stream_t > stream={})
Sets consecutive elements of a region of memory to a fixed value of some width.
Definition: memory.hpp:391
CUmemLocation location_t
Used in a limited number of API functions which can relate both to CUDA device memory and system memory.
Definition: types.hpp:555
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
::std::size_t size_bytes() const noexcept
Overall size in bytes of the elements of the array, over all dimensions.
Definition: array.hpp:255
portability_across_contexts portability
whether or not the allocated region can be used in different CUDA contexts.
Definition: memory.hpp:95
constexpr span_pair_t< T > as_spans() const
Definition: memory.hpp:167
void advise_expected_access_by(const_region_t region, device_t &device)
Advice the CUDA driver that device is expected to access region.
Definition: memory.hpp:204
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Do nothing...
Definition: error.hpp:346
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
T * get() const
Definition: pointer.hpp:139
cpu_write_combining
A memory allocation setting: Should the allocated memory be configured as write-combined, i.e.
Definition: memory.hpp:84
memory::region_t host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:163
void set(void *start, int byte_value, size_t num_bytes)
Sets all bytes in a stretch of host-side memory to a single value.
Definition: memory.hpp:1813
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
The cuda::memory::copy_parameters_t class template and related definitions.
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
options accepted by CUDA's allocator of memory with a host-side aspect (host-only or managed memory)...
Definition: memory.hpp:93
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
::std::size_t size() const noexcept
Overall number of elements in the array, over all dimensions.
Definition: array.hpp:252
span< T > host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:145
pointer_t other_side_of_region_pair() const
Definition: pointer.hpp:208
A pair of memory regions, one in system (=host) memory and one on a CUDA device's memory - mapped to ...
Definition: memory.hpp:160
Contains a proxy class for CUDA arrays - GPU memory with 2-D or 3-D locality and hardware support for...
map_into_device_memory
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1634
void deregister(const_region_t region)
Have the CUDA driver "forget" about a region of memory which was previously registered with it...
Definition: memory.hpp:1795
accessibility_on_all_devices
Whether the allocated host-side memory should be recognized as pinned memory by all CUDA contexts...
Definition: memory.hpp:1644
A convenience wrapper around a raw pointer "known" to the CUDA runtime and which thus has various kin...
Definition: pointer.hpp:131
memory::region_t locate(T &&symbol)
Locates a CUDA symbol in global or constant device memory.
Definition: memory.hpp:2567
this_type & clear_offset(endpoint_t endpoint) noexcept
Set the copy operation to use the multi-dimensional region of the specified endpoint without skipping...
Definition: copy_parameters.hpp:278
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
region_pair_t allocate(const cuda::device_t &device, size_t size_in_bytes, allocation_options options=allocation_options{})
Allocate a memory region on the host, which is also mapped to a memory region in the global memory of...
Definition: memory.hpp:276
A builder-ish subclass template around the basic 2D or 3D copy parameters which CUDA's complex copyin...
Definition: copy_parameters.hpp:68
Wrappers for getting and setting CUDA's choice of which device is 'current'.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
mapped_io_space
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1623
void free(void *host_ptr)
Frees a region of pinned host memory which was allocated with one of the pinned host memory allocatio...
Definition: memory.hpp:1563
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
A wrapper class for host and/or device pointers, allowing easy access to CUDA's pointer attributes...
void * as_pointer(device::address_t address) noexcept
Definition: types.hpp:702
void set(void *ptr, int byte_value, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets a number of bytes in memory to a fixed value.
Definition: memory.hpp:418
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
A pair of memory spans, one in device-global memory and one in host/system memory, mapped to it.
Definition: memory.hpp:142
const_region_t device_side_region_for(const_region_t region)
Get the memory region mapped to a given host-side region.
Definition: memory.hpp:2224
void free(region_pair_t pair)
Free a pair of mapped memory regions.
Definition: memory.hpp:2313
void advise_no_access_expected_by(const_region_t region, device_t &device)
Advise the CUDA driver that device is not expected to access region.
Definition: memory.hpp:209
this_type & set_endpoint(endpoint_t endpoint, const cuda::array_t< T, NumDimensions > &array) noexcept
Set one of the copy endpoints to a CUDA array.
CUdeviceptr address_t
The numeric type which can represent the range of memory addresses on a CUDA device.
Definition: types.hpp:674
T * device_side_pointer_for(T *host_memory_ptr)
Obtain a pointer in the device-side memory space (= address range) given a host-side pointer ma...
Definition: memory.hpp:2198
portability_across_contexts
A memory allocation setting: Can the allocated memory be used in other CUDA driver contexts (in addit...
Definition: memory.hpp:64
void copy_single(T *destination, const T *source, optional_ref< const stream_t > stream={})
Synchronously copies a single (typed) value between two memory locations.
Definition: memory.hpp:71
is_accessible_on_all_devices
Definition: memory.hpp:1645
void prefetch(const_region_t region, const cuda::device_t &destination, const stream_t &stream)
Prefetches a region of managed memory to a specific device, so it can later be used there without wai...
Definition: memory.hpp:236
region_t allocate(size_t size_in_bytes, allocation_options options)
Allocates pinned host memory.
Definition: memory.hpp:340
Wrapper class for a CUDA device.
Definition: device.hpp:135
void zero(region_t region, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:736
initial_visibility_t
The choices of which categories CUDA devices must a managed memory region be visible to...
Definition: types.hpp:755
constexpr bool is_success(status_t status)
Determine whether the API call returning the specified status had succeeded.
Definition: error.hpp:214
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
bool is_part_of_a_region_pair(const void *ptr)
Determine whether a given stretch of memory was allocated as part of a mapped pair of host and device...
Definition: memory.hpp:2346