cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
Go to the documentation of this file.
1 
25 #pragma once
26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
27 #define CUDA_API_WRAPPERS_MEMORY_HPP_
28 
29 #include "copy_parameters.hpp"
30 #include "array.hpp"
31 #include "constants.hpp"
32 #include "current_device.hpp"
33 #include "error.hpp"
34 #include "pointer.hpp"
35 #include "current_context.hpp"
36 #include "detail/unique_span.hpp"
37 
38 // The following is needed for cudaGetSymbolAddress, cudaGetSymbolSize
39 #include <cuda_runtime.h>
40 
41 #include <memory>
42 #include <cstring> // for ::std::memset
43 #include <vector>
44 #include <utility>
45 
46 #include "memory_pool.hpp"
47 
48 namespace cuda {
49 
51 class device_t;
52 class context_t;
53 class stream_t;
54 class module_t;
56 
57 namespace memory {
58 
/// Whether page-locked host memory allocated with this option is accessible
/// ("portable") from all CUDA contexts, or only the allocating context.
enum class portability_across_contexts : bool {
	isnt_portable = false,
	is_portable = true,
};
68 
/// Whether to allocate pinned host memory as write-combined (fast for the CPU
/// to write and the GPU to read, slow for the CPU to read back).
/// NOTE(review): this is an unscoped enum while the sibling option type above is
/// an `enum class` — presumably intentional for brevity at call sites; confirm.
enum cpu_write_combining : bool {
	without_wc = false,
	with_wc = true,
};
88 
96 
99 };
100 
101 namespace detail_ {
102 
/**
 * Compile-time validation that @tparam T is suitable for raw typed-buffer
 * allocation: trivially constructible, trivially copyable, and — when
 * @tparam CheckConstructibility is set — also trivially destructible.
 *
 * NOTE(review): the flag name says "Constructibility" but it gates the
 * destructibility assertion (as the message text indicates) — confirm intent.
 */
template <typename T, bool CheckConstructibility = false>
inline void check_allocation_type() noexcept
{
	// Fixed typo in the diagnostic: "constructive" -> "constructible"
	static_assert(::std::is_trivially_constructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-constructible type");
	static_assert(not CheckConstructibility or ::std::is_trivially_destructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-destructible type "
		"without allowing for its destruction");
	static_assert(::std::is_trivially_copyable<T>::value,
		"Attempt to create a typed buffer of a non-trivially-copyable type");
}
114 
115 inline unsigned make_cuda_host_alloc_flags(allocation_options options)
116 {
117  return
118  (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
119  (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
120 }
121 
122 } // namespace detail_
123 
131 namespace mapped {
132 
133 // TODO: Perhaps make this an array of size 2 and use aspects to index it?
134 
/// A pair of typed spans over the two sides — host and device — of a single
/// mapped host-and-device allocation.
template <typename T>
struct span_pair_t {
	span<T> host_side, device_side;

	/// Decompose into a plain pair of typed spans
	constexpr operator ::std::pair<span<T>, span<T>>() const { return { host_side, device_side }; }
	/// Decompose into a pair of untyped memory regions (relies on span-to-region
	/// conversion; presumably lossy of the element type — confirm)
	constexpr operator ::std::pair<region_t, region_t>() const { return { host_side, device_side }; }
};
152 
164 
166  template <typename T>
167  constexpr span_pair_t<T> as_spans() const
168  {
169  return { host_side.as_span<T>(), device_side.as_span<T>() };
170  }
171 };
172 
173 } // namespace mapped
174 
176 namespace device {
177 
178 namespace detail_ {
179 
// Allocates global device memory within the current context.
// On CUDA 11.2+, if a stream handle is provided, the allocation is
// stream-ordered (cuMemAllocAsync); otherwise it is a synchronous cuMemAlloc.
// Throws on failure; a success status with a null result is normalized to
// status::unknown so callers never receive a null region.
#if CUDA_VERSION >= 11020
inline cuda::memory::region_t allocate_in_current_context(
	size_t num_bytes, optional<stream::handle_t> stream_handle = {})
#else
inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
#endif
{
#if CUDA_VERSION >= 11020
	if (stream_handle) {
		device::address_t allocated = 0;
		// Note: the typed cudaMalloc also takes its size in bytes, apparently,
		// not in number of elements
		auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
		if (is_success(status) && allocated == 0) {
			// Can this even happen? hopefully not
			status = static_cast<decltype(status)>(status::unknown);
		}
		throw_if_error_lazy(status,
			"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
			" bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
		return {as_pointer(allocated), num_bytes};
	}
#endif
	device::address_t allocated = 0;
	auto status = cuMemAlloc(&allocated, num_bytes);
	if (is_success(status) && allocated == 0) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::unknown);
	}
	throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) +
		" bytes of global memory on the current CUDA device");
	return {as_pointer(allocated), num_bytes};
}
218 
// Allocates device-global memory within the specified context: temporarily
// makes that context current, then delegates to allocate_in_current_context().
#if CUDA_VERSION >= 11020
inline region_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes,
	optional<stream::handle_t> stream_handle = {})
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return allocate_in_current_context(size_in_bytes, stream_handle);
}
#else
inline region_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return allocate_in_current_context(size_in_bytes);
}
#endif
237 
// Schedules a stream-ordered free (cuMemFreeAsync, CUDA 11.2+) of a region
// previously allocated with the stream-ordered allocator.
#if CUDA_VERSION >= 11020
inline void free_on_stream(
	void* allocated_region_start,
	stream::handle_t stream_handle)
{
	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
	throw_if_error_lazy(status,
		"Failed scheduling an asynchronous freeing of the global memory region starting at "
		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
		+ stream::detail_::identify(stream_handle));
}
#endif // CUDA_VERSION >= 11020
250 
// Synchronously frees device-global memory. The context handle parameter is
// used only for error reporting — the actual cuMemFree acts on the pointer.
// Unless CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT is defined, freeing memory
// whose context has already been destroyed is silently tolerated.
inline void free_in_current_context(
	context::handle_t current_context_handle,
	void* allocated_region_start)
{
	auto result = cuMemFree(address(allocated_region_start));
	if (result == status::success) { return; }
#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
	if (result == status::context_is_destroyed) { return; }
#endif
	throw runtime_error(result, "Freeing device memory at "
		+ cuda::detail_::ptr_as_hex(allocated_region_start)
		+ " in " + context::detail_::identify(current_context_handle));
}
264 
265 } // namespace detail_
266 
268 #if CUDA_VERSION >= 11020
269 inline void free(void* region_start, optional_ref<const stream_t> stream = {});
270 #else
271 inline void free(void* ptr);
272 #endif
273 
// Region-based convenience overload of device::free(); on CUDA 11.2+ the free
// may optionally be stream-ordered.
#if CUDA_VERSION >= 11020
inline void free(region_t region, optional_ref<const stream_t> stream = {})
{
	free(region.start(), stream);
}
#else
inline void free(region_t region)
{
	free(region.start());
}
#endif
287 
288 #if CUDA_VERSION >= 11020
289 
302 region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
303 #endif
304 
318 inline region_t allocate(const context_t& context, size_t size_in_bytes);
319 
333 inline region_t allocate(const device_t& device, size_t size_in_bytes);
334 
335 namespace detail_ {
336 
337 // Note: Allocates _in the current context_! No current context => failure!
// Note: Allocates _in the current context_! No current context => failure!
struct allocator {
	// Returns the raw start address of a freshly-allocated device region
	void* operator()(size_t num_bytes) const {
		return detail_::allocate_in_current_context(num_bytes).start();
	}
};
343 
// Deleter matching the allocator above; usable e.g. with unique-pointer-like types
struct deleter {
	void operator()(void* ptr) const { cuda::memory::device::free(ptr); }
};
347 
348 } // namespace detail_
349 
362 template <typename T>
363 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
364 
387 inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
388 {
389  return typed_set<unsigned char>(
390  static_cast<unsigned char*>(start),
391  static_cast<unsigned char>(byte_value),
392  num_bytes,
393  stream);
394 }
395 
405 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
406 {
407  set(region.start(), byte_value, region.size(), stream);
408 }
409 
418 inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
419 {
420  set(start, 0, num_bytes, stream);
421 }
422 
430 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
431 {
432  zero(region.start(), region.size(), stream);
433 }
434 
442 template <typename T>
443 inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
444 {
445  zero(ptr, sizeof(T), stream);
446 }
447 
448 } // namespace device
449 
451 namespace detail_ {
452 
454 
// Schedules an async copy of num_bytes on the given stream; the driver infers
// the direction (H2D/D2H/D2D) from the unified-virtual-address-space pointers.
inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
{
	auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);

	// TODO: Determine whether it was from host to device, device to host etc and
	// add this information to the error string
	throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
}
474 
// Region-based variant: copies source.size() bytes, with a debug-only check
// that the destination region is large enough.
inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::logic_error("Source size exceeds destination size");
	}
#endif
	copy(destination.start(), source.start(), source.size(), stream_handle);
}
492 
494 
// 2D copy within the current context, using the tag-dispatch idiom on the
// dimensionality. Returns the raw driver status rather than throwing.
inline status_t multidim_copy_in_current_context(
	::std::integral_constant<dimensionality_t, 2>,
	copy_parameters_t<2> params,
	optional<stream::handle_t> stream_handle)
{
	// Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters
	// structure holds no information about contexts.
	//
	// Note: The stream handle, even if present, might be the null handle; for now
	// we distinguish between using the null stream handle - the default stream's -
	// and using the synchronous API
	return stream_handle ?
		cuMemcpy2DAsync(&params, *stream_handle) :
		cuMemcpy2D(&params);
}
510 
// 3D copy within the current context. Intra-context copies reinterpret the
// parameters as the (layout-compatible) intra-context structure; inter-context
// copies go through the peer-copy driver calls. Returns raw driver status.
inline status_t multidim_copy_in_current_context(
	::std::integral_constant<dimensionality_t, 3>,
	copy_parameters_t<3> params,
	optional<stream::handle_t> stream_handle)
{
	if (params.srcContext == params.dstContext) {
		// TODO: Should we check it's also the current context?
		using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
		auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
		return stream_handle ?
			cuMemcpy3DAsync(intra_context_params, *stream_handle) :
			cuMemcpy3D(intra_context_params);
	}
	return stream_handle ?
		cuMemcpy3DPeerAsync(&params, *stream_handle) :
		cuMemcpy3DPeer(&params);
}
528 
// Tag-dispatches to the 2D or 3D implementation above based on NumDimensions.
template<dimensionality_t NumDimensions>
status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, optional<stream::handle_t> stream_handle) {
	return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
533 
534 // Note: Assumes the stream handle is for a stream in the current context
535 template<dimensionality_t NumDimensions>
536 status_t multidim_copy(
537  context::handle_t context_handle,
539  optional<stream::handle_t> stream_handle)
540 {
541  CAW_SET_SCOPE_CONTEXT(context_handle);
542  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
543 }
544 
545 // Assumes the array and the stream share the same context, and that the destination is
546 // accessible from that context (e.g. allocated within it, or being managed memory, etc.)
// Assumes the array and the stream share the same context, and that the destination is
// accessible from that context (e.g. allocated within it, or being managed memory, etc.)
template <typename T, dimensionality_t NumDimensions>
void copy(T *destination, const array_t<T, NumDimensions>& source, optional<stream::handle_t> stream_handle)
{
	using memory::endpoint_t;
	auto dims = source.dimensions();
	//auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
	// Build a full copy-parameters structure: array source, raw-pointer
	// destination with the same extents, default pitches.
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source);
	params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
	params.set_default_pitches();
	params.clear_rest();
	auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
	throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
}
564 
565 
// Mirror of the above: copies from a raw-pointer source into a CUDA array,
// using the destination array's dimensions for the extents.
template <typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, const T* source, optional<stream::handle_t> stream_handle)
{
	using memory::endpoint_t;
	auto dims = destination.dimensions();
	//auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
	auto params = copy_parameters_t<NumDimensions>{};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
	params.set_endpoint(endpoint_t::destination, destination);
	params.set_default_pitches();
	params.clear_rest();
	auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
	throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
}
583 
// Copies a single value of type T; asynchronous when a stream handle is given.
template <typename T>
void copy_single(T* destination, const T* source, optional<stream::handle_t> stream_handle)
{
	copy(destination, source, sizeof(T), stream_handle);
}
603 
604 } // namespace detail_
605 
616 
// Copies all N elements of a C array into a typed span, with a debug-only
// capacity check (span sizes are in elements here, matching N).
template <typename T, size_t N>
inline void copy(span<T> destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < N) {
		throw ::std::logic_error("Source size exceeds destination size");
	}
#endif
	return copy(destination.data(), source, sizeof(T) * N, stream);
}
636 
646 template <typename T, size_t N>
647 void copy(c_array<T,N>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
648 {
649 #ifndef NDEBUG
650  if (source.size() > N) {
651  throw ::std::invalid_argument(
652  "Attempt to copy a span of " + ::std::to_string(source.size()) +
653  " elements into an array of " + ::std::to_string(N) + " elements");
654  }
655 #endif
656  return copy(destination, source.start(), sizeof(T) * N, stream);
657 }
658 
// Copies all N elements of a C array to an arbitrary (presumably
// device-accessible) destination address; no size check is possible here.
template <typename T, size_t N>
inline void copy(void* destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
	return copy(destination, source, sizeof(T) * N, stream);
}
673 
// Copies N elements from a raw pointer into a C array.
// NOTE(review): `source` is only read from — it could likely be `const T*`;
// confirm no caller relies on the non-const overload resolution.
template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, T* source, optional_ref<const stream_t> stream = {})
{
	return copy(destination, source, sizeof(T) * N, stream);
}
696 
698 
711 void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {});
712 
724 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
725 {
726  return set(region.start(), byte_value, region.size(), stream);
727 }
728 
736 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
737 {
738  return set(region, 0, stream);
739 }
740 
749 inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
750 {
751  return set(ptr, 0, num_bytes, stream);
752 }
753 
761 template <typename T>
762 inline void zero(T* ptr)
763 {
764  zero(ptr, sizeof(T));
765 }
766 
767 namespace detail_ {
768 
// 2D copy that works even with no current context: falls back to pushing the
// current device's primary context for the duration of the call.
inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2> two, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
{
	// TODO: Move this logic into the scoped ensurer class
	auto context_handle = context::current::detail_::get_handle();
	if (context_handle != context::detail_::none) {
		return detail_::multidim_copy_in_current_context(two, params, stream_handle);
	}
	auto current_device_id = cuda::device::current::detail_::get_id();
	context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
	context::current::detail_::push(context_handle);
	// Note this _must_ be an intra-context copy, as inter-context is not supported
	// and there's no indication of context in the relevant data structures
	auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle);
	context::current::detail_::pop();
	cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
	return status;
}
786 
// 2D copy performed with the given context temporarily made current.
inline status_t multidim_copy(context::handle_t context_handle, ::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
{
	context::current::detail_::scoped_override_t context_for_this_scope(context_handle);
	return multidim_copy(::std::integral_constant<dimensionality_t, 2>{}, params, stream_handle);
}
792 
// 3D copy: intra-context copies ensure the source/destination context is
// current; inter-context copies use the driver's peer-copy calls, which carry
// their own context information.
inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params, optional<stream::handle_t> stream_handle)
{
	if (params.srcContext == params.dstContext) {
		context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
		return detail_::multidim_copy_in_current_context(params, stream_handle);
	}
	return stream_handle ?
		cuMemcpy3DPeerAsync(&params, *stream_handle) :
		cuMemcpy3DPeer(&params);
}
803 
// Tag-dispatch entry point for the above overloads.
// NOTE(review): takes a bare stream::handle_t while the in-current-context
// dispatcher takes an optional<> — confirm whether this asymmetry is intended.
template<dimensionality_t NumDimensions>
status_t multidim_copy(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle)
{
	return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
809 
810 
811 } // namespace detail_
812 
823 template<dimensionality_t NumDimensions>
824 void copy(copy_parameters_t<NumDimensions> params, optional_ref<const stream_t> stream = {});
825 
// Copies from a raw pointer in a specified context into a CUDA array, using
// the destination array's dimensions for the extents.
template<typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, const context_t& source_context, const T *source, optional_ref<const stream_t> stream = {})
{
	auto dims = destination.dimensions();
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offsets();
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
	params.set_endpoint(endpoint_t::destination, destination);
	// NOTE(review): sibling overloads also call set_default_pitches() before
	// clear_rest() — confirm whether its omission here is intentional.
	params.clear_rest();
	copy(params, stream);
}
850 
869 template <typename T, dimensionality_t NumDimensions>
870 void copy(array_t<T, NumDimensions>& destination, const T* source, optional_ref<const stream_t> stream = {});
871 
// Copies a span's elements into a CUDA array, with a debug-only check that the
// array has room for them (both sizes in elements).
template<typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a span of " + ::std::to_string(source.size()) +
			" elements into a CUDA array of " + ::std::to_string(destination.size()) + " elements");
	}
#endif
	copy(destination, source.data(), stream);
}
892 
// Copies a CUDA array into a raw-pointer destination residing in (or
// accessible from) the specified context.
template <typename T, dimensionality_t NumDimensions>
void copy(const context_t& context, T *destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
{
	auto dims = source.dimensions();
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source);
	params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
	params.set_default_pitches();
	params.clear_rest();
	copy(params, stream);
}
917 
936 template <typename T, dimensionality_t NumDimensions>
937 void copy(T* destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {});
938 
939 
// Copies a CUDA array into a typed span, with a debug-only capacity check
// (both sizes in elements).
template <typename T, dimensionality_t NumDimensions>
void copy(span<T> destination, const array_t<T, NumDimensions>& source, optional_ref <const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a CUDA array of " + ::std::to_string(source.size()) +
			" elements into a span of " + ::std::to_string(destination.size()) + " elements");
	}
#endif
	copy(destination.data(), source, stream);
}
959 
967 template <typename T, dimensionality_t NumDimensions>
968 void copy(const array_t<T, NumDimensions>& destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream)
969 {
970  auto dims = source.dimensions();
971  auto params = copy_parameters_t<NumDimensions> {};
972  params.clear_offset(endpoint_t::source);
973  params.clear_offset(endpoint_t::destination);
974  params.template set_extent<T>(dims);
975  params.set_endpoint(endpoint_t::source, source);
976  params.set_endpoint(endpoint_t::destination, destination);
977  params.set_default_pitches();
978  params.clear_rest();
979  auto status = //(source.context() == destination.context()) ?
980  detail_::multidim_copy<NumDimensions>(source.context_handle(), params, stream);
981  throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region");
982 }
983 
// Copies a CUDA array into an untyped memory region, with a debug-only check
// comparing byte counts on both sides.
template <typename T, dimensionality_t NumDimensions>
void copy(region_t destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size_bytes()) {
		throw ::std::invalid_argument(
			"Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a "
			"region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
	}
#endif
	copy(destination.start(), source, stream);
}
1013 
// Copies an untyped memory region into a CUDA array, with a debug-only check
// comparing byte counts on both sides.
template <typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size_bytes() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a region of " + ::std::to_string(source.size()) +
			" bytes into an array of size " + ::std::to_string(destination.size_bytes()) + " bytes");
	}
#endif
	copy(destination, static_cast<T const*>(source.start()), stream);
}
1039 
1056 template <typename T>
1057 void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream = {});
1058 
1077 void copy(void* destination, void const* source, size_t num_bytes, optional_ref<const stream_t> stream = {});
1078 
1079 
// Copies an untyped region into a C array; always copies the array's full
// N * sizeof(T) bytes, so the debug check demands an exact size match
// (strict equality appears intentional — confirm).
template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	size_t required_size = N * sizeof(T);
	if (source.size() != required_size) {
		throw ::std::invalid_argument(
			"Attempt to copy a region of " + ::std::to_string(source.size()) +
			" bytes into an array of size " + ::std::to_string(required_size) + " bytes");
	}
#endif
	return copy(&(destination[0]), source.start(), sizeof(T) * N, stream);
}
1112 
1136 template <typename T, size_t N>
1137 inline void copy(region_t destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
1138 {
1139 #ifndef NDEBUG
1140  if (destination.size() < N) {
1141  throw ::std::logic_error("Source size exceeds destination size");
1142  }
1143 #endif
1144  return copy(destination.start(), source, sizeof(T) * N, stream);
1145 }
1146 
1147 
// Copies num_bytes bytes between two untyped regions, with a debug-only check
// against the destination's capacity (the source's size is checked by a
// sibling overload when relevant).
inline void copy(region_t destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < num_bytes) {
		throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
	}
#endif
	copy(destination.start(), source.start(), num_bytes, stream);
}
1168 
1169 
1186 inline void copy(region_t destination, const_region_t source, optional_ref<const stream_t> stream = {})
1187 {
1188  copy(destination, source, source.size(), stream);
1189 }
1190 
1191 
// Fills the destination region from an unsized source pointer; the copy length
// is taken from the destination. NOTE(review): `source` is only read — it could
// likely be `const void*`; confirm overload-resolution impact before changing.
inline void copy(region_t destination, void* source, optional_ref<const stream_t> stream = {})
{
	return copy(destination.start(), source, destination.size(), stream);
}
1213 
// Copies num_bytes from an unsized source pointer into a region, with a
// debug-only destination-capacity check.
inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < num_bytes) {
		throw ::std::logic_error("Number of bytes to copy exceeds destination size");
	}
#endif
	return copy(destination.start(), source, num_bytes, stream);
}
1240 
// Copies num_bytes from a sized source region to an unsized destination
// pointer, with a debug-only source-extent check.
inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (source.size() < num_bytes) {
		throw ::std::logic_error("Attempt to copy more than the source region's size");
	}
#endif
	copy(destination, source.start(), num_bytes, stream);
}
1269 
1284 inline void copy(void* destination, const_region_t source, optional_ref<const stream_t> stream = {})
1285 {
1286  copy(destination, source, source.size(), stream);
1287 }
1288 
1289 namespace device {
1290 
1291 namespace detail_ {
1292 
// Schedules a byte-wise memset of device memory on the given stream.
inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
{
	// TODO: Double-check that this call doesn't require setting the current device
	auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
	throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer");
}
1299 
1300 
1301 inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
1302 {
1303  set(region.start(), byte_value, region.size(), stream_handle);
1304 }
1305 
1306 inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
1307 {
1308  set(start, 0, num_bytes, stream_handle);
1309 }
1310 
1311 inline void zero(region_t region, stream::handle_t stream_handle)
1312 {
1313  zero(region.start(), region.size(), stream_handle);
1314 }
1315 
1316 // TODO: Drop this in favor of <algorithm>-like functions under `cuda::`.
1317 template <typename T>
1318 inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
1319 {
1320  static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
1321  static_assert(
1322  sizeof(T) == 1 or sizeof(T) == 2 or
1323  sizeof(T) == 4 or sizeof(T) == 8,
1324  "Unsupported type size - only sizes 1, 2 and 4 are supported");
1325  // TODO: Consider checking for alignment when compiling without NDEBUG
1326  status_t result = static_cast<status_t>(cuda::status::success);
1327  switch(sizeof(T)) {
1328  case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
1329  case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
1330  case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
1331  }
1332  throw_if_error_lazy(result, "Setting global device memory bytes");
1333 }
1334 
1335 } // namespace detail_
1336 
1337 
1349 template <typename T>
1350 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
1351 
1360 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
1361 
1362 } // namespace device
1363 
1364 namespace inter_context {
1365 
1366 void copy(
1367  void * destination,
1368  const context_t& destination_context,
1369  const void * source_address,
1370  const context_t& source_context,
1371  size_t num_bytes,
1372  optional_ref<const stream_t> stream);
1373 
1374 /*
1375 inline void copy(
1376  region_t destination,
1377  const context_t& destination_context,
1378  const_region_t source,
1379  const context_t& source_context,
1380  optional_ref<const stream_t> stream)
1381 {
1382 #ifndef NDEBUG
1383  if (destination.size() < source.size()) {
1384  throw ::std::invalid_argument(
1385  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1386  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1387  }
1388 #endif
1389  copy(destination.start(), destination_context, source, source_context, stream);
1390 }
1391 */
1392 
1393 
1394 /*
1395 
1396 template <typename T, dimensionality_t NumDimensions>
1397 inline void copy(
1398  array_t<T, NumDimensions> destination,
1399  array_t<T, NumDimensions> source,
1400  optional_ref<const stream_t> stream)
1401 {
1402  // for arrays, a single mechanism handles both intra- and inter-context copying
1403  return memory::copy(destination, source, stream);
1404 }
1405 */
1406 
1407 namespace detail_ {
1408 
1432 } // namespace detail_
1433 
1435 void copy(
1436  void * destination_address,
1437  const context_t& destination_context,
1438  const void * source_address,
1439  const context_t& source_context,
1440  size_t num_bytes,
1441  optional_ref<const stream_t> stream);
1442 
1444 inline void copy(
1445  void * destination,
1446  const context_t& destination_context,
1447  const_region_t source,
1448  const context_t& source_context,
1449  optional_ref<const stream_t> stream)
1450 {
1451  copy(destination, destination_context, source.start(), source_context, source.size(), stream);
1452 }
1453 
1455 inline void copy(
1456  region_t destination,
1457  const context_t& destination_context,
1458  const void* source,
1459  const context_t& source_context,
1460  optional_ref<const stream_t> stream)
1461 {
1462  copy(destination.start(), destination_context, source, source_context, destination.size(), stream);
1463 }
1464 
1466 inline void copy(
1467  region_t destination,
1468  const context_t& destination_context,
1469  const_region_t source,
1470  const context_t& source_context,
1471  optional_ref<const stream_t> stream)
1472 {
1473 #ifndef NDEBUG
1474  if (destination.size() < destination.size()) {
1475  throw ::std::invalid_argument(
1476  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1477  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1478  }
1479 #endif
1480  copy(destination.start(), destination_context, source, source_context, stream);
1481 }
1482 
1484 template <typename T, dimensionality_t NumDimensions>
1485 inline void copy(
1486  array_t<T, NumDimensions> destination,
1488  optional_ref<const stream_t> stream)
1489 {
1490  // for arrays, a single mechanism handles both intra- and inter-context copying
1491  return memory::copy(destination, source, stream);
1492 }
1493 
1494 } // namespace inter_context
1495 
1498 namespace host {
1499 
1500 namespace detail_ {
1501 
1502 // Even though the pinned memory should not in principle be associated in principle with a context or a device, in
1503 // practice it needs to be registered somewhere - and that somewhere is a context. Passing a context does not mean
1504 // the allocation will have special affinity to the device terms of better performance etc.
1505 inline region_t allocate(
1506  context::handle_t context_handle,
1507  size_t size_in_bytes,
1508  allocation_options options);
1509 
1510 } // namespace detail_
1511 
1529 region_t allocate(size_t size_in_bytes, allocation_options options);
1530 
1543  size_t size_in_bytes,
1545  cpu_write_combining cpu_wc = cpu_write_combining(false))
1546 {
1547  return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } );
1548 }
1549 
1551 inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc)
1552 {
1553  return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} );
1554 }
1555 
1563 inline void free(void* host_ptr)
1564 {
1565  auto result = cuMemFreeHost(host_ptr);
1566 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
1567  if (result == status::success) { return; }
1568 #else
1569  if (result == status::success or result == status::context_is_destroyed) { return; }
1570 #endif
1571  throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1572 }
1573 
1579 inline void free(region_t region) { return free(region.data()); }
1580 
1581 namespace detail_ {
1582 
// Functor which allocates pinned host memory via host::allocate() and yields
// the raw pointer (e.g. for use as a generic allocation policy)
struct allocator {
	void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes).data(); }
};
// Functor which releases pinned host memory previously obtained via host::allocate()
struct deleter {
	void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
};
1589 
// Raw wrapper for cuMemHostRegister: page-locks the host-memory stretch
// [ptr, ptr+size) with the given raw driver flags; throws on failure
inline void register_(const void *ptr, size_t size, unsigned flags)
{
	// const_cast: the driver call takes a non-const pointer even though the
	// memory contents are not modified by registration
	auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
	throw_if_error_lazy(result,
		"Could not register and page-lock the region of " + ::std::to_string(size) +
		" bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
		" with flags " + cuda::detail_::as_hex(flags));
}
1609 
// Register (page-lock) an entire host-memory region with the given raw driver flags
inline void register_(const_region_t region, unsigned flags)
{
	register_(region.start(), region.size(), flags);
}
1614 
1615 } // namespace detail_
1616 
1623 enum mapped_io_space : bool {
1624  is_mapped_io_space = true,
1625  is_not_mapped_io_space = false
1626 };
1627 
1635  map_into_device_memory = true,
1636  do_not_map_into_device_memory = false
1637 };
1638 
1647 };
1648 
1675 inline void register_(const void *ptr, size_t size,
1676  bool register_mapped_io_space,
1677  bool map_into_device_space,
1678  bool make_device_side_accessible_to_all
1679 #if CUDA_VERSION >= 11010
1680  , bool considered_read_only_by_device
1681 #endif // CUDA_VERSION >= 11010
1682  )
1683 {
1685  ptr, size,
1686  (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1687  | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1688  | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1689 #if CUDA_VERSION >= 11010
1690  | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
1691 #endif // CUDA_VERSION >= 11010
1692  );
1693 }
1694 
/// Register (page-lock) a pre-allocated host-memory region with the CUDA
/// driver; see the pointer-and-size overload for the meaning of each flag.
inline void register_(
	const_region_t region,
	bool register_mapped_io_space,
	bool map_into_device_space,
	bool make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
	, bool considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
	)
{
	register_(
		region.start(),
		region.size(),
		register_mapped_io_space,
		map_into_device_space,
		make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
		, considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
	);
}
1741 
1756 inline void register_(void const *ptr, size_t size)
1757 {
1758  unsigned no_flags_set { 0 };
1759  detail_::register_(ptr, size, no_flags_set);
1760 }
1761 
/// Register (page-lock) a host-memory region with the CUDA driver, with no
/// special registration flags
inline void register_(const_region_t region)
{
	register_(region.start(), region.size());
}
1779 
1787 inline void deregister(const void *ptr)
1788 {
1789  auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1790  throw_if_error_lazy(result,
1791  "Could not unregister the memory segment starting at address *a");
1792 }
1793 
/// Have the CUDA driver "forget" about a previously-registered memory region
inline void deregister(const_region_t region)
{
	deregister(region.start());
}
1799 
1806 
/// Sets all bytes in a stretch of host-side memory to a single value
/// (thin wrapper over ::std::memset)
inline void set(void* start, int byte_value, size_t num_bytes)
{
	::std::memset(start, byte_value, num_bytes);
	// TODO: Error handling?
}

/// Sets all bytes of a host-side memory region to a single value
inline void set(region_t region, int byte_value)
{
	// NOTE(review): this delegates to the generic memory::set (with no stream)
	// rather than to host::set above - presumably to dispatch on the region's
	// actual memory type; confirm this is intentional
	memory::set(region.start(), byte_value, region.size(), nullopt);
}
1826 
/// Zero-out a stretch of host-side memory
inline void zero(void* start, size_t num_bytes)
{
	set(start, 0, num_bytes);
}

/// Zero-out a host-side memory region
inline void zero(region_t region)
{
	host::set(region, 0);
}

/// Zero-out a single (host-side) object of type T, given a pointer to it
template <typename T>
inline void zero(T* ptr)
{
	zero(ptr, sizeof(T));
}
1859 
1860 
1861 } // namespace host
1862 
1863 namespace managed {
1864 
1865 namespace range {
1866 
1867 namespace detail_ {
1868 
1869 using attribute_t = CUmem_range_attribute;
1870 using advice_t = CUmem_advise;
1871 
// Retrieve a single scalar attribute of a managed memory range; the driver
// writes a 4-byte value, which is then cast to the requested type T
template <typename T>
inline T get_scalar_attribute(const_region_t region, attribute_t attribute)
{
	uint32_t attribute_value { 0 };
	auto result = cuMemRangeGetAttribute(
		&attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size());
	throw_if_error_lazy(result,
		"Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
	return static_cast<T>(attribute_value);
}
1882 
// CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's
// not called cuMemRangeSetAttribute, and uses a different enum.
//
// Before CUDA 13.0, cuMemAdvise only accepted a device ID rather than a full
// location structure, so only device-type locations can be targeted there.
inline void advise(const_region_t region, advice_t advice, location_t location)
{
	auto address = device::address(region.start());
#if CUDA_VERSION >= 13000
	auto result = cuMemAdvise(address, region.size(), advice, location);
#else
	if (location.type != CU_MEM_LOCATION_TYPE_DEVICE) {
		throw runtime_error(status::named_t::not_supported,
			"Advising on memory other than on CUDA devices is not supported before CUDA 13.0");
	}
	auto result = cuMemAdvise(address, region.size(), advice, location.id);
#endif
	throw_if_error_lazy(result, "Setting an attribute for a managed memory range at "
		+ cuda::detail_::ptr_as_hex(region.start()) + " in " + cuda::memory::detail_::identify(location));
}
1900 
// Convenience overload: advise with the target given as a numeric device ID
inline void advise(const_region_t region, advice_t advice, cuda::device::id_t device_id)
{
	advise(region, advice, pool::detail_::create_mem_location(device_id));
}
1905 
1906 inline advice_t as_advice(attribute_t attribute, bool set)
1907 {
1908  switch (attribute) {
1909  case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1910  return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1911  case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1912  return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1913  case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1914  return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1915  default:
1916  throw ::std::invalid_argument(
1917  "CUDA memory range attribute does not correspond to any range advice value");
1918  }
1919 }
1920 
// Turn a settable range attribute "on" for a region, with respect to the given device
inline void set_attribute(const_region_t region, attribute_t settable_attribute, cuda::device::id_t device_id)
{
	static constexpr const bool set { true };
	advise(region, as_advice(settable_attribute, set), device_id);
}

// Turn a settable range attribute "on" for a region; a dummy device ID is
// passed - presumably irrelevant for the advice values used this way (confirm)
inline void set_attribute(const_region_t region, attribute_t settable_attribute)
{
	static constexpr const bool set { true };
	static constexpr const cuda::device::id_t dummy_device_id { 0 };
	advise(region, as_advice(settable_attribute, set), dummy_device_id);
}

// Turn a settable range attribute "off" for a region; see set_attribute
inline void unset_attribute(const_region_t region, attribute_t settable_attribute)
{
	static constexpr const bool unset { false };
	static constexpr const cuda::device::id_t dummy_device_id { 0 };
	advise(region, as_advice(settable_attribute, unset), dummy_device_id);
}
1940 
1941 } // namespace detail_
1942 
1943 } // namespace range
1944 
1945 namespace detail_ {
1946 
// Mixin adding managed-memory-specific queries and controls to a generic
// region type (instantiated below with memory::region_t / const_region_t)
template <typename GenericRegion>
struct region_helper : public GenericRegion {
	using GenericRegion::GenericRegion;

	/// Is the read-mostly attribute currently set for this range?
	bool is_read_mostly() const
	{
		return range::detail_::get_scalar_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	/// Set the read-mostly attribute for this range
	void designate_read_mostly() const
	{
		range::detail_::set_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	/// Clear the read-mostly attribute for this range
	void undesignate_read_mostly() const
	{
		range::detail_::unset_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	// Defined out-of-line, as they require the full device_t definition
	device_t preferred_location() const;
	void set_preferred_location(device_t& device) const;
	void clear_preferred_location() const;
};
1970 
1971 } // namespace detail_
1972 
1974 using region_t = detail_::region_helper<memory::region_t>;
1976 using const_region_t = detail_::region_helper<memory::const_region_t>;
1977 
1979 void advise_expected_access_by(const_region_t region, device_t& device);
1980 
1983 
1985 template <typename Allocator = ::std::allocator<cuda::device_t> >
1986 ::std::vector<device_t, Allocator> expected_accessors(const_region_t region, const Allocator& allocator = Allocator() );
1987 
1989 enum class attachment_t : unsigned {
1990  global = CU_MEM_ATTACH_GLOBAL,
1991  host = CU_MEM_ATTACH_HOST,
1992  single_stream = CU_MEM_ATTACH_SINGLE,
1993  };
1994 
1995 namespace detail_ {
1996 
// Allocate a managed-memory region in the current context, attached either
// globally (visible to all devices) or to the host only, per initial_visibility.
// Throws on allocation failure; the returned region owns num_bytes bytes.
inline managed::region_t allocate_in_current_context(
	size_t num_bytes,
	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
{
	device::address_t allocated = 0;
	auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
		attachment_t::global : attachment_t::host;
	// This is necessary because managed allocation requires at least one (primary)
	// context to have been constructed. We could theoretically check what our current
	// context is etc., but that would be brittle, since someone can managed-allocate,
	// then change contexts, then de-allocate, and we can't be certain that whoever
	// called us will call free
	cuda::device::primary_context::detail_::increase_refcount(cuda::device::default_device_id);

	// Note: Despite the templating by T, the size is still in bytes,
	// not in number of T's
	auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
	if (is_success(status) && allocated == 0) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::unknown);
	}
	throw_if_error_lazy(status, "Failed allocating "
		+ ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
	return {as_pointer(allocated), num_bytes};
}
2022 
// Free managed memory allocated via allocate_in_current_context(), also
// releasing the primary-context reference taken at allocation time
inline void free(void* ptr)
{
	auto result = cuMemFree(device::address(ptr));
	// Balances the increase_refcount() in allocate_in_current_context()
	cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id);
	throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
}

// Region-based variant of the above
inline void free(managed::region_t region)
{
	free(region.start());
}
2040 
// Allocation functor for managed memory with a fixed initial visibility
template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
struct allocator {
	// Allocates in the current context!
	void* operator()(size_t num_bytes) const
	{
		return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
	}
};

// Deletion functor matching the managed detail_::free()
struct deleter {
	void operator()(void* ptr) const { detail_::free(ptr); }
};
2053 
2055  context::handle_t context_handle,
2056  size_t num_bytes,
2057  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
2058 {
2059  CAW_SET_SCOPE_CONTEXT(context_handle);
2060  return allocate_in_current_context(num_bytes, initial_visibility);
2061 }
2062 
2063 } // namespace detail_
2064 
2078 inline region_t allocate(
2079  const context_t& context,
2080  size_t num_bytes,
2081  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2082 
2096 inline region_t allocate(
2097  const device_t& device,
2098  size_t num_bytes,
2099  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2100 
2110 region_t allocate(size_t num_bytes);
2111 
// NOTE(review): unlike detail_::free(), this public overload does not
// decrease the primary-context refcount - presumably it is intended for
// memory not allocated via the refcount-taking detail_ path; confirm.
inline void free(void* managed_ptr)
{
	auto result = cuMemFree(device::address(managed_ptr));
	throw_if_error_lazy(result,
		"Freeing managed memory (host and device regions) at address "
		+ cuda::detail_::ptr_as_hex(managed_ptr));
}

// Region-based variant of the above
inline void free(region_t region)
{
	free(region.start());
}
2130 
2131 namespace detail_ {
2132 
// Enqueue an asynchronous prefetch of a managed-memory region to the given
// location, on the stream designated by the raw handle. Before CUDA 13.0, the
// driver call only takes a device ID, so a host location is mapped onto the
// special CU_DEVICE_CPU pseudo-device, and other non-device locations are rejected.
inline void prefetch(
	const_region_t region,
	cuda::memory::location_t destination,
	stream::handle_t source_stream_handle)
{
	auto address = device::address(region.start());
#if CUDA_VERSION >= 13000
	static constexpr unsigned flags { 0 };
	auto result = cuMemPrefetchAsync(address, region.size(), destination, flags, source_stream_handle);
#else
	if (destination.type == CU_MEM_LOCATION_TYPE_HOST) {
		destination = { CU_MEM_LOCATION_TYPE_DEVICE, CU_DEVICE_CPU };
	}
	if (destination.type != CU_MEM_LOCATION_TYPE_DEVICE) {
		throw runtime_error(status::named_t::not_supported,
			"Prefetching to destination types other than CUDA devices is not supported before CUDA 13.0");
	}
	auto result = cuMemPrefetchAsync(address, region.size(), destination.id, source_stream_handle);
#endif
	throw_if_error_lazy(result,
		"Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
		+ cuda::detail_::ptr_as_hex(region.start()) + " to " + cuda::memory::detail_::identify(destination));
}
2156 
2157 
// Convenience overload: prefetch with the destination given as a numeric device ID
inline void prefetch(
	const_region_t region,
	cuda::device::id_t destination,
	stream::handle_t source_stream_handle)
{
	prefetch(region, pool::detail_::create_mem_location(destination), source_stream_handle);
}
2165 
2166 } // namespace detail_
2167 
2173 void prefetch(
2174  const_region_t region,
2175  const cuda::device_t& destination,
2176  const stream_t& stream);
2177 
2182 void prefetch_to_host(
2183  const_region_t region,
2184  const stream_t& stream);
2185 
2186 } // namespace managed
2187 
2188 namespace mapped {
2189 
// Given a host-side pointer into memory mapped into the device address space,
// obtain the corresponding device-side pointer; throws if the driver cannot
// produce one (e.g. the memory was not mapped)
template <typename T>
inline T* device_side_pointer_for(T* host_memory_ptr)
{
	// The driver call takes a non-const pointer, so constness is stripped;
	// the pointed-to memory is not written through this pointer here
	auto unconsted_host_mem_ptr = const_cast<typename ::std::remove_const<T>::type *>(host_memory_ptr);
	device::address_t device_side_ptr;
	auto get_device_pointer_flags = 0u; // no flags are currently defined; see the CUDA driver documentation
	auto status = cuMemHostGetDevicePointer(
		&device_side_ptr,
		unconsted_host_mem_ptr,
		get_device_pointer_flags);
	throw_if_error_lazy(status,
		"Failed obtaining the device-side pointer for host-memory pointer "
		+ cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
	return as_pointer(device_side_ptr);
}
2212 
2219 {
2220  return { device_side_pointer_for(region.start()), region.size() };
2221 }
2222 
2225 {
2226  return { device_side_pointer_for(region.start()), region.size() };
2227 }
2228 
2229 namespace detail_ {
2230 
// Allocate a pair of mapped memory regions - one host-side, one device-side -
// in the current context, with the given host-allocation options; throws on
// failure. The device-side region is derived from the host-side one via the
// driver's host->device pointer mapping.
inline region_pair_t allocate_in_current_context(
	context::handle_t current_context_handle,
	size_t size_in_bytes,
	allocation_options options)
{
	region_pair_t allocated {};
	// The default initialization is unnecessary, but let's play it safe
	auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
	void* allocated_ptr;
	auto status = cuMemHostAlloc(&allocated_ptr, size_in_bytes, flags);
	if (is_success(status) && (allocated_ptr == nullptr)) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::named_t::unknown);
	}
	throw_if_error_lazy(status,
		"Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
		+ " bytes of global memory in " + context::detail_::identify(current_context_handle));
	allocated.host_side = { allocated_ptr, size_in_bytes };
	allocated.device_side = device_side_region_for(allocated.host_side);
	return allocated;
}
2261 
// Allocate a mapped host/device region pair within the designated context
// (temporarily made current for the allocation)
inline region_pair_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes,
	allocation_options options)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
}

// Free a mapped region pair, given the host-side address of the pair
inline void free(void* host_side_pair)
{
	auto result = cuMemFreeHost(host_side_pair);
	throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address "
		+ cuda::detail_::ptr_as_hex(host_side_pair));
}
2277 
2278 } // namespace detail_
2279 
2290  const cuda::context_t& context,
2291  size_t size_in_bytes,
2292  allocation_options options);
2293 
2303  const cuda::device_t& device,
2304  size_t size_in_bytes,
2306 
2307 
/// Free a pair of mapped memory regions (identified by its host side)
inline void free(region_pair_t pair)
{
	detail_::free(pair.host_side.data());
}
2317 
/// Free a pair of mapped memory regions, given a (device-side) pointer into
/// one of them: the host-side address is recovered via the driver's pointer
/// attributes, then the pair is freed through it
inline void free_region_pair_of(void* ptr)
{
	// TODO: What if the pointer is not part of a mapped region pair?
	// We could check this...
	void* host_side_ptr;
	auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr));
	throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer "
		+ cuda::detail_::ptr_as_hex(ptr));
	detail_::free(host_side_ptr);
}
2334 
2346 inline bool is_part_of_a_region_pair(const void* ptr)
2347 {
2348  auto wrapped_ptr = pointer_t<const void> { ptr };
2349  return wrapped_ptr.other_side_of_region_pair().get() != nullptr;
2350 }
2351 
2352 } // namespace mapped
2353 
2354 namespace detail_ {
// Common workhorse for the typed make_unique_span() functions: allocates
// size * sizeof(T) bytes via the given region allocator, and wraps the result
// in a unique_span whose deleter applies RawDeleter to the raw pointer.
// T must be trivially constructible and copyable (checked at compile time).
template <typename T, typename RawDeleter, typename RegionAllocator>
unique_span<T> make_convenient_type_unique_span(size_t size, RegionAllocator allocator)
{
	memory::detail_::check_allocation_type<T>();
	// Adapt the raw-pointer deleter to unique_span's span-taking deleter shape
	auto deleter = [](span<T> sp) {
		return RawDeleter{}(sp.data());
	};
	region_t allocated_region = allocator(size * sizeof(T));
	return unique_span<T>(
		allocated_region.as_span<T>(), // no constructor calls - trivial construction
		deleter // no destructor calls - trivial destruction
	);
}
2382 
2383 } // namespace detail_
2384 
2385 
2386 namespace device {
2387 
2388 namespace detail_ {
2389 
// Allocate a typed unique_span of device memory within the designated context
// (temporarily made current)
template <typename T>
unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
{
	// Lambda indirection so the allocator has exactly the one-argument shape
	// expected by make_convenient_type_unique_span
	auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocate_in_current_context_);
}
2397 
2398 } // namespace detail_
2399 
2417 template <typename T>
2418 unique_span<T> make_unique_span(const context_t& context, size_t size);
2419 
2425 template <typename T>
2426 unique_span<T> make_unique_span(const device_t& device, size_t size);
2427 
2434 template <typename T>
2435 unique_span<T> make_unique_span(size_t size);
2436 
2437 } // namespace device
2438 
/// See device::make_unique_span(const context_t& context, size_t size)
template <typename T>
inline unique_span<T> make_unique_span(const context_t& context, size_t size)
{
	return device::make_unique_span<T>(context, size);
}

/// See device::make_unique_span(const device_t& device, size_t size)
template <typename T>
inline unique_span<T> make_unique_span(const device_t& device, size_t size)
{
	return device::make_unique_span<T>(device, size);
}
2452 
2453 namespace host {
2454 
// Allocate a typed unique_span of pinned host memory
template <typename T>
unique_span<T> make_unique_span(size_t size)
{
	// Needed because allocate() takes more arguments, with defaults
	auto allocator = [](size_t size) { return allocate(size); };
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
}
2482 
2483 } // namespace host
2484 
2485 namespace managed {
2486 
2487 namespace detail_ {
2488 
// Allocate a typed unique_span of managed memory within the designated
// context (temporarily made current), with the given initial visibility
template <typename T, initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
unique_span<T> make_unique_span(
	const context::handle_t context_handle,
	size_t size)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	// Lambda indirection fixing the visibility template argument, so the
	// allocator has the one-argument shape expected downstream
	auto allocator = [](size_t size) {
		return allocate_in_current_context(size, InitialVisibility);
	};
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
}
2500 
2501 } // namespace detail_
2502 
2525 template <typename T>
2526 unique_span<T> make_unique_span(
2527  const context_t& context,
2528  size_t size,
2529  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2530 
2536 template <typename T>
2537 unique_span<T> make_unique_span(
2538  const device_t& device,
2539  size_t size,
2540  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2541 
2548 template <typename T>
2549 unique_span<T> make_unique_span(
2550  size_t size,
2551  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2552 
2553 } // namespace managed
2554 
2555 } // namespace memory
2556 
2557 namespace symbol {
2558 
2566 template <typename T>
2568 {
2569  void *start;
2570  size_t symbol_size;
2571  auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2572  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
2573  api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
2574  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for the symbol at address"
2575  + cuda::detail_::ptr_as_hex(start));
2576  return { start, symbol_size };
2577 }
2578 
2579 } // namespace symbol
2580 
2581 } // namespace cuda
2582 
2583 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_
void register_(const_region_t region)
Register a memory region with the CUDA driver.
Definition: memory.hpp:1775
void free_region_pair_of(void *ptr)
Free a pair of mapped memory regions using just one of them.
Definition: memory.hpp:2324
Proxy class for a CUDA stream.
Definition: stream.hpp:258
endpoint_t
Type for choosing between endpoints of copy operations.
Definition: copy_parameters.hpp:19
void prefetch_to_host(const_region_t region, const stream_t &stream)
Prefetches a region of managed memory into host memory.
Definition: memory.hpp:244
cpu_write_combining write_combining
whether or not the GPU can batch multiple writes to this area and propagate them at its convenience...
Definition: memory.hpp:98
The cuda::memory::pool_t proxy class for memory pools, and related code for creating, manipulating and allocating using memory pools.
::std::vector< device_t, Allocator > expected_accessors(const_region_t region, const Allocator &allocator=Allocator())
Definition: memory.hpp:215
unique_span< T > make_unique_span(const context_t &context, size_t size)
See device::make_unique_span(const context_t& context, size_t size)
Definition: memory.hpp:2441
Wrapper class for a CUDA context.
Definition: context.hpp:249
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
If the CUDA runtime has not been set to a specific device, this is the ID of the device it defaults t...
Definition: constants.hpp:53
is_not_accessible_on_all_devices
Definition: memory.hpp:1646
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
this_type & clear_offsets() noexcept
Clear the offsets into both the source and the destination endpoint regions.
Definition: copy_parameters.hpp:284
Owning wrapper for CUDA 2D and 3D arrays.
Definition: array.hpp:29
void typed_set(T *start, const T &value, size_t num_elements, optional_ref< const stream_t > stream={})
Sets consecutive elements of a region of memory to a fixed value of some width.
Definition: memory.hpp:391
CUmemLocation location_t
Used in a limited number of API functions which can relate both to CUDA device memory and system memo...
Definition: types.hpp:555
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
::std::size_t size_bytes() const noexcept
Overall size in bytes of the elements of the array, over all dimensions.
Definition: array.hpp:255
portability_across_contexts portability
whether or not the allocated region can be used in different CUDA contexts.
Definition: memory.hpp:95
constexpr span_pair_t< T > as_spans() const
Definition: memory.hpp:167
void advise_expected_access_by(const_region_t region, device_t &device)
Advice the CUDA driver that device is expected to access region.
Definition: memory.hpp:204
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Do nothing...
Definition: error.hpp:346
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
T * get() const
Definition: pointer.hpp:139
cpu_write_combining
A memory allocation setting: Should the allocated memory be configured as write-combined, i.e.
Definition: memory.hpp:84
memory::region_t host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:163
void set(void *start, int byte_value, size_t num_bytes)
Sets all bytes in a stretch of host-side memory to a single value.
Definition: memory.hpp:1813
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
The cuda::memory::copy_parameters_t class template and related definitions.
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
options accepted by CUDA&#39;s allocator of memory with a host-side aspect (host-only or managed memory)...
Definition: memory.hpp:93
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
::std::size_t size() const noexcept
Overall number of elements in the array, over all dimensions.
Definition: array.hpp:252
span< T > host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:145
pointer_t other_side_of_region_pair() const
Definition: pointer.hpp:208
A pair of memory regions, one in system (=host) memory and one on a CUDA device&#39;s memory - mapped to ...
Definition: memory.hpp:160
Contains a proxy class for CUDA arrays - GPU memory with 2-D or 3-D locality and hardware support for...
map_into_device_memory
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1634
void deregister(const_region_t region)
Have the CUDA driver "forget" about a region of memory which was previously registered with it...
Definition: memory.hpp:1795
accessibility_on_all_devices
Whether the allocated host-side memory should be recognized as pinned memory by all CUDA contexts...
Definition: memory.hpp:1644
A convenience wrapper around a raw pointer "known" to the CUDA runtime and which thus has various kin...
Definition: pointer.hpp:131
memory::region_t locate(T &&symbol)
Locates a CUDA symbol in global or constant device memory.
Definition: memory.hpp:2567
this_type & clear_offset(endpoint_t endpoint) noexcept
Set the copy operation to use the multi-dimensional region of the specified endpoint without skipping...
Definition: copy_parameters.hpp:278
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
region_pair_t allocate(const cuda::device_t &device, size_t size_in_bytes, allocation_options options=allocation_options{})
Allocate a memory region on the host, which is also mapped to a memory region in the global memory of...
Definition: memory.hpp:276
A builder-ish subclass template around the basic 2D or 3D copy parameters which CUDA&#39;s complex copyin...
Definition: copy_parameters.hpp:68
Wrappers for getting and setting CUDA&#39;s choice of which device is &#39;current&#39;.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
mapped_io_space
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1623
void free(void *host_ptr)
Frees a region of pinned host memory which was allocated with one of the pinned host memory allocatio...
Definition: memory.hpp:1563
CUstream handle_t
The CUDA driver&#39;s raw handle for streams.
Definition: types.hpp:236
A wrapper class for host and/or device pointers, allowing easy access to CUDA&#39;s pointer attributes...
void * as_pointer(device::address_t address) noexcept
Definition: types.hpp:702
void set(void *ptr, int byte_value, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets a number of bytes in memory to a fixed value.
Definition: memory.hpp:418
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
A pair of memory spans, one in device-global memory and one in host/system memory, mapped to it.
Definition: memory.hpp:142
const_region_t device_side_region_for(const_region_t region)
Get the memory region mapped to a given host-side region.
Definition: memory.hpp:2224
void free(region_pair_t pair)
Free a pair of mapped memory regions.
Definition: memory.hpp:2313
void advise_no_access_expected_by(const_region_t region, device_t &device)
Advice the CUDA driver that device is not expected to access region.
Definition: memory.hpp:209
this_type & set_endpoint(endpoint_t endpoint, const cuda::array_t< T, NumDimensions > &array) noexcept
Set one of the copy endpoints to a CUDA array.
CUdeviceptr address_t
The numeric type which can represent the range of memory addresses on a CUDA device.
Definition: types.hpp:674
T * device_side_pointer_for(T *host_memory_ptr)
Obtain a pointer in the device-side memory space (= address range) given given a host-side pointer ma...
Definition: memory.hpp:2198
portability_across_contexts
A memory allocation setting: Can the allocated memory be used in other CUDA driver contexts (in addit...
Definition: memory.hpp:64
void copy_single(T *destination, const T *source, optional_ref< const stream_t > stream={})
Synchronously copies a single (typed) value between two memory locations.
Definition: memory.hpp:71
is_accessible_on_all_devices
Definition: memory.hpp:1645
void prefetch(const_region_t region, const cuda::device_t &destination, const stream_t &stream)
Prefetches a region of managed memory to a specific device, so it can later be used there without wai...
Definition: memory.hpp:236
region_t allocate(size_t size_in_bytes, allocation_options options)
Allocates pinned host memory.
Definition: memory.hpp:340
Wrapper class for a CUDA device.
Definition: device.hpp:135
void zero(region_t region, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:736
initial_visibility_t
The choices of which categories CUDA devices must a managed memory region be visible to...
Definition: types.hpp:755
constexpr bool is_success(status_t status)
Determine whether the API call returning the specified status had succeeded.
Definition: error.hpp:214
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
bool is_part_of_a_region_pair(const void *ptr)
Determine whether a given stretch of memory was allocated as part of a mapped pair of host and device...
Definition: memory.hpp:2346