cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
1 
25 #pragma once
26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
27 #define CUDA_API_WRAPPERS_MEMORY_HPP_
28 
29 #include "copy_parameters.hpp"
30 #include "array.hpp"
31 #include "constants.hpp"
32 #include "current_device.hpp"
33 #include "error.hpp"
34 #include "pointer.hpp"
35 #include "current_context.hpp"
36 #include "detail/unique_span.hpp"
37 
38 // The following is needed for cudaGetSymbolAddress, cudaGetSymbolSize
39 #include <cuda_runtime.h>
40 
41 #include <memory>
42 #include <cstring> // for ::std::memset
43 #include <vector>
44 #include <utility>
45 
46 namespace cuda {
47 
49 class device_t;
50 class context_t;
51 class stream_t;
52 class module_t;
54 
55 namespace memory {
56 
62 enum class portability_across_contexts : bool {
63  isnt_portable = false,
64  is_portable = true,
65 };
66 
82 enum cpu_write_combining : bool {
83  without_wc = false,
84  with_wc = true,
85 };
86 
 91 struct allocation_options {
 93  portability_across_contexts portability;
 96  cpu_write_combining write_combining;
 97 };
98 
99 namespace detail_ {
100 
101 template <typename T, bool CheckConstructibility = false>
102 inline void check_allocation_type() noexcept
103 {
104  static_assert(::std::is_trivially_constructible<T>::value,
 105  "Attempt to create a typed buffer of a non-trivially-constructible type");
106  static_assert(not CheckConstructibility or ::std::is_trivially_destructible<T>::value,
107  "Attempt to create a typed buffer of a non-trivially-destructible type "
108  "without allowing for its destruction");
109  static_assert(::std::is_trivially_copyable<T>::value,
110  "Attempt to create a typed buffer of a non-trivially-copyable type");
111 }
112 
113 inline unsigned make_cuda_host_alloc_flags(allocation_options options)
114 {
115  return
116  (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
117  (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
118 }
119 
120 } // namespace detail_
121 
129 namespace mapped {
130 
131 // TODO: Perhaps make this an array of size 2 and use aspects to index it?
132 
139 template <typename T>
140 struct span_pair_t {
143  span<T> host_side, device_side;
144 
146  constexpr operator ::std::pair<span<T>, span<T>>() const { return { host_side, device_side }; }
147  constexpr operator ::std::pair<region_t, region_t>() const { return { host_side, device_side }; }
149 };
150 
 158 struct region_pair_t {
 161  memory::region_t host_side, device_side;
 162 
164  template <typename T>
165  constexpr span_pair_t<T> as_spans() const
166  {
167  return { host_side.as_span<T>(), device_side.as_span<T>() };
168  }
169 };
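// Usage sketch (illustrative; assumes a mapped::region_pair_t named `pair`, e.g. as
// returned by mapped::allocate() further below, and that span supports indexing):
//
//   auto spans = pair.as_spans<float>();
//   spans.host_side[0] = 3.14f;   // write through the host-side mapping
//   // spans.device_side.data() is the corresponding device-side address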
170 
171 } // namespace mapped
172 
174 namespace device {
175 
176 namespace detail_ {
177 
183 #if CUDA_VERSION >= 11020
184 inline cuda::memory::region_t allocate_in_current_context(
185  size_t num_bytes, optional<stream::handle_t> stream_handle = {})
186 #else
187 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
188 #endif
189 {
190 #if CUDA_VERSION >= 11020
191  if (stream_handle) {
192  device::address_t allocated = 0;
193  // Note: the typed cudaMalloc also takes its size in bytes, apparently,
194  // not in number of elements
195  auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
196  if (is_success(status) && allocated == 0) {
197  // Can this even happen? hopefully not
198  status = static_cast<decltype(status)>(status::unknown);
199  }
200  throw_if_error_lazy(status,
201  "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
202  " bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
203  return {as_pointer(allocated), num_bytes};
204  }
205 #endif
206  device::address_t allocated = 0;
207  auto status = cuMemAlloc(&allocated, num_bytes);
208  if (is_success(status) && allocated == 0) {
209  // Can this even happen? hopefully not
210  status = static_cast<status_t>(status::unknown);
211  }
212  throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) +
213  " bytes of global memory on the current CUDA device");
214  return {as_pointer(allocated), num_bytes};
215 }
216 
217 #if CUDA_VERSION >= 11020
218 inline region_t allocate(
219  context::handle_t context_handle,
220  size_t size_in_bytes,
221  optional<stream::handle_t> stream_handle = {})
222 {
223  CAW_SET_SCOPE_CONTEXT(context_handle);
224  return allocate_in_current_context(size_in_bytes, stream_handle);
225 }
226 #else
227 inline region_t allocate(
228  context::handle_t context_handle,
229  size_t size_in_bytes)
230 {
231  CAW_SET_SCOPE_CONTEXT(context_handle);
232  return allocate_in_current_context(size_in_bytes);
233 }
234 #endif
235 
236 #if CUDA_VERSION >= 11020
237 inline void free_on_stream(
238  void* allocated_region_start,
239  stream::handle_t stream_handle)
240 {
241  auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
242  throw_if_error_lazy(status,
243  "Failed scheduling an asynchronous freeing of the global memory region starting at "
244  + cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
245  + stream::detail_::identify(stream_handle));
246 }
247 #endif // CUDA_VERSION >= 11020
248 
249 inline void free_in_current_context(
250  context::handle_t current_context_handle,
251  void* allocated_region_start)
252 {
253  auto result = cuMemFree(address(allocated_region_start));
254  if (result == status::success) { return; }
255 #ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
256  if (result == status::context_is_destroyed) { return; }
257 #endif
258  throw runtime_error(result, "Freeing device memory at "
259  + cuda::detail_::ptr_as_hex(allocated_region_start)
260  + " in " + context::detail_::identify(current_context_handle));
261 }
262 
263 } // namespace detail_
264 
266 #if CUDA_VERSION >= 11020
267 inline void free(void* region_start, optional_ref<const stream_t> stream = {});
268 #else
269 inline void free(void* ptr);
270 #endif
271 
272 #if CUDA_VERSION >= 11020
273 inline void free(region_t region, optional_ref<const stream_t> stream = {})
275 {
276  free(region.start(), stream);
277 }
278 #else
279 inline void free(region_t region)
281 {
282  free(region.start());
283 }
284 #endif
285 
286 #if CUDA_VERSION >= 11020
287 
300 region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
301 #endif
302 
316 inline region_t allocate(const context_t& context, size_t size_in_bytes);
317 
331 inline region_t allocate(const device_t& device, size_t size_in_bytes);
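// Usage sketch (illustrative; assumes the library's umbrella header has been included
// and a cuda::device_t has been obtained, e.g. via cuda::device::current::get()):
//
//   auto device = cuda::device::current::get();
//   auto buffer = cuda::memory::device::allocate(device, 4096);  // a 4 KiB region_t
//   // ... pass buffer.start() / buffer.size() to kernels or copy calls ...
//   cuda::memory::device::free(buffer);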
332 
333 namespace detail_ {
334 
335 // Note: Allocates _in the current context_! No current context => failure!
336 struct allocator {
337  void* operator()(size_t num_bytes) const {
338  return detail_::allocate_in_current_context(num_bytes).start();
339  }
340 };
341 
342 struct deleter {
343  void operator()(void* ptr) const { cuda::memory::device::free(ptr); }
344 };
345 
346 } // namespace detail_
347 
360 template <typename T>
361 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
362 
385 inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
386 {
387  return typed_set<unsigned char>(
388  static_cast<unsigned char*>(start),
389  static_cast<unsigned char>(byte_value),
390  num_bytes,
391  stream);
392 }
393 
403 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
404 {
405  set(region.start(), byte_value, region.size(), stream);
406 }
407 
416 inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
417 {
418  set(start, 0, num_bytes, stream);
419 }
420 
428 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
429 {
430  zero(region.start(), region.size(), stream);
431 }
432 
440 template <typename T>
441 inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
442 {
443  zero(ptr, sizeof(T), stream);
444 }
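// Usage sketch (illustrative; assumes `buffer` is a device-side region_t and `stream`
// is a cuda::stream_t in the same context):
//
//   cuda::memory::device::zero(buffer);               // synchronous memset-to-zero
//   cuda::memory::device::set(buffer, 0xFF, stream);  // scheduled on the stream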
445 
446 } // namespace device
447 
449 namespace detail_ {
450 
452 
464 inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
465 {
466  auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);
467 
468  // TODO: Determine whether it was from host to device, device to host etc and
469  // add this information to the error string
470  throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
471 }
472 
480 inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
481 {
482 #ifndef NDEBUG
483  if (destination.size() < source.size()) {
484  throw ::std::logic_error("Source size exceeds destination size");
485  }
486 #endif
487  copy(destination.start(), source.start(), source.size(), stream_handle);
488 }
490 
492 
493 inline status_t multidim_copy_in_current_context(
494  ::std::integral_constant<dimensionality_t, 2>,
495  copy_parameters_t<2> params,
496  optional<stream::handle_t> stream_handle)
497 {
498  // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters
499  // structure holds no information about contexts.
500  //
501  // Note: The stream handle, even if present, might be the null handle; for now
502  // we distinguish between using the null stream handle - the default stream's -
503  // and using the synchronous API
504  return stream_handle ?
505  cuMemcpy2DAsync(&params, *stream_handle) :
506  cuMemcpy2D(&params);
507 }
508 
509 inline status_t multidim_copy_in_current_context(
510  ::std::integral_constant<dimensionality_t, 3>,
511  copy_parameters_t<3> params,
512  optional<stream::handle_t> stream_handle)
513 {
514  if (params.srcContext == params.dstContext) {
515  // TODO: Should we check it's also the current context?
516  using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
517  auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
518  return stream_handle ?
519  cuMemcpy3DAsync(intra_context_params, *stream_handle) :
520  cuMemcpy3D(intra_context_params);
521  }
522  return stream_handle ?
523  cuMemcpy3DPeerAsync(&params, *stream_handle) :
524  cuMemcpy3DPeer(&params);
525 }
526 
527 template<dimensionality_t NumDimensions>
528 status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, optional<stream::handle_t> stream_handle) {
529  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
530 }
531 
532 // Note: Assumes the stream handle is for a stream in the current context
533 template<dimensionality_t NumDimensions>
534 status_t multidim_copy(
535  context::handle_t context_handle,
 536  copy_parameters_t<NumDimensions> params,
 537  optional<stream::handle_t> stream_handle)
538 {
539  CAW_SET_SCOPE_CONTEXT(context_handle);
540  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
541 }
542 
543 // Assumes the array and the stream share the same context, and that the destination is
544 // accessible from that context (e.g. allocated within it, or being managed memory, etc.)
545 template <typename T, dimensionality_t NumDimensions>
546 void copy(T *destination, const array_t<T, NumDimensions>& source, optional<stream::handle_t> stream_handle)
547 {
548  using memory::endpoint_t;
549  auto dims = source.dimensions();
550  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
551  auto params = copy_parameters_t<NumDimensions> {};
552  params.clear_offset(endpoint_t::source);
553  params.clear_offset(endpoint_t::destination);
554  params.template set_extent<T>(dims);
555  params.set_endpoint(endpoint_t::source, source);
556  params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
557  params.set_default_pitches();
558  params.clear_rest();
559  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
560  throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
561 }
562 
563 
564 template <typename T, dimensionality_t NumDimensions>
565 void copy(const array_t<T, NumDimensions>& destination, const T* source, optional<stream::handle_t> stream_handle)
566 {
567  using memory::endpoint_t;
568  auto dims = destination.dimensions();
569  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
570  auto params = copy_parameters_t<NumDimensions>{};
571  params.clear_offset(endpoint_t::source);
572  params.clear_offset(endpoint_t::destination);
573  params.template set_extent<T>(dims);
574  params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
575  params.set_endpoint(endpoint_t::destination, destination);
576  params.set_default_pitches();
577  params.clear_rest();
578  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
579  throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
580 }
581 
596 template <typename T>
597 void copy_single(T* destination, const T* source, optional<stream::handle_t> stream_handle)
598 {
599  copy(destination, source, sizeof(T), stream_handle);
600 }
601 
602 } // namespace detail_
603 
614 
624 template <typename T, size_t N>
625 inline void copy(span<T> destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
626 {
627 #ifndef NDEBUG
628  if (destination.size() < N) {
629  throw ::std::logic_error("Source size exceeds destination size");
630  }
631 #endif
632  return copy(destination.data(), source, sizeof(T) * N, stream);
633 }
634 
644 template <typename T, size_t N>
645 void copy(c_array<T,N>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
646 {
647 #ifndef NDEBUG
648  if (source.size() > N) {
649  throw ::std::invalid_argument(
650  "Attempt to copy a span of " + ::std::to_string(source.size()) +
651  " elements into an array of " + ::std::to_string(N) + " elements");
652  }
653 #endif
 654  return copy(destination, source.data(), sizeof(T) * N, stream);
655 }
656 
666 template <typename T, size_t N>
667 inline void copy(void* destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
668 {
669  return copy(destination, source, sizeof(T) * N, stream);
670 }
671 
689 template <typename T, size_t N>
690 inline void copy(c_array<T,N>& destination, T* source, optional_ref<const stream_t> stream = {})
691 {
692  return copy(destination, source, sizeof(T) * N, stream);
693 }
694 
696 
709 void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {});
710 
722 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
723 {
724  return set(region.start(), byte_value, region.size(), stream);
725 }
726 
734 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
735 {
736  return set(region, 0, stream);
737 }
738 
747 inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
748 {
749  return set(ptr, 0, num_bytes, stream);
750 }
751 
760 template <typename T>
761 inline void zero(T* ptr)
762 {
763  zero(ptr, sizeof(T));
764 }
765 
766 namespace detail_ {
767 
768 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2> two, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
769 {
770  // TODO: Move this logic into the scoped ensurer class
771  auto context_handle = context::current::detail_::get_handle();
772  if (context_handle != context::detail_::none) {
773  return detail_::multidim_copy_in_current_context(two, params, stream_handle);
774  }
775  auto current_device_id = cuda::device::current::detail_::get_id();
776  context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
777  context::current::detail_::push(context_handle);
778  // Note this _must_ be an intra-context copy, as inter-context is not supported
779  // and there's no indication of context in the relevant data structures
780  auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle);
781  context::current::detail_::pop();
782  cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
783  return status;
784 }
785 
786 inline status_t multidim_copy(context::handle_t context_handle, ::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
787 {
788  context::current::detail_::scoped_override_t context_for_this_scope(context_handle);
789  return multidim_copy(::std::integral_constant<dimensionality_t, 2>{}, params, stream_handle);
790 }
791 
792 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params, optional<stream::handle_t> stream_handle)
793 {
794  if (params.srcContext == params.dstContext) {
795  context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
796  return detail_::multidim_copy_in_current_context(params, stream_handle);
797  }
798  return stream_handle ?
799  cuMemcpy3DPeerAsync(&params, *stream_handle) :
800  cuMemcpy3DPeer(&params);
801 }
802 
803 template<dimensionality_t NumDimensions>
804 status_t multidim_copy(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle)
805 {
806  return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
807 }
808 
809 
810 } // namespace detail_
811 
822 template<dimensionality_t NumDimensions>
823 void copy(copy_parameters_t<NumDimensions> params, optional_ref<const stream_t> stream = {});
824 
838 template<typename T, dimensionality_t NumDimensions>
839 void copy(const array_t<T, NumDimensions>& destination, const context_t& source_context, const T *source, optional_ref<const stream_t> stream = {})
840 {
841  auto dims = destination.dimensions();
842  auto params = copy_parameters_t<NumDimensions> {};
843  params.clear_offsets();
844  params.template set_extent<T>(dims);
845  params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
846  params.set_endpoint(endpoint_t::destination, destination);
847  params.clear_rest();
848  copy(params, stream);
849 }
850 
869 template <typename T, dimensionality_t NumDimensions>
870 void copy(array_t<T, NumDimensions>& destination, const T* source, optional_ref<const stream_t> stream = {});
871 
880 template<typename T, dimensionality_t NumDimensions>
881 void copy(const array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
882 {
883 #ifndef NDEBUG
884  if (destination.size() < source.size()) {
885  throw ::std::invalid_argument(
886  "Attempt to copy a span of " + ::std::to_string(source.size()) +
887  " elements into a CUDA array of " + ::std::to_string(destination.size()) + " elements");
888  }
889 #endif
890  copy(destination, source.data(), stream);
891 }
892 
903 template <typename T, dimensionality_t NumDimensions>
904 void copy(const context_t& context, T *destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
905 {
906  auto dims = source.dimensions();
907  auto params = copy_parameters_t<NumDimensions> {};
908  params.clear_offset(endpoint_t::source);
909  params.clear_offset(endpoint_t::destination);
910  params.template set_extent<T>(dims);
911  params.set_endpoint(endpoint_t::source, source);
912  params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
913  params.set_default_pitches();
914  params.clear_rest();
915  copy(params, stream);
916 }
917 
936 template <typename T, dimensionality_t NumDimensions>
937 void copy(T* destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {});
938 
939 
947 template <typename T, dimensionality_t NumDimensions>
948 void copy(span<T> destination, const array_t<T, NumDimensions>& source, optional_ref <const stream_t> stream = {})
949 {
950 #ifndef NDEBUG
951  if (destination.size() < source.size()) {
952  throw ::std::invalid_argument(
953  "Attempt to copy a CUDA array of " + ::std::to_string(source.size()) +
954  " elements into a span of " + ::std::to_string(destination.size()) + " elements");
955  }
956 #endif
957  copy(destination.data(), source, stream);
958 }
959 
967 template <typename T, dimensionality_t NumDimensions>
968 void copy(const array_t<T, NumDimensions>& destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream)
969 {
970  auto dims = source.dimensions();
971  auto params = copy_parameters_t<NumDimensions> {};
972  params.clear_offset(endpoint_t::source);
973  params.clear_offset(endpoint_t::destination);
974  params.template set_extent<T>(dims);
975  params.set_endpoint(endpoint_t::source, source);
976  params.set_endpoint(endpoint_t::destination, destination);
977  params.set_default_pitches();
978  params.clear_rest();
979  auto status = //(source.context() == destination.context()) ?
980  detail_::multidim_copy<NumDimensions>(source.context_handle(), params, stream);
 981  throw_if_error_lazy(status, "Copying from one CUDA array into another");
982 }
983 
1001 template <typename T, dimensionality_t NumDimensions>
1002 void copy(region_t destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
1003 {
1004 #ifndef NDEBUG
1005  if (destination.size() < source.size_bytes()) {
1006  throw ::std::invalid_argument(
1007  "Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a "
1008  "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
1009  }
1010 #endif
1011  copy(destination.start(), source, stream);
1012 }
1013 
1027 template <typename T, dimensionality_t NumDimensions>
1028 void copy(array_t<T, NumDimensions>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
1029 {
1030 #ifndef NDEBUG
1031  if (destination.size_bytes() < source.size()) {
1032  throw ::std::invalid_argument(
1033  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1034  " bytes into an array of size " + ::std::to_string(destination.size_bytes()) + " bytes");
1035  }
1036 #endif
1037  copy(destination, static_cast<T const*>(source.start()), stream);
1038 }
1039 
1056 template <typename T>
1057 void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream = {});
1058 
1077 void copy(void* destination, void const* source, size_t num_bytes, optional_ref<const stream_t> stream = {});
1078 
1079 
1099 template <typename T, size_t N>
1100 inline void copy(c_array<T,N>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
1101 {
1102 #ifndef NDEBUG
1103  size_t required_size = N * sizeof(T);
1104  if (source.size() != required_size) {
1105  throw ::std::invalid_argument(
1106  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1107  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
1108  }
1109 #endif
1110  return copy(&(destination[0]), source.start(), sizeof(T) * N, stream);
1111 }
1112 
1136 template <typename T, size_t N>
1137 inline void copy(region_t destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
1138 {
1139 #ifndef NDEBUG
1140  if (destination.size() < N) {
1141  throw ::std::logic_error("Source size exceeds destination size");
1142  }
1143 #endif
1144  return copy(destination.start(), source, sizeof(T) * N, stream);
1145 }
1146 
1147 
1159 inline void copy(region_t destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
1160 {
1161 #ifndef NDEBUG
1162  if (destination.size() < num_bytes) {
1163  throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
1164  }
1165 #endif
1166  copy(destination.start(), source.start(), num_bytes, stream);
1167 }
1168 
1169 
1186 inline void copy(region_t destination, const_region_t source, optional_ref<const stream_t> stream = {})
1187 {
1188  copy(destination, source, source.size(), stream);
1189 }
1190 
1191 
1209 inline void copy(region_t destination, void* source, optional_ref<const stream_t> stream = {})
1210 {
1211  return copy(destination.start(), source, destination.size(), stream);
1212 }
1213 
1231 inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref<const stream_t> stream = {})
1232 {
1233 #ifndef NDEBUG
1234  if (destination.size() < num_bytes) {
1235  throw ::std::logic_error("Number of bytes to copy exceeds destination size");
1236  }
1237 #endif
1238  return copy(destination.start(), source, num_bytes, stream);
1239 }
1240 
1260 inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
1261 {
1262 #ifndef NDEBUG
1263  if (source.size() < num_bytes) {
1264  throw ::std::logic_error("Attempt to copy more than the source region's size");
1265  }
1266 #endif
1267  copy(destination, source.start(), num_bytes, stream);
1268 }
1269 
1284 inline void copy(void* destination, const_region_t source, optional_ref<const stream_t> stream = {})
1285 {
1286  copy(destination, source, source.size(), stream);
1287 }
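// Usage sketch (illustrative; assumes `host_buffer` and `device_buffer` are region_t's
// of equal size and `stream` is a cuda::stream_t). The copy direction is inferred by
// the driver from the two addresses:
//
//   cuda::memory::copy(device_buffer, host_buffer);          // synchronous
//   cuda::memory::copy(host_buffer, device_buffer, stream);  // scheduled on the stream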
1288 
1289 namespace device {
1290 
1291 namespace detail_ {
1292 
1293 inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
1294 {
1295  // TODO: Double-check that this call doesn't require setting the current device
1296  auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
1297  throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer");
1298 }
1299 
1300 
1301 inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
1302 {
1303  set(region.start(), byte_value, region.size(), stream_handle);
1304 }
1305 
1306 inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
1307 {
1308  set(start, 0, num_bytes, stream_handle);
1309 }
1310 
1311 inline void zero(region_t region, stream::handle_t stream_handle)
1312 {
1313  zero(region.start(), region.size(), stream_handle);
1314 }
1315 
1316 // TODO: Drop this in favor of <algorithm>-like functions under `cuda::`.
1317 template <typename T>
1318 inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
1319 {
1320  static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
1321  static_assert(
 1322  sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
1324  "Unsupported type size - only sizes 1, 2 and 4 are supported");
1325  // TODO: Consider checking for alignment when compiling without NDEBUG
1326  status_t result = static_cast<status_t>(cuda::status::success);
1327  switch(sizeof(T)) {
1328  case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
1329  case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
1330  case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
1331  }
1332  throw_if_error_lazy(result, "Setting global device memory bytes");
1333 }
1334 
1335 } // namespace detail_
1336 
1337 
1349 template <typename T>
1350 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
1351 
1360 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
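// Usage sketch (illustrative; assumes `p` points to device memory holding `n` floats
// and `stream` is a cuda::stream_t):
//
//   float fill_value = 1.0f;
//   cuda::memory::device::typed_set(p, fill_value, n);          // synchronous
//   cuda::memory::device::typed_set(p, fill_value, n, stream);  // scheduled on the stream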
1361 
1362 } // namespace device
1363 
1364 namespace inter_context {
1365 
1366 void copy(
1367  void * destination,
1368  const context_t& destination_context,
1369  const void * source_address,
1370  const context_t& source_context,
1371  size_t num_bytes,
1372  optional_ref<const stream_t> stream);
1373 
1374 /*
1375 inline void copy(
1376  region_t destination,
1377  const context_t& destination_context,
1378  const_region_t source,
1379  const context_t& source_context,
1380  optional_ref<const stream_t> stream)
1381 {
1382 #ifndef NDEBUG
 1383  if (destination.size() < source.size()) {
1384  throw ::std::invalid_argument(
1385  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1386  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1387  }
1388 #endif
1389  copy(destination.start(), destination_context, source, source_context, stream);
1390 }
1391 */
1392 
1393 
1394 /*
1395 
1396 template <typename T, dimensionality_t NumDimensions>
1397 inline void copy(
1398  array_t<T, NumDimensions> destination,
1399  array_t<T, NumDimensions> source,
1400  optional_ref<const stream_t> stream)
1401 {
1402  // for arrays, a single mechanism handles both intra- and inter-context copying
1403  return memory::copy(destination, source, stream);
1404 }
1405 */
1406 
1407 namespace detail_ {
1408 
1432 } // namespace detail_
1433 
1435 void copy(
1436  void * destination_address,
1437  const context_t& destination_context,
1438  const void * source_address,
1439  const context_t& source_context,
1440  size_t num_bytes,
1441  optional_ref<const stream_t> stream);
1442 
1444 inline void copy(
1445  void * destination,
1446  const context_t& destination_context,
1447  const_region_t source,
1448  const context_t& source_context,
1449  optional_ref<const stream_t> stream)
1450 {
1451  copy(destination, destination_context, source.start(), source_context, source.size(), stream);
1452 }
1453 
1455 inline void copy(
1456  region_t destination,
1457  const context_t& destination_context,
1458  const void* source,
1459  const context_t& source_context,
1460  optional_ref<const stream_t> stream)
1461 {
1462  copy(destination.start(), destination_context, source, source_context, destination.size(), stream);
1463 }
1464 
1466 inline void copy(
1467  region_t destination,
1468  const context_t& destination_context,
1469  const_region_t source,
1470  const context_t& source_context,
1471  optional_ref<const stream_t> stream)
1472 {
1473 #ifndef NDEBUG
 1474  if (destination.size() < source.size()) {
1475  throw ::std::invalid_argument(
1476  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1477  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1478  }
1479 #endif
1480  copy(destination.start(), destination_context, source, source_context, stream);
1481 }
1482 
1484 template <typename T, dimensionality_t NumDimensions>
1485 inline void copy(
1486  array_t<T, NumDimensions> destination,
 1487  array_t<T, NumDimensions> source,
 1488  optional_ref<const stream_t> stream)
1489 {
1490  // for arrays, a single mechanism handles both intra- and inter-context copying
1491  return memory::copy(destination, source, stream);
1492 }
1493 
1494 } // namespace inter_context
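// Usage sketch (illustrative; assumes `ctx_a` and `ctx_b` are cuda::context_t's, with
// `dst` a region allocated in ctx_a, `src` a region allocated in ctx_b, and `stream`
// a cuda::stream_t):
//
//   cuda::memory::inter_context::copy(dst, ctx_a, src, ctx_b, stream);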
1495 
1498 namespace host {
1499 
1500 namespace detail_ {
1501 
 1502 // Even though pinned memory should not, in principle, be associated with a context or a device, in
 1503 // practice it needs to be registered somewhere - and that somewhere is a context. Passing a context does
 1504 // not mean the allocation will have a special affinity to that context's device, in terms of better performance etc.
1505 inline region_t allocate(
1506  const context::handle_t context_handle,
1507  size_t size_in_bytes,
1508  allocation_options options);
1509 
1510 } // namespace detail_
1511 
 1529 region_t allocate(size_t size_in_bytes, allocation_options options = allocation_options{});
1530 
 1542 inline region_t allocate(
 1543  size_t size_in_bytes,
 1544  portability_across_contexts portability,
 1545  cpu_write_combining cpu_wc = cpu_write_combining(false))
1546 {
1547  return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } );
1548 }
1549 
1551 inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc)
1552 {
1553  return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} );
1554 }
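// Usage sketch (illustrative): allocating pinned host memory with non-default options
// (portable across contexts and write-combined):
//
//   using namespace cuda::memory;
//   auto pinned = host::allocate(
//       1u << 20,
//       allocation_options{ portability_across_contexts::is_portable,
//                           cpu_write_combining::with_wc });
//   // ... fast host-to-device copies from pinned.data() ...
//   host::free(pinned);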
1555 
1563 inline void free(void* host_ptr)
1564 {
1565  auto result = cuMemFreeHost(host_ptr);
1566 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
1567  if (result == status::success) { return; }
1568 #else
1569  if (result == status::success or result == status::context_is_destroyed) { return; }
1570 #endif
1571  throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1572 }
1573 
1579 inline void free(region_t region) { return free(region.data()); }
1580 
1581 namespace detail_ {
1582 
1583 struct allocator {
1584  void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes).data(); }
1585 };
1586 struct deleter {
1587  void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
1588 };
1589 
1601 inline void register_(const void *ptr, size_t size, unsigned flags)
1602 {
1603  auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
1604  throw_if_error_lazy(result,
1605  "Could not register and page-lock the region of " + ::std::to_string(size) +
1606  " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
1607  " with flags " + cuda::detail_::as_hex(flags));
1608 }
1609 
1610 inline void register_(const_region_t region, unsigned flags)
1611 {
1612  register_(region.start(), region.size(), flags);
1613 }
1614 
1615 } // namespace detail_
1616 
1623 enum mapped_io_space : bool {
1624  is_mapped_io_space = true,
1625  is_not_mapped_io_space = false
1626 };
1627 
 1634 enum class map_into_device_memory : bool {
 1635  map_into_device_memory = true,
1636  do_not_map_into_device_memory = false
1637 };
 1638 
 1644 enum accessibility_on_all_devices : bool {
 1645  is_accessible_on_all_devices = true,
 1646  is_not_accessible_on_all_devices = false
 1647 };
1648 
1675 inline void register_(const void *ptr, size_t size,
1676  bool register_mapped_io_space,
1677  bool map_into_device_space,
1678  bool make_device_side_accessible_to_all
1679 #if CUDA_VERSION >= 11010
1680  , bool considered_read_only_by_device
1681 #endif // CUDA_VERSION >= 11010
1682  )
1683 {
 1684  detail_::register_(
 1685  ptr, size,
1686  (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1687  | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1688  | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1689 #if CUDA_VERSION >= 11010
1690  | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
1691 #endif // CUDA_VERSION >= 11010
1692  );
1693 }
1694 
1720 inline void register_(
1721  const_region_t region,
1722  bool register_mapped_io_space,
1723  bool map_into_device_space,
1724  bool make_device_side_accessible_to_all
1725 #if CUDA_VERSION >= 11010
1726  , bool considered_read_only_by_device
1727 #endif // CUDA_VERSION >= 11010
1728  )
1729 {
1730  register_(
1731  region.start(),
1732  region.size(),
1733  register_mapped_io_space,
1734  map_into_device_space,
1735  make_device_side_accessible_to_all
1736 #if CUDA_VERSION >= 11010
1737  , considered_read_only_by_device
1738 #endif // CUDA_VERSION >= 11010
1739  );
1740 }
1741 
1756 inline void register_(void const *ptr, size_t size)
1757 {
1758  unsigned no_flags_set { 0 };
1759  detail_::register_(ptr, size, no_flags_set);
1760 }
1761 
1775 inline void register_(const_region_t region)
1776 {
1777  register_(region.start(), region.size());
1778 }
1779 
1787 inline void deregister(const void *ptr)
1788 {
1789  auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1790  throw_if_error_lazy(result,
1791  "Could not unregister the memory segment starting at address *a");
1792 }
1793 
1795 inline void deregister(const_region_t region)
1796 {
1797  deregister(region.start());
1798 }
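// Usage sketch (illustrative): page-locking an existing host buffer so the GPU can
// access it efficiently, then releasing the registration:
//
//   ::std::vector<float> host_data(1 << 20);
//   cuda::memory::host::register_(host_data.data(), host_data.size() * sizeof(float));
//   // ... copies / kernels using the now-pinned memory ...
//   cuda::memory::host::deregister(host_data.data());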
1799 
1806 
1813 inline void set(void* start, int byte_value, size_t num_bytes)
1814 {
1815  ::std::memset(start, byte_value, num_bytes);
1816  // TODO: Error handling?
1817 }
1818 
1822 inline void set(region_t region, int byte_value)
1823 {
1824  memory::set(region.start(), byte_value, region.size(), nullopt);
1825 }
1826 
1833 inline void zero(void* start, size_t num_bytes)
1834 {
1835  set(start, 0, num_bytes);
1836 }
1837 
1843 inline void zero(region_t region)
1844 {
1845  host::set(region, 0);
1846 }
1847 
1854 template <typename T>
1855 inline void zero(T* ptr)
1856 {
1857  zero(ptr, sizeof(T));
1858 }
1859 
1860 
1861 } // namespace host
1862 
1863 namespace managed {
1864 
1865 namespace range {
1866 
1867 namespace detail_ {
1868 
1869 using attribute_t = CUmem_range_attribute;
1870 using advice_t = CUmem_advise;
1871 
1872 template <typename T>
1873 inline T get_scalar_attribute(const_region_t region, attribute_t attribute)
1874 {
1875  uint32_t attribute_value { 0 };
1876  auto result = cuMemRangeGetAttribute(
1877  &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size());
1878  throw_if_error_lazy(result,
1879  "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
1880  return static_cast<T>(attribute_value);
1881 }
1882 
 1883 // CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately, the relevant
 1884 // API call is not named cuMemRangeSetAttribute, and it uses a different enum.
1885 inline void advise(const_region_t region, advice_t advice, cuda::device::id_t device_id)
1886 {
1887  auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id);
1888  throw_if_error_lazy(result, "Setting an attribute for a managed memory range at "
1889  + cuda::detail_::ptr_as_hex(region.start()));
1890 }
1891 
1892 inline advice_t as_advice(attribute_t attribute, bool set)
1893 {
1894  switch (attribute) {
1895  case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1896  return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1897  case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1898  return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1899  case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1900  return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1901  default:
1902  throw ::std::invalid_argument(
1903  "CUDA memory range attribute does not correspond to any range advice value");
1904  }
1905 }
1906 
1907 inline void set_attribute(const_region_t region, attribute_t settable_attribute, cuda::device::id_t device_id)
1908 {
1909  static constexpr const bool set { true };
1910  advise(region, as_advice(settable_attribute, set), device_id);
1911 }
1912 
1913 inline void set_attribute(const_region_t region, attribute_t settable_attribute)
1914 {
1915  static constexpr const bool set { true };
1916  static constexpr const cuda::device::id_t dummy_device_id { 0 };
1917  advise(region, as_advice(settable_attribute, set), dummy_device_id);
1918 }
1919 
1920 inline void unset_attribute(const_region_t region, attribute_t settable_attribute)
1921 {
1922  static constexpr const bool unset { false };
1923  static constexpr const cuda::device::id_t dummy_device_id { 0 };
1924  advise(region, as_advice(settable_attribute, unset), dummy_device_id);
1925 }
1926 
1927 } // namespace detail_
1928 
1929 } // namespace range
1930 
1931 namespace detail_ {
1932 
1933 template <typename GenericRegion>
1934 struct region_helper : public GenericRegion {
1935  using GenericRegion::GenericRegion;
1936 
1937  bool is_read_mostly() const
1938  {
1939  return range::detail_::get_scalar_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1940  }
1941 
1942  void designate_read_mostly() const
1943  {
1944  range::detail_::set_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1945  }
1946 
1947  void undesignate_read_mostly() const
1948  {
1949  range::detail_::unset_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1950  }
1951 
1952  device_t preferred_location() const;
1953  void set_preferred_location(device_t& device) const;
1954  void clear_preferred_location() const;
1955 };
1956 
1957 } // namespace detail_
1958 
1960 using region_t = detail_::region_helper<memory::region_t>;
1962 using const_region_t = detail_::region_helper<memory::const_region_t>;
1963 
1965 void advise_expected_access_by(const_region_t region, device_t& device);
1966 
 1968 void advise_no_access_expected_by(const_region_t region, device_t& device);
 1969 
1971 template <typename Allocator = ::std::allocator<cuda::device_t> >
1972 typename ::std::vector<device_t, Allocator> expected_accessors(const_region_t region, const Allocator& allocator = Allocator() );
1973 
1975 enum class attachment_t : unsigned {
1976  global = CU_MEM_ATTACH_GLOBAL,
1977  host = CU_MEM_ATTACH_HOST,
1978  single_stream = CU_MEM_ATTACH_SINGLE,
1979  };
1980 
1981 namespace detail_ {
1982 
1983 inline managed::region_t allocate_in_current_context(
1984  size_t num_bytes,
1985  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
1986 {
1987  device::address_t allocated = 0;
1988  auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
1989  attachment_t::global : attachment_t::host;
1990  // This is necessary because managed allocation requires at least one (primary)
1991  // context to have been constructed. We could theoretically check what our current
1992  // context is etc., but that would be brittle, since someone can managed-allocate,
1993  // then change contexts, then de-allocate, and we can't be certain that whoever
1994  // called us will call free
1995  cuda::device::primary_context::detail_::increase_refcount(cuda::device::default_device_id);
1996 
 1997  // Note: the size is specified in bytes, not in number of elements
1999  auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
2000  if (is_success(status) && allocated == 0) {
2001  // Can this even happen? hopefully not
2002  status = static_cast<status_t>(status::unknown);
2003  }
2004  throw_if_error_lazy(status, "Failed allocating "
2005  + ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
2006  return {as_pointer(allocated), num_bytes};
2007 }
2008 
2014 inline void free(void* ptr)
2015 {
2016  auto result = cuMemFree(device::address(ptr));
2017  cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id);
2018  throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
2019 }
2020 
2022 inline void free(managed::region_t region)
2023 {
2024  free(region.start());
2025 }
2026 
2027 template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
2028 struct allocator {
2029  // Allocates in the current context!
2030  void* operator()(size_t num_bytes) const
2031  {
2032  return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
2033  }
2034 };
2035 
2036 struct deleter {
2037  void operator()(void* ptr) const { detail_::free(ptr); }
2038 };
2039 
 2040 inline region_t allocate(
 2041  context::handle_t context_handle,
2042  size_t num_bytes,
2043  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
2044 {
2045  CAW_SET_SCOPE_CONTEXT(context_handle);
2046  return allocate_in_current_context(num_bytes, initial_visibility);
2047 }
2048 
2049 } // namespace detail_
2050 
2064 inline region_t allocate(
2065  const context_t& context,
2066  size_t num_bytes,
2067  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2068 
2082 inline region_t allocate(
2083  const device_t& device,
2084  size_t num_bytes,
2085  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2086 
2096 region_t allocate(size_t num_bytes);
2097 
2103 inline void free(void* managed_ptr)
2104 {
2105  auto result = cuMemFree(device::address(managed_ptr));
2106  throw_if_error_lazy(result,
2107  "Freeing managed memory (host and device regions) at address "
2108  + cuda::detail_::ptr_as_hex(managed_ptr));
2109 }
2110 
2112 inline void free(region_t region)
2113 {
2114  free(region.start());
2115 }
2116 
2117 namespace detail_ {
2118 
2119 inline void prefetch(
2120  const_region_t region,
2121  cuda::device::id_t destination,
2122  stream::handle_t source_stream_handle)
2123 {
2124  auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle);
2125  throw_if_error_lazy(result,
2126  "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
2127  + cuda::detail_::ptr_as_hex(region.start()) + " to " + (
2128  (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) );
2129 }
2130 
2131 } // namespace detail_
2132 
2138 void prefetch(
2139  const_region_t region,
2140  const cuda::device_t& destination,
2141  const stream_t& stream);
2142 
2147 void prefetch_to_host(
2148  const_region_t region,
2149  const stream_t& stream);
2150 
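// Usage sketch (illustrative; assumes `device` is a cuda::device_t and `stream` a
// cuda::stream_t on that device):
//
//   auto region = cuda::memory::managed::allocate(device, 1u << 20);
//   cuda::memory::managed::prefetch(region, device, stream);   // migrate before use
//   // ... launch kernels accessing region.start() ...
//   cuda::memory::managed::prefetch_to_host(region, stream);
//   cuda::memory::managed::free(region);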
2151 } // namespace managed
2152 
2153 namespace mapped {
2154 
2162 template <typename T>
2163 inline T* device_side_pointer_for(T* host_memory_ptr)
2164 {
2165  auto unconsted_host_mem_ptr = const_cast<typename ::std::remove_const<T>::type *>(host_memory_ptr);
2166  device::address_t device_side_ptr;
 2167  auto get_device_pointer_flags = 0u; // currently must be 0 - see the CUDA driver API documentation
2168  auto status = cuMemHostGetDevicePointer(
2169  &device_side_ptr,
2170  unconsted_host_mem_ptr,
2171  get_device_pointer_flags);
2172  throw_if_error_lazy(status,
2173  "Failed obtaining the device-side pointer for host-memory pointer "
2174  + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
2175  return as_pointer(device_side_ptr);
2176 }
2177 
 2183 inline region_t device_side_region_for(region_t region)
 2184 {
2185  return { device_side_pointer_for(region.start()), region.size() };
2186 }
2187 
 2189 inline const_region_t device_side_region_for(const_region_t region)
 2190 {
2191  return { device_side_pointer_for(region.start()), region.size() };
2192 }
2193 
2194 namespace detail_ {
2195 
2205 inline region_pair_t allocate_in_current_context(
2206  context::handle_t current_context_handle,
2207  size_t size_in_bytes,
2208  allocation_options options)
2209 {
2210  region_pair_t allocated {};
2211  // The default initialization is unnecessary, but let's play it safe
2212  auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
2213  void* allocated_ptr;
2214  auto status = cuMemHostAlloc(&allocated_ptr, size_in_bytes, flags);
2215  if (is_success(status) && (allocated_ptr == nullptr)) {
2216  // Can this even happen? hopefully not
2217  status = static_cast<status_t>(status::named_t::unknown);
2218  }
2219  throw_if_error_lazy(status,
2220  "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
2221  + " bytes of global memory in " + context::detail_::identify(current_context_handle));
2222  allocated.host_side = { allocated_ptr, size_in_bytes };
2223  allocated.device_side = device_side_region_for(allocated.host_side);
2224  return allocated;
2225 }
2226 
2227 inline region_pair_t allocate(
2228  context::handle_t context_handle,
2229  size_t size_in_bytes,
2230  allocation_options options)
2231 {
2232  CAW_SET_SCOPE_CONTEXT(context_handle);
2233  return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
2234 }
2235 
2236 inline void free(void* host_side_pair)
2237 {
2238  auto result = cuMemFreeHost(host_side_pair);
2239  throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address "
2240  + cuda::detail_::ptr_as_hex(host_side_pair));
2241 }
2242 
2243 } // namespace detail_
2244 
 2254 region_pair_t allocate(
 2255  cuda::context_t& context,
2256  size_t size_in_bytes,
2257  allocation_options options);
2258 
 2267 region_pair_t allocate(
 2268  cuda::device_t& device,
 2269  size_t size_in_bytes,
 2270  allocation_options options = allocation_options{});
 2271 
2272 
2278 inline void free(region_pair_t pair)
2279 {
2280  detail_::free(pair.host_side.data());
2281 }
2282 
2289 inline void free_region_pair_of(void* ptr)
2290 {
2291  // TODO: What if the pointer is not part of a mapped region pair?
2292  // We could check this...
2293  void* host_side_ptr;
2294  auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr));
2295  throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer "
2296  + cuda::detail_::ptr_as_hex(ptr));
2297  detail_::free(host_side_ptr);
2298 }
2299 
2311 inline bool is_part_of_a_region_pair(const void* ptr)
2312 {
2313  auto wrapped_ptr = pointer_t<const void> { ptr };
2314  return wrapped_ptr.other_side_of_region_pair().get() != nullptr;
2315 }
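// Usage sketch (illustrative; assumes `device` is a cuda::device_t): allocating a
// mapped host+device region pair and using both sides:
//
//   auto pair = cuda::memory::mapped::allocate(device, 4096);
//   ::std::memset(pair.host_side.data(), 0, pair.host_side.size());  // CPU-side write
//   // pass pair.device_side.start() to a kernel - same physical memory
//   cuda::memory::mapped::free(pair);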
2316 
2317 } // namespace mapped
2318 
2319 namespace detail_ {
2335 template <typename T, typename RawDeleter, typename RegionAllocator>
2336 unique_span<T> make_convenient_type_unique_span(size_t size, RegionAllocator allocator)
2337 {
2338  memory::detail_::check_allocation_type<T>();
2339  auto deleter = [](span<T> sp) {
2340  return RawDeleter{}(sp.data());
2341  };
2342  region_t allocated_region = allocator(size * sizeof(T));
2343  return unique_span<T>(
2344  allocated_region.as_span<T>(), // no constructor calls - trivial construction
2345  deleter // no destructor calls - trivial destruction
2346  );
2347 }
2348 
2349 } // namespace detail_
2350 
2351 
2352 namespace device {
2353 
2354 namespace detail_ {
2355 
2356 template <typename T>
2357 unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
2358 {
2359  auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
2360  CAW_SET_SCOPE_CONTEXT(context_handle);
2361  return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocate_in_current_context_);
2362 }
2363 
2364 } // namespace detail_
2365 
2383 template <typename T>
2384 unique_span<T> make_unique_span(const context_t& context, size_t size);
2385 
2391 template <typename T>
2392 unique_span<T> make_unique_span(const device_t& device, size_t size);
2393 
2400 template <typename T>
2401 unique_span<T> make_unique_span(size_t size);
2402 
2403 } // namespace device
2404 
2406 template <typename T>
2407 inline unique_span<T> make_unique_span(const context_t& context, size_t size)
2408 {
2409  return device::make_unique_span<T>(context, size);
2410 }
2411 
2413 template <typename T>
2414 inline unique_span<T> make_unique_span(const device_t& device, size_t size)
2415 {
2416  return device::make_unique_span<T>(device, size);
2417 }
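// Usage sketch (illustrative; assumes `device` is a cuda::device_t, and that unique_span
// is span-like, exposing data() and size()): RAII-style allocation of a typed
// device-memory span, freed automatically on scope exit:
//
//   {
//       auto data = cuda::memory::make_unique_span<float>(device, 1000);
//       cuda::memory::device::zero(data.data(), data.size() * sizeof(float));
//   }   // device memory released here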
2418 
2419 namespace host {
2420 
2441 template <typename T>
2442 unique_span<T> make_unique_span(size_t size)
2443 {
 2444  // Need this because allocate takes more arguments and has default ones
2445  auto allocator = [](size_t size) { return allocate(size); };
2446  return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
2447 }
2448 
2449 } // namespace host
2450 
2451 namespace managed {
2452 
2453 namespace detail_ {
2454 
2455 template <typename T, initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
2456 unique_span<T> make_unique_span(
2457  const context::handle_t context_handle,
2458  size_t size)
2459 {
2460  CAW_SET_SCOPE_CONTEXT(context_handle);
2461  auto allocator = [](size_t size) {
2462  return allocate_in_current_context(size, InitialVisibility);
2463  };
2464  return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
2465 }
2466 
2467 } // namespace detail_
2468 
2491 template <typename T>
2492 unique_span<T> make_unique_span(
2493  const context_t& context,
2494  size_t size,
2495  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2496 
2502 template <typename T>
2503 unique_span<T> make_unique_span(
2504  const device_t& device,
2505  size_t size,
2506  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2507 
2514 template <typename T>
2515 unique_span<T> make_unique_span(
2516  size_t size,
2517  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2518 
2519 } // namespace managed
2520 
2521 } // namespace memory
2522 
2523 namespace symbol {
2524 
2532 template <typename T>
 2533 inline memory::region_t locate(T&& symbol)
 2534 {
2535  void *start;
2536  size_t symbol_size;
2537  auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2538  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
2539  api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
 2540  throw_if_error_lazy(api_call_result, "Could not determine the size of the symbol at device address "
2541  + cuda::detail_::ptr_as_hex(start));
2542  return { start, symbol_size };
2543 }
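// Usage sketch (illustrative; assumes device code defines `__device__ int counter;`
// and is compiled together with this host code):
//
//   auto region = cuda::symbol::locate(counter);
//   int initial = 0;
//   cuda::memory::copy(region, &initial, sizeof(initial));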
2544 
2545 } // namespace symbol
2546 
2547 } // namespace cuda
2548 
2549 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_