cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
25 #pragma once
26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
27 #define CUDA_API_WRAPPERS_MEMORY_HPP_
28 
29 #include "copy_parameters.hpp"
30 #include "array.hpp"
31 #include "constants.hpp"
32 #include "current_device.hpp"
33 #include "error.hpp"
34 #include "pointer.hpp"
35 #include "current_context.hpp"
36 
37 #include <cuda_runtime.h> // needed, rather than cuda_runtime_api.h, e.g. for cudaMalloc
38 #include <cuda.h>
39 
40 #include <memory>
41 #include <cstring> // for ::std::memset
42 #include <vector>
43 
44 namespace cuda {
45 
47 class device_t;
48 class context_t;
49 class stream_t;
50 class module_t;
52 
53 namespace memory {
54 
60 enum class portability_across_contexts : bool {
61  is_portable = true,
62  isnt_portable = false
63 };
64 
80 enum cpu_write_combining : bool {
81  with_wc = true,
82  without_wc = false
83 };
84 
89 struct allocation_options {
90  portability_across_contexts portability;
91  cpu_write_combining write_combining;
92 };
93 
94 namespace detail_ {
95 
96 inline unsigned make_cuda_host_alloc_flags(allocation_options options)
97 {
98  return
99  (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
100  (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
101 }
102 
103 } // namespace detail_
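// Usage sketch (names `options`/`flags` are illustrative): an allocation_options value
// requesting portable, write-combined host memory maps onto the CU_MEMHOSTALLOC_* flags:
//
//     auto options = cuda::memory::allocation_options{
//         cuda::memory::portability_across_contexts::is_portable,
//         cuda::memory::cpu_write_combining::with_wc };
//     auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
//     // flags == (CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED)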
104 
113 namespace mapped {
114 
115 // TODO: Perhaps make this an array of size 2 and use aspects to index it?
116 // Or maybe inherit a pair?
117 
125 struct region_pair {
126  void* host_side;
127  void* device_side;
128  size_t size_in_bytes;
129 };
130 
131 } // namespace mapped
132 
136 namespace device {
137 
138 namespace detail_ {
139 
145 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
146 {
147  device::address_t allocated = 0;
148  // Note: the typed cudaMalloc also takes its size in bytes, apparently,
149  // not in number of elements
150  auto status = cuMemAlloc(&allocated, num_bytes);
151  if (is_success(status) && allocated == 0) {
152  // Can this even happen? hopefully not
153  status = static_cast<status_t>(status::unknown);
154  }
155  throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) +
156  " bytes of global memory on the current CUDA device");
157  return {as_pointer(allocated), num_bytes};
158 }
159 
160 inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
161 {
162  CAW_SET_SCOPE_CONTEXT(context_handle);
163  return allocate_in_current_context(size_in_bytes);
164 }
165 
166 } // namespace detail_
167 
168 #if CUDA_VERSION >= 11020
169 namespace async {
170 
171 namespace detail_ {
172 
176 inline region_t allocate(
177  context::handle_t context_handle,
178  stream::handle_t stream_handle,
179  size_t num_bytes)
180 {
181  device::address_t allocated = 0;
182  // Note: the typed cudaMalloc also takes its size in bytes, apparently,
183  // not in number of elements
184  auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
185  if (is_success(status) && allocated == 0) {
186  // Can this even happen? hopefully not
187  status = static_cast<decltype(status)>(status::unknown);
188  }
189  throw_if_error_lazy(status,
190  "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
191  " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
192  return {as_pointer(allocated), num_bytes};
193 }
194 
195 } // namespace detail_
196 
210 region_t allocate(const stream_t& stream, size_t size_in_bytes);
211 
212 } // namespace async
213 #endif
214 
218 inline void free(void* ptr)
220 {
221  auto result = cuMemFree(address(ptr));
222 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
223  if (result == status::success) { return; }
224 #else
225  if (result == status::success or result == status::context_is_destroyed) { return; }
226 #endif
227  throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
228 }
229 inline void free(region_t region) { free(region.start()); }
231 
232 #if CUDA_VERSION >= 11020
233 namespace async {
234 
235 namespace detail_ {
236 
237 inline void free(
238  context::handle_t context_handle,
239  stream::handle_t stream_handle,
240  void* allocated_region_start)
241 {
242  auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
243  throw_if_error_lazy(status,
244  "Failed scheduling an asynchronous freeing of the global memory region starting at "
245  + cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
246  + stream::detail_::identify(stream_handle, context_handle) );
247 }
248 
249 } // namespace detail_
250 
258 void free(const stream_t& stream, void* region_start);
260 
261 inline void free(const stream_t& stream, region_t region)
262 {
263  free(stream, region.data());
264 }
266 
267 } // namespace async
268 #endif
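// Usage sketch (CUDA 11.2 and later; assumes a cuda::stream_t named `stream` and a byte
// count `buffer_size` defined by the caller). Allocation and deallocation are stream-ordered,
// so work enqueued between the two calls may safely use the region:
//
//     auto region = cuda::memory::device::async::allocate(stream, buffer_size);
//     // ... enqueue kernels or copies on `stream` which use region.start() ...
//     cuda::memory::device::async::free(stream, region);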
269 
270 
284 inline region_t allocate(const context_t& context, size_t size_in_bytes);
285 
299 inline region_t allocate(const device_t& device, size_t size_in_bytes);
300 
301 namespace detail_ {
302 
303 // Note: Allocates _in the current context_! No current context => failure!
304 struct allocator {
305  void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); }
306 };
307 struct deleter {
308  void operator()(void* ptr) const { cuda::memory::device::free(ptr); }
309 };
310 
311 } // namespace detail_
312 
313 
325 template <typename T>
326 void typed_set(T* start, const T& value, size_t num_elements);
327 
335 
341 inline void set(void* start, int byte_value, size_t num_bytes)
342 {
343  return typed_set<unsigned char>(static_cast<unsigned char*>(start), static_cast<unsigned char>(byte_value), num_bytes);
344 }
345 
349 inline void set(region_t region, int byte_value)
350 {
351  set(region.start(), byte_value, region.size());
352 }
354 
358 
362 inline void zero(void* start, size_t num_bytes)
363 {
364  set(start, 0, num_bytes);
365 }
366 
372 inline void zero(region_t region)
373 {
374  zero(region.start(), region.size());
375 }
377 
384 template <typename T>
385 inline void zero(T* ptr)
386 {
387  zero(ptr, sizeof(T));
388 }
389 
390 } // namespace device
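// Usage sketch (assumes a cuda::device_t named `device`, obtained elsewhere via the
// library's device wrappers, and a byte count `buffer_size`):
//
//     auto region = cuda::memory::device::allocate(device, buffer_size);
//     cuda::memory::device::zero(region);   // clear the freshly-allocated buffer
//     // ... use region.start() as a device pointer in kernel launches and copies ...
//     cuda::memory::device::free(region);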
391 
405 
410 void copy(void *destination, const void *source, size_t num_bytes);
411 
416 inline void copy(void* destination, const_region_t source)
417 {
418  return copy(destination, source.start(), source.size());
419 }
420 
428 inline void copy(region_t destination, const_region_t source)
429 {
430 #ifndef NDEBUG
431  if (destination.size() < source.size()) {
432  throw ::std::logic_error("Can't copy a large region into a smaller one");
433  }
434 #endif
435  return copy(destination.start(), source);
436 }
437 
443 template <typename T, size_t N>
444 inline void copy(region_t destination, const T(&source)[N])
445 {
446 #ifndef NDEBUG
447  if (destination.size() < sizeof(T) * N) {
448  throw ::std::logic_error("Source size exceeds destination size");
449  }
450 #endif
451  return copy(destination.start(), source, sizeof(T) * N);
452 }
453 
459 template <typename T, size_t N>
460 inline void copy(T(&destination)[N], const_region_t source)
461 {
462 #ifndef NDEBUG
463  size_t required_size = N * sizeof(T);
464  if (source.size() != required_size) {
465  throw ::std::invalid_argument(
466  "Attempt to copy a region of " + ::std::to_string(source.size()) +
467  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
468  }
469 #endif
470  return copy(destination, source.start(), sizeof(T) * N);
471 }
472 
473 template <typename T, size_t N>
474 inline void copy(void* destination, T (&source)[N])
475 {
476  return copy(destination, source, sizeof(T) * N);
477 }
478 
484 template <typename T, size_t N>
485 inline void copy(T(&destination)[N], T* source)
486 {
487  return copy(destination, source, sizeof(T) * N);
488 }
489 
496 inline void copy(region_t destination, void* source, size_t num_bytes)
497 {
498 #ifndef NDEBUG
499  if (destination.size() < num_bytes) {
500  throw ::std::logic_error("Number of bytes to copy exceeds destination size");
501  }
502 #endif
503  return copy(destination.start(), source, num_bytes);
504 }
505 
506 inline void copy(region_t destination, void* source)
507 {
508  return copy(destination, source, destination.size());
509 }
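// Usage sketch for the synchronous copy overloads (assumes `device_buffer` is a region_t
// of device memory whose size is exactly sizeof(host_data) bytes):
//
//     int host_data[1024] = {};
//     cuda::memory::copy(device_buffer, host_data);   // host array    -> device region
//     int host_copy[1024];
//     cuda::memory::copy(host_copy, device_buffer);   // device region -> host array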
511 
523 void set(void* ptr, int byte_value, size_t num_bytes);
524 
535 inline void set(region_t region, int byte_value)
536 {
537  return set(region.start(), byte_value, region.size());
538 }
539 
546 inline void zero(region_t region)
547 {
548  return set(region, 0);
549 }
550 
557 inline void zero(void* ptr, size_t num_bytes)
558 {
559  return set(ptr, 0, num_bytes);
560 }
561 
569 template <typename T>
570 inline void zero(T* ptr)
571 {
572  zero(ptr, sizeof(T));
573 }
574 
575 namespace detail_ {
576 
577 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params)
578 {
579  // TODO: Move this logic into the scoped ensurer class
580  auto context_handle = context::current::detail_::get_handle();
581  if (context_handle != context::detail_::none) {
582  return cuMemcpy2D(&params);
583  }
584  auto current_device_id = cuda::device::current::detail_::get_id();
585  context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
586  context::current::detail_::push(context_handle);
587  // Note this _must_ be an intra-context copy, as inter-context is not supported
588  // and there's no indication of context in the relevant data structures
589  auto status = cuMemcpy2D(&params);
590  context::current::detail_::pop();
591  cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
592  return status;
593 }
594 
595 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params)
596 {
597  if (params.srcContext == params.dstContext) {
598  context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
599  auto *intra_context_params = reinterpret_cast<base_copy_params<3>::intra_context_type *>(&params);
600  return cuMemcpy3D(intra_context_params);
601  }
602  return cuMemcpy3DPeer(&params);
603 }
604 
605 template<dimensionality_t NumDimensions>
606 status_t multidim_copy(copy_parameters_t<NumDimensions> params)
607 {
608  return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params);
609 }
610 
611 
612 } // namespace detail_
613 
624 template<dimensionality_t NumDimensions>
625 void copy(copy_parameters_t<NumDimensions> params)
626 {
627  status_t status = detail_::multidim_copy(params);
628  throw_if_error_lazy(status, "Copying using a general copy parameters structure");
629 }
630 
642 template<typename T, dimensionality_t NumDimensions>
643 void copy(const array_t<T, NumDimensions>& destination, const context_t& source_context, const T *source)
644 {
645  auto dims = destination.dimensions();
646  auto params = copy_parameters_t<NumDimensions> {};
647  params.clear_offsets();
648  params.template set_extent<T>(dims);
649  params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
650  params.set_endpoint(endpoint_t::destination, destination);
651  params.clear_rest();
652  copy(params);
653 }
654 
665 template<typename T, dimensionality_t NumDimensions>
666 void copy(const array_t<T, NumDimensions>& destination, const T *source)
667 {
668  copy(destination, context_of(source), source);
669 }
670 
681 template <typename T, dimensionality_t NumDimensions>
682 void copy(const context_t& context, T *destination, const array_t<T, NumDimensions>& source)
683 {
684  auto dims = source.dimensions();
685  auto params = copy_parameters_t<NumDimensions> {};
686  params.clear_offset(endpoint_t::source);
687  params.clear_offset(endpoint_t::destination);
688  params.template set_extent<T>(dims);
689  params.set_endpoint(endpoint_t::source, source);
690  params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
691  params.set_default_pitches();
692  params.clear_rest();
693  copy(params);
694 }
695 
706 template <typename T, dimensionality_t NumDimensions>
707 void copy(T *destination, const array_t<T, NumDimensions>& source)
708 {
709  copy(context_of(destination), destination, source);
710 }
711 
712 template <typename T, dimensionality_t NumDimensions>
713 void copy(const array_t<T, NumDimensions>& destination, const array_t<T, NumDimensions>& source)
714 {
715  auto dims = source.dimensions();
716  auto params = copy_parameters_t<NumDimensions> {};
717  params.clear_offset(endpoint_t::source);
718  params.clear_offset(endpoint_t::destination);
719  params.template set_extent<T>(dims);
720  params.set_endpoint(endpoint_t::source, source);
721  params.set_endpoint(endpoint_t::destination, destination);
722  params.set_default_pitches();
723  params.clear_rest();
724  auto status = detail_::multidim_copy<NumDimensions>(params);
726  throw_if_error_lazy(status, "Copying between two CUDA arrays");
727 }
728 
729 template <typename T, dimensionality_t NumDimensions>
730 void copy(region_t destination, const array_t<T, NumDimensions>& source)
731 {
732  if (destination.size() < source.size_bytes()) {
733  throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy");
734  }
735  copy(destination.start(), source);
736 }
737 
738 template <typename T, dimensionality_t NumDimensions>
739 void copy(const array_t<T, NumDimensions>& destination, const_region_t source)
740 {
741  if (destination.size_bytes() < source.size()) {
742  throw ::std::logic_error("Attempt to copy into an array from a source region larger than the array's size");
743  }
744  copy(destination, source.start());
745 }
746 
755 template <typename T>
756 void copy_single(T* destination, const T* source)
757 {
758  copy(destination, source, sizeof(T));
759 }
760 
761 namespace async {
762 
763 namespace detail_ {
764 
778 
787 inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
788 {
789  auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);
790 
791  // TODO: Determine whether it was from host to device, device to host etc and
792  // add this information to the error string
793  throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
794 }
795 
803 inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
804 {
805 #ifndef NDEBUG
806  if (destination.size() < source.size()) {
807  throw ::std::logic_error("Source size exceeds destination size");
808  }
809 #endif
810  copy(destination.start(), source.start(), source.size(), stream_handle);
811 }
813 
815 
816 inline status_t multidim_copy_in_current_context(
817  ::std::integral_constant<dimensionality_t, 2>,
818  copy_parameters_t<2> params,
819  stream::handle_t stream_handle)
820 {
821  // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters
822  // structure holds no information about contexts.
823  return cuMemcpy2DAsync(&params, stream_handle);
824 }
825 
826 inline status_t multidim_copy_in_current_context(
827  ::std::integral_constant<dimensionality_t, 3>,
828  copy_parameters_t<3> params,
829  stream::handle_t stream_handle)
830 {
831  if (params.srcContext == params.dstContext) {
832  using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
833  auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
834  return cuMemcpy3DAsync(intra_context_params, stream_handle);
835  }
836  return cuMemcpy3DPeerAsync(&params, stream_handle);
837 
838 }
839 
840 template<dimensionality_t NumDimensions>
841 status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle) {
842  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
843 }
844 
845 // Note: Assumes the stream handle is for a stream in the current context
846 template<dimensionality_t NumDimensions>
847 status_t multidim_copy(
848  context::handle_t context_handle,
849  copy_parameters_t<NumDimensions> params,
850  stream::handle_t stream_handle)
851 {
852  CAW_SET_SCOPE_CONTEXT(context_handle);
853  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
854 }
855 
856 
857 // Assumes the array and the stream share the same context, and that the destination is
858 // accessible from that context (e.g. allocated within it, or being managed memory, etc.)
859 template <typename T, dimensionality_t NumDimensions>
860 void copy(T *destination, const array_t<T, NumDimensions>& source, stream::handle_t stream_handle)
861 {
862  using memory::endpoint_t;
863  auto dims = source.dimensions();
864  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
865  auto params = copy_parameters_t<NumDimensions> {};
866  params.clear_offset(endpoint_t::source);
867  params.clear_offset(endpoint_t::destination);
868  params.template set_extent<T>(dims);
869  params.set_endpoint(endpoint_t::source, source);
870  params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
871  params.set_default_pitches();
872  params.clear_rest();
873  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
874  throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
875 }
876 
877 
878 template <typename T, dimensionality_t NumDimensions>
879 void copy(const array_t<T, NumDimensions>& destination, const T* source, stream::handle_t stream_handle)
880 {
881  using memory::endpoint_t;
882  auto dims = destination.dimensions();
883  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
884  auto params = copy_parameters_t<NumDimensions>{};
885  params.clear_offset(endpoint_t::source);
886  params.clear_offset(endpoint_t::destination);
887  params.template set_extent<T>(dims);
888  params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
889  params.set_endpoint(endpoint_t::destination, destination);
890  params.set_default_pitches();
891  params.clear_rest();
892  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
893  throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
894 }
895 
910 template <typename T>
911 void copy_single(T& destination, const T& source, stream::handle_t stream_handle)
912 {
913  copy(&destination, &source, sizeof(T), stream_handle);
914 }
915 
916 } // namespace detail_
917 
936 void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream);
938 
939 inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream)
940 {
941 #ifndef NDEBUG
942  if (source.size() < num_bytes) {
943  throw ::std::logic_error("Attempt to copy more than the source region's size");
944  }
945 #endif
946  copy(destination, source.start(), num_bytes, stream);
947 }
948 
949 inline void copy(region_t destination, const_region_t source, size_t num_bytes, const stream_t& stream)
950 {
951 #ifndef NDEBUG
952  if (destination.size() < num_bytes) {
953  throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
954  }
955 #endif
956  copy(destination.start(), source.start(), num_bytes, stream);
957 }
958 
959 inline void copy(void* destination, const_region_t source, const stream_t& stream)
960 {
961  copy(destination, source, source.size(), stream);
962 }
963 
964 inline void copy(region_t destination, const_region_t source, const stream_t& stream)
965 {
966  copy(destination, source, source.size(), stream);
967 }
968 
969 inline void copy(region_t destination, void* source, const stream_t& stream)
970 {
971  return copy(destination.start(), source, destination.size(), stream);
972 }
973 
974 
978 template <typename T, size_t N>
979 inline void copy(region_t destination, const T(&source)[N], const stream_t& stream)
980 {
981 #ifndef NDEBUG
982  if (destination.size() < sizeof(T) * N) {
983  throw ::std::logic_error("Source size exceeds destination size");
984  }
985 #endif
986  return copy(destination.start(), source, sizeof(T) * N, stream);
987 }
988 
989 inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream)
990 {
991 #ifndef NDEBUG
992  if (destination.size() < num_bytes) {
993  throw ::std::logic_error("Number of bytes to copy exceeds destination size");
994  }
995 #endif
996  return copy(destination.start(), source, num_bytes, stream);
997 }
999 
1009 template <typename T, dimensionality_t NumDimensions>
1010 void copy(array_t<T, NumDimensions>& destination, const T* source, const stream_t& stream);
1011 
1012 template <typename T, dimensionality_t NumDimensions>
1013 void copy(array_t<T, NumDimensions>& destination, const_region_t source, const stream_t& stream)
1014 {
1015 #ifndef NDEBUG
1016  size_t required_size = destination.size() * sizeof(T);
1017  if (source.size() != required_size) {
1018  throw ::std::invalid_argument(
1019  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1020  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
1021  }
1022 #endif
1023  copy(destination, source.start(), stream);
1024 }
1025 
1035 template <typename T, dimensionality_t NumDimensions>
1036 void copy(T* destination, const array_t<T, NumDimensions>& source, const stream_t& stream);
1037 
1038 template <typename T, dimensionality_t NumDimensions>
1039 void copy(region_t destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
1040 {
1041 #ifndef NDEBUG
1042  size_t required_size = source.size() * sizeof(T);
1043  if (destination.size() < required_size) {
1044  throw ::std::invalid_argument(
1045  "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a "
1046  "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
1047  }
1048 #endif
1049  copy(destination.start(), source, stream);
1050 }
1051 
1057 template <typename T, size_t N>
1058 inline void copy(T(&destination)[N], T* source, const stream_t& stream)
1059 {
1060  return copy(destination, source, sizeof(T) * N, stream);
1061 }
1062 
1068 template <typename T, size_t N>
1069 inline void copy(T(&destination)[N], const_region_t source, const stream_t& stream)
1070 {
1071 #ifndef NDEBUG
1072  size_t required_size = N * sizeof(T);
1073  if (source.size() != required_size) {
1074  throw ::std::invalid_argument(
1075  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1076  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
1077  }
1078 #endif
1079  return copy(destination, source.start(), sizeof(T) * N, stream);
1080 }
1081 
1082 
1094 template <typename T>
1095 void copy_single(T& destination, const T& source, const stream_t& stream);
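// Usage sketch (assumes `destination_ptr`, `source_ptr`, `num_bytes` and a cuda::stream_t
// named `stream` are provided by the caller, and that both buffers remain valid until the
// copy has actually completed):
//
//     cuda::memory::async::copy(destination_ptr, source_ptr, num_bytes, stream);
//     // ... later: synchronize with `stream` before reading the destination ...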
1096 
1097 } // namespace async
1098 
1099 namespace device {
1100 
1101 namespace async {
1102 
1103 namespace detail_ {
1104 
1105 inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
1106 {
1107  // TODO: Double-check that this call doesn't require setting the current device
1108  auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
1109  throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer");
1110 }
1111 
1112 inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
1113 {
1114  set(region.start(), byte_value, region.size(), stream_handle);
1115 }
1116 
1117 inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
1118 {
1119  set(start, 0, num_bytes, stream_handle);
1120 }
1121 
1122 inline void zero(region_t region, stream::handle_t stream_handle)
1123 {
1124  zero(region.start(), region.size(), stream_handle);
1125 }
1126 
1127 // TODO: Drop this in favor of <algorithm>-like functions under `cuda::`.
1128 template <typename T>
1129 inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
1130 {
1131  static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
1132  static_assert(
1133  sizeof(T) == 1 or sizeof(T) == 2 or sizeof(T) == 4,
1135  "Unsupported type size - only sizes 1, 2 and 4 are supported");
1136  // TODO: Consider checking for alignment when compiling without NDEBUG
1137  status_t result = static_cast<status_t>(cuda::status::success);
1138  switch(sizeof(T)) {
1139  case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
1140  case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
1141  case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
1142  }
1143  throw_if_error_lazy(result, "Setting global device memory bytes");
1144 }
1145 
1146 } // namespace detail_
1147 
1148 
1161 template <typename T>
1162 void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream);
1163 
1175 inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream)
1176 {
1177  return typed_set<unsigned char>(
1178  static_cast<unsigned char*>(start),
1179  static_cast<unsigned char>(byte_value),
1180  num_bytes,
1181  stream);
1182 }
1183 
1187 void zero(void* start, size_t num_bytes, const stream_t& stream);
1188 
1199 template <typename T>
1200 inline void zero(T* ptr, const stream_t& stream)
1201 {
1202  zero(ptr, sizeof(T), stream);
1203 }
1204 
1205 } // namespace async
1206 
1207 
1208 } // namespace device
1209 
1210 namespace inter_context {
1211 
1212 namespace detail_ {
1213 
1214 inline void copy(
1215  void * destination_address,
1216  context::handle_t destination_context,
1217  const void * source_address,
1218  context::handle_t source_context,
1219  size_t num_bytes)
1220 {
1221  auto status = cuMemcpyPeer(
1222  reinterpret_cast<device::address_t>(destination_address),
1223  destination_context,
1224  reinterpret_cast<device::address_t>(source_address),
1225  source_context, num_bytes);
1226  throw_if_error_lazy(status,
1227  ::std::string("Failed copying data between devices: From address ")
1228  + cuda::detail_::ptr_as_hex(source_address) + " in "
1229  + context::detail_::identify(source_context) + " to address "
1230  + cuda::detail_::ptr_as_hex(destination_address) + " in "
1231  + context::detail_::identify(destination_context) );
1232 }
1233 
1234 } // namespace detail_
1235 
1236 void copy(
1237  void * destination,
1238  const context_t& destination_context,
1239  const void * source_address,
1240  const context_t& source_context,
1241  size_t num_bytes);
1242 
1243 inline void copy(
1244  void * destination,
1245  const context_t& destination_context,
1246  const_region_t source,
1247  const context_t& source_context)
1248 {
1249  copy(destination, destination_context, source.start(), source_context, source.size());
1250 }
1251 
1252 inline void copy(
1253  region_t destination,
1254  const context_t& destination_context,
1255  const_region_t source,
1256  const context_t& source_context)
1257 {
1258 #ifndef NDEBUG
1259  if (destination.size() < source.size()) {
1260  throw ::std::invalid_argument(
1261  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1262  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1263  }
1264 #endif
1265  copy(destination.start(), destination_context, source, source_context);
1266 }
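// Usage sketch (assumes two cuda::context_t wrappers and raw device pointers allocated
// within each of them):
//
//     cuda::memory::inter_context::copy(
//         destination_ptr, destination_context,
//         source_ptr, source_context, num_bytes);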
1267 
1268 template <typename T, dimensionality_t NumDimensions>
1269 inline void copy(
1270  array_t<T, NumDimensions> destination,
1271  array_t<T, NumDimensions> source)
1272 {
1273  // for arrays, a single mechanism handles both intra- and inter-context copying
1274  return memory::copy(destination, source);
1275 }
1276 
1277 namespace async {
1278 
1279 namespace detail_ {
1280 
1281 inline void copy(
1282  void *destination,
1283  context::handle_t destination_context_handle,
1284  const void *source,
1285  context::handle_t source_context_handle,
1286  size_t num_bytes,
1287  stream::handle_t stream_handle)
1288 {
1289  auto result = cuMemcpyPeerAsync(
1290  device::address(destination),
1291  destination_context_handle,
1292  device::address(source),
1293  source_context_handle,
1294  num_bytes, stream_handle);
1295 
1296  // TODO: Determine whether it was from host to device, device to host etc and
1297  // add this information to the error string
1298  throw_if_error_lazy(result, "Scheduling an inter-context memory copy from "
1299  + context::detail_::identify(source_context_handle) + " to "
1300  + context::detail_::identify(destination_context_handle) + " on "
1301  + stream::detail_::identify(stream_handle));
1302 }
1303 
1311 inline void copy(
1312  region_t destination,
1313  context::handle_t destination_context_handle,
1314  const_region_t source,
1315  context::handle_t source_context_handle,
1316  stream::handle_t stream_handle)
1317 {
1318 #ifndef NDEBUG
1319  if (destination.size() < source.size()) {
1320  throw ::std::logic_error("Can't copy a large region into a smaller one");
1321  }
1322 #endif
1323  copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(),
1324  stream_handle);
1325 }
1327 
1328 } // namespace detail_
1329 
1330 void copy(
1331  void * destination_address,
1332  context_t destination_context,
1333  const void * source_address,
1334  context_t source_context,
1335  size_t num_bytes,
1336  const stream_t& stream);
1337 
1338 void copy(
1339  void * destination,
1340  context_t destination_context,
1341  const_region_t source,
1342  context_t source_context,
1343  const stream_t& stream);
1344 
1345 inline void copy(
1346  region_t destination,
1347  context_t destination_context,
1348  const_region_t source,
1349  context_t source_context,
1350  const stream_t& stream);
1351 
1352 template <typename T, dimensionality_t NumDimensions>
1353 inline void copy(
1354  array_t<T, NumDimensions> destination,
1355  array_t<T, NumDimensions> source,
1356  const stream_t& stream)
1357 {
1358  // for arrays, a single mechanism handles both intra- and inter-context copying
1359  return memory::async::copy(destination, source, stream);
1360 }
1361 
1362 
1363 } // namespace async
1364 
1365 } // namespace inter_context
1366 
1372 namespace host {
1373 
1395 region_t allocate(
1396  size_t size_in_bytes,
1397  allocation_options options);
1398 
1399 
1400 inline region_t allocate(
1401  size_t size_in_bytes,
1402  portability_across_contexts portability = portability_across_contexts(false),
1403  cpu_write_combining cpu_wc = cpu_write_combining(false))
1404 {
1405  return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } );
1406 }
1407 
1408 inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc)
1409 {
1410  return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} );
1411 }
1412 
1418 inline void free(void* host_ptr)
1419 {
1420  auto result = cuMemFreeHost(host_ptr);
1421 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
1422  if (result == status::success) { return; }
1423 #else
1424  if (result == status::success or result == status::context_is_destroyed) { return; }
1425 #endif
1426  throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1427 }
1428 
1429 inline void free(region_t region) { return free(region.data()); }
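// Usage sketch (assumes `device_buffer` is a device-side region_t of at least `num_bytes`
// and `stream` is a cuda::stream_t): pinned host memory is a natural staging buffer for
// asynchronous copies:
//
//     auto staging = cuda::memory::host::allocate(num_bytes);
//     // ... fill staging.data() on the host ...
//     cuda::memory::async::copy(device_buffer, staging.data(), num_bytes, stream);
//     // ... synchronize with `stream`, then:
//     cuda::memory::host::free(staging);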
1430 
1431 namespace detail_ {
1432 
1433 struct allocator {
1434  void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes).data(); }
1435 };
1436 struct deleter {
1437  void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
1438 };
1439 
1440 
1452 inline void register_(const void *ptr, size_t size, unsigned flags)
1453 {
1454  auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
1455  throw_if_error_lazy(result,
1456  "Could not register and page-lock the region of " + ::std::to_string(size) +
1457  " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
1458  " with flags " + cuda::detail_::as_hex(flags));
1459 }
1460 
1461 inline void register_(const_region_t region, unsigned flags)
1462 {
1463  register_(region.start(), region.size(), flags);
1464 }
1465 
1466 } // namespace detail_
1467 
1473 enum mapped_io_space : bool {
1474  is_mapped_io_space = true,
1475  is_not_mapped_io_space = false
1476 };
1477 
1483 enum map_into_device_memory : bool {
1484  map_into_device_memory = true,
1485  do_not_map_into_device_memory = false
1486 };
1487 
1493 enum accessibility_on_all_devices : bool {
1494  is_accessible_on_all_devices = true,
1495  is_not_accessible_on_all_devices = false
1496 };
1497 
1498 
1499 // Can't use register(), since that's a reserved word
1500 inline void register_(const void *ptr, size_t size,
1501  bool register_mapped_io_space,
1502  bool map_into_device_space,
1503  bool make_device_side_accessible_to_all
1504 #if CUDA_VERSION >= 11010
1505  , bool considered_read_only_by_device
1506 #endif // CUDA_VERSION >= 11010
1507  )
1508 {
1509  detail_::register_(
1510  ptr, size,
1511  (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1512  | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1513  | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1514 #if CUDA_VERSION >= 11010
1515  | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
1516 #endif // CUDA_VERSION >= 11010
1517  );
1518 }
1519 
1520 inline void register_(
1521  const_region_t region,
1522  bool register_mapped_io_space,
1523  bool map_into_device_space,
1524  bool make_device_side_accessible_to_all
1525 #if CUDA_VERSION >= 11010
1526  , bool considered_read_only_by_device
1527 #endif // CUDA_VERSION >= 11010
1528  )
1529 {
1530  register_(
1531  region.start(),
1532  region.size(),
1533  register_mapped_io_space,
1534  map_into_device_space,
1535  make_device_side_accessible_to_all
1536 #if CUDA_VERSION >= 11010
1537  , considered_read_only_by_device
1538 #endif // CUDA_VERSION >= 11010
1539  );
1540 }
1541 
1542 
1543 inline void register_(void const *ptr, size_t size)
1544 {
1545  unsigned no_flags_set { 0 };
1546  detail_::register_(ptr, size, no_flags_set);
1547 }
1548 
1549 inline void register_(const_region_t region)
1550 {
1551  register_(region.start(), region.size());
1552 }
1553 
1554 // the CUDA API calls this "unregister", but that's semantically
1555 // inaccurate. The registration is not undone, rolled back, it's
1556 // just ended
1557 inline void deregister(const void *ptr)
1558 {
1559  auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1560  throw_if_error_lazy(result,
1561  "Could not unregister the memory segment starting at address "
1562  + cuda::detail_::ptr_as_hex(ptr));
1562 }
1563 
1564 inline void deregister(const_region_t region)
1565 {
1566  deregister(region.start());
1567 }
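// Usage sketch: an existing host-side buffer can be page-locked ("registered") so that the
// CUDA driver may DMA directly to and from it, and deregistered when no longer needed:
//
//     ::std::vector<float> samples(1u << 20);
//     cuda::memory::host::register_(samples.data(), samples.size() * sizeof(float));
//     // ... use samples.data() as the source or target of (asynchronous) copies ...
//     cuda::memory::host::deregister(samples.data());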
1568 
1579 inline void set(void* start, int byte_value, size_t num_bytes)
1580 {
1581  ::std::memset(start, byte_value, num_bytes);
1582  // TODO: Error handling?
1583 }
1584 
1585 inline void zero(void* start, size_t num_bytes)
1586 {
1587  set(start, 0, num_bytes);
1588 }
1589 
1590 template <typename T>
1591 inline void zero(T* ptr)
1592 {
1593  zero(ptr, sizeof(T));
1594 }
1595 
1596 
1597 } // namespace host
1598 
1615 namespace managed {
1616 
1617 struct const_region_t;
1618 
1619 namespace detail_ {
1620 
1621 using advice_t = CUmem_advise;
1622 
1623 template <typename T>
1624 inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute);
1625 
1626 inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id);
1627 // inline void advise(managed::const_region_t region, advice_t attribute);
1628 
1629 template <typename T>
1630 struct base_region_t : public memory::detail_::base_region_t<T> {
1631  using parent = memory::detail_::base_region_t<T>;
1632  using parent::parent;
1633 
1634  bool is_read_mostly() const
1635  {
1636  return get_scalar_range_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1637  }
1638 
1639  void designate_read_mostly() const
1640  {
1641  set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1642  }
1643 
1644  void undesignate_read_mostly() const
1645  {
1646  unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1647  }
1648 
1649  device_t preferred_location() const;
1650  void set_preferred_location(device_t& device) const;
1651  void clear_preferred_location() const;
1652 
1653  // TODO: Consider using a field proxy
1654 };
1655 
1656 } // namespace detail_
1657 
1658 struct region_t : public detail_::base_region_t<void> {
1659  using base_region_t<void>::base_region_t;
1660  operator memory::region_t() { return memory::region_t{ start(), size() }; }
1661 };
1662 
1663 struct const_region_t : public detail_::base_region_t<void const> {
1664  using base_region_t<void const>::base_region_t;
1665  const_region_t(const region_t& r) : detail_::base_region_t<void const>(r.start(), r.size()) {}
1666 };
1667 
1668 void advise_expected_access_by(managed::const_region_t region, device_t& device);
1669 void advise_no_access_expected_by(managed::const_region_t region, device_t& device);
1670 
1671 template <typename Allocator = ::std::allocator<cuda::device_t> >
1672  typename ::std::vector<device_t, Allocator> accessors(managed::const_region_t region, const Allocator& allocator = Allocator() );
1673 
1674 namespace detail_ {
1675 
1676 template <typename T>
1677 inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute)
1678 {
1679  uint32_t attribute_value { 0 };
1680  auto result = cuMemRangeGetAttribute(
1681  &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size());
1682  throw_if_error_lazy(result,
1683  "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
1684  return static_cast<T>(attribute_value);
1685 }
1686 
1687 // CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's
1688 // not called cuMemRangeSetAttribute, and uses a different enum.
1689 inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id)
1690 {
1691  auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id);
1692  throw_if_error_lazy(result, "Setting an attribute for a managed memory range at "
1693  + cuda::detail_::ptr_as_hex(region.start()));
1694 }
1695 
1696 // inline void set_range_attribute(managed::const_region_t region, range_attribute_t attribute, cuda::device::handle_t device_id)
1697 
1698 inline advice_t as_advice(range_attribute_t attribute, bool set)
1699 {
1700  switch (attribute) {
1701  case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1702  return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1703  case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1704  return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1705  case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1706  return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1707  default:
1708  throw ::std::invalid_argument(
1709  "CUDA memory range attribute does not correspond to any range advice value");
1710  }
1711 }
1712 
1713 inline void set_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute, cuda::device::id_t device_id)
1714 {
1715  static constexpr const bool set { true };
1716  advise(region, as_advice(settable_attribute, set), device_id);
1717 }
1718 
1719 inline void unset_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute)
1720 {
1721  static constexpr const bool unset { false };
1722  static constexpr const cuda::device::id_t dummy_device_id { 0 };
1723  advise(region, as_advice(settable_attribute, unset), dummy_device_id);
1724 }
1725 
1726 } // namespace detail_
1727 
1728 
1729 enum class attachment_t : unsigned {
1730  global = CU_MEM_ATTACH_GLOBAL,
1731  host = CU_MEM_ATTACH_HOST,
1732  single_stream = CU_MEM_ATTACH_SINGLE,
1733  };
1734 
1735 
1736 namespace detail_ {
1737 
1738 inline region_t allocate_in_current_context(
1739  size_t num_bytes,
1740  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
1741 {
1742  device::address_t allocated = 0;
1743  auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
1744  attachment_t::global : attachment_t::host;
1745  // This is necessary because managed allocation requires at least one (primary)
1746  // context to have been constructed. We could theoretically check what our current
1747  // context is etc., but that would be brittle, since someone can managed-allocate,
1748  // then change contexts, then de-allocate, and we can't be certain that whoever
1749  // called us will call free
1750  cuda::device::primary_context::detail_::increase_refcount(cuda::device::default_device_id);
1751 
1752  // Note: Despite the templating by T, the size is still in bytes,
1753  // not in number of T's
1754  auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
1755  if (is_success(status) && allocated == 0) {
1756  // Can this even happen? hopefully not
1757  status = static_cast<status_t>(status::unknown);
1758  }
1759  throw_if_error_lazy(status, "Failed allocating "
1760  + ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
1761  return {as_pointer(allocated), num_bytes};
1762 }
1763 
1769 inline void free(void* ptr)
1771 {
1772  auto result = cuMemFree(device::address(ptr));
1773  cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id);
1774  throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
1775 }
1776 inline void free(region_t region)
1777 {
1778  free(region.start());
1779 }
1781 
1782 template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
1783 struct allocator {
1784  // Allocates in the current context!
1785  void* operator()(size_t num_bytes) const
1786  {
1787  return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
1788  }
1789 };
1790 
1791 struct deleter {
1792  void operator()(void* ptr) const { memory::device::free(ptr); }
1793 };
1794 
1795 inline region_t allocate(
1796  context::handle_t context_handle,
1797  size_t num_bytes,
1798  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
1799 {
1800  CAW_SET_SCOPE_CONTEXT(context_handle);
1801  return allocate_in_current_context(num_bytes, initial_visibility);
1802 }
1803 
1804 } // namespace detail_
1805 
1819 inline region_t allocate(
1820  const context_t& context,
1821  size_t num_bytes,
1822  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
1823 
1837 inline region_t allocate(
1838  const device_t& device,
1839  size_t num_bytes,
1840  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
1841 
1851 region_t allocate(size_t num_bytes);
1852 
1858 inline void free(void* managed_ptr)
1859 {
1860  auto result = cuMemFree(device::address(managed_ptr));
1861  throw_if_error_lazy(result,
1862  "Freeing managed memory (host and device regions) at address "
1863  + cuda::detail_::ptr_as_hex(managed_ptr));
1864 }
1865 
1866 inline void free(region_t region)
1867 {
1868  free(region.start());
1869 }
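// Usage sketch (assumes kernels are launched and synchronized-with elsewhere): managed
// ("unified") memory is addressable from both host code and device code:
//
//     auto region = cuda::memory::managed::allocate(num_bytes);
//     auto as_ints = static_cast<int*>(region.start());
//     as_ints[0] = 123;   // touch the memory from the host
//     // ... launch a kernel using the same addresses, then synchronize ...
//     cuda::memory::managed::free(region);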
1870 
1871 namespace advice {
1872 
1873 enum kind_t {
1874  read_mostly = CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
1875  preferred_location = CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
1876  accessor = CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
1877  // Note: CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION is never set
1878 };
1879 
1880 namespace detail_ {
1881 
1882 inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id)
1883 {
1884  auto result = cuMemAdvise(device::address(region.start()), region.size(),
1885  static_cast<managed::detail_::advice_t>(advice), device_id);
1886  throw_if_error_lazy(result, "Setting advice on a (managed) memory region at "
1887  + cuda::detail_::ptr_as_hex(region.start()) + " w.r.t. " + cuda::device::detail_::identify(device_id));
1888 }
1889 
1890 } // namespace detail_
1891 
1892 void set(const_region_t region, kind_t advice, const device_t& device);
1893 
1894 } // namespace advice
1895 
1896 namespace async {
1897 
1898 namespace detail_ {
1899 
1900 inline void prefetch(
1901  const_region_t region,
1902  cuda::device::id_t destination,
1903  stream::handle_t source_stream_handle)
1904 {
1905  auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle);
1906  throw_if_error_lazy(result,
1907  "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
1908  + cuda::detail_::ptr_as_hex(region.start()) + " to " + (
1909  (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) );
1910 }
1911 
1912 } // namespace detail_
1913 
1919 void prefetch(
1920  const_region_t region,
1921  const cuda::device_t& destination,
1922  const stream_t& stream);
1923 
1928 void prefetch_to_host(
1929  const_region_t region,
1930  const stream_t& stream);
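// Usage sketch (assumes `region` is a managed memory region and `device` / `stream` are
// wrappers obtained elsewhere): prefetching lets the driver migrate managed memory ahead
// of its use on either side:
//
//     cuda::memory::managed::async::prefetch(region, device, stream);   // towards the GPU
//     cuda::memory::managed::async::prefetch_to_host(region, stream);   // back to system memory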
1931 
1932 } // namespace async
1933 
1934 } // namespace managed
1935 
1936 namespace mapped {
1937 
1942 template <typename T>
1943 inline T* device_side_pointer_for(T* host_memory_ptr)
1944 {
1945  device::address_t device_side_ptr;
1946  auto get_device_pointer_flags = 0u; // the flags argument must currently be 0 - see the CUDA driver API documentation
1947  auto status = cuMemHostGetDevicePointer(
1948  &device_side_ptr,
1949  host_memory_ptr,
1950  get_device_pointer_flags);
1951  throw_if_error_lazy(status,
1952  "Failed obtaining the device-side pointer for host-memory pointer "
1953  + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
1954  return as_pointer(device_side_ptr);
1955 }
1956 
1957 namespace detail_ {
1958 
1968 inline region_pair allocate_in_current_context(
1969  context::handle_t current_context_handle,
1970  size_t size_in_bytes,
1971  allocation_options options)
1972 {
1973  region_pair allocated {};
1974  // The default initialization is unnecessary, but let's play it safe
1975  allocated.size_in_bytes = size_in_bytes;
1976  auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
1977  auto status = cuMemHostAlloc(&allocated.host_side, size_in_bytes, flags);
1978  if (is_success(status) && (allocated.host_side == nullptr)) {
1979  // Can this even happen? hopefully not
1980  status = static_cast<status_t>(status::named_t::unknown);
1981  }
1982  throw_if_error_lazy(status,
1983  "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
1984  + " bytes of global memory in " + context::detail_::identify(current_context_handle));
1985  allocated.device_side = device_side_pointer_for(allocated.host_side);
1986  return allocated;
1987 }
1988 
1989 inline region_pair allocate(
1990  context::handle_t context_handle,
1991  size_t size_in_bytes,
1992  allocation_options options)
1993 {
1994  CAW_SET_SCOPE_CONTEXT(context_handle);
1995  return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
1996 }
1997 
1998 inline void free(void* host_side_pair)
1999 {
2000  auto result = cuMemFreeHost(host_side_pair);
2001  throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address "
2002  + cuda::detail_::ptr_as_hex(host_side_pair));
2003 }
2004 
2005 } // namespace detail_
2006 
2016 region_pair allocate(
2017  cuda::context_t& context,
2018  size_t size_in_bytes,
2019  allocation_options options);
2020 
2029 region_pair allocate(
2030  cuda::device_t& device,
2031  size_t size_in_bytes,
2032  allocation_options options);
2033 
2034 
2041 inline void free(region_pair pair)
2042 {
2043  detail_::free(pair.host_side);
2044 }
2045 
2052 inline void free_region_pair_of(void* ptr)
2053 {
2054  // TODO: What if the pointer is not part of a mapped region pair?
2055  // We could check this...
2056  void* host_side_ptr;
2057  auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr));
2058  throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer "
2059  + cuda::detail_::ptr_as_hex(ptr));
2060  detail_::free(host_side_ptr);
2061 }
2062 
2074 inline bool is_part_of_a_region_pair(const void* ptr)
2075 {
2076  auto wrapped_ptr = pointer_t<const void> { ptr };
2077  return wrapped_ptr.other_side_of_region_pair().get() != nullptr;
2078 }
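// Usage sketch (assumes a cuda::device_t named `device` and a byte count `num_bytes`):
// a mapped pair exposes the same pinned host allocation under two addresses, one per side:
//
//     auto pair = cuda::memory::mapped::allocate(device, num_bytes, cuda::memory::allocation_options{});
//     // write through pair.host_side on the CPU; pass pair.device_side to kernels
//     cuda::memory::mapped::free(pair);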
2079 
2080 } // namespace mapped
2081 
2082 
2083 } // namespace memory
2084 
2085 namespace symbol {
2093 template <typename T>
2094 inline memory::region_t locate(T&& symbol)
2095 {
2096  void *start;
2097  size_t symbol_size;
2098  auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2099  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
2100  api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
2101  throw_if_error_lazy(api_call_result, "Could not determine the size of the symbol whose device memory address is "
2102  + cuda::detail_::ptr_as_hex(start));
2103  return { start, symbol_size };
2104 }
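// Usage sketch (assumes a .cu translation unit in which a __device__ global variable is
// defined and visible):
//
//     __device__ int device_side_counter;
//     // ...
//     auto region = cuda::symbol::locate(device_side_counter);
//     cuda::memory::zero(region);   // e.g. reset the symbol's storage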
2105 
2106 } // namespace symbol
2107 
2108 } // namespace cuda
2109 
2110 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_