cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
25 #pragma once
26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
27 #define CUDA_API_WRAPPERS_MEMORY_HPP_
28 
29 #include <cuda/api/array.hpp>
30 #include <cuda/api/constants.hpp>
32 #include <cuda/api/error.hpp>
33 #include <cuda/api/pointer.hpp>
35 
36 #include <cuda_runtime.h> // needed, rather than cuda_runtime_api.h, e.g. for cudaMalloc
37 #include <cuda.h>
38 
39 #include <memory>
40 #include <cstring> // for ::std::memset
41 #include <vector>
42 
43 namespace cuda {
44 
46 class device_t;
47 class context_t;
48 class stream_t;
49 class module_t;
51 
52 namespace memory {
53 
59 enum class portability_across_contexts : bool {
60  is_portable = true,
61  isnt_portable = false
62 };
63 
79 enum cpu_write_combining : bool {
80  with_wc = true,
81  without_wc = false
82 };
83 
88 struct allocation_options {
89  portability_across_contexts portability;
90  cpu_write_combining write_combining;
91 };
92 
93 namespace detail_ {
94 
95 inline unsigned make_cuda_host_alloc_flags(allocation_options options)
96 {
97  return
98  (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
99  (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
100 }
101 
102 } // namespace detail_
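// Illustrative note (not part of the original header): the helper above just maps the two option
// enums onto the driver's host-allocation flag bits. For example,
//
//   cuda::memory::allocation_options opts {
//       cuda::memory::portability_across_contexts::is_portable,
//       cuda::memory::cpu_write_combining::with_wc };
//   unsigned flags = cuda::memory::detail_::make_cuda_host_alloc_flags(opts);
//
// would yield CU_MEMHOSTALLOC_PORTABLE | CU_MEMHOSTALLOC_WRITECOMBINED, while default-constructed
// options yield 0.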
103 
112 namespace mapped {
113 
114 // TODO: Perhaps make this an array of size 2 and use aspects to index it?
115 // Or maybe inherit from a pair?
116 
124 struct region_pair {
125  void* host_side;
126  void* device_side;
127  size_t size_in_bytes;
128 };
129 
130 } // namespace mapped
131 
135 namespace device {
136 
137 namespace detail_ {
138 
144 inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
145 {
146  device::address_t allocated = 0;
147  // Note: the typed cudaMalloc also takes its size in bytes, apparently,
148  // not in number of elements
149  auto status = cuMemAlloc(&allocated, num_bytes);
150  if (is_success(status) && allocated == 0) {
151  // Can this even happen? hopefully not
152  status = (status_t) status::unknown;
153  }
154  throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) +
155  " bytes of global memory on the current CUDA device");
156  return {as_pointer(allocated), num_bytes};
157 }
158 
159 inline region_t allocate(context::handle_t context_handle, size_t size_in_bytes)
160 {
161  context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle);
162  return allocate_in_current_context(size_in_bytes);
163 }
164 
165 } // namespace detail_
166 
167 #if CUDA_VERSION >= 11020
168 namespace async {
169 
170 namespace detail_ {
171 
175 inline region_t allocate(
176  context::handle_t context_handle,
177  stream::handle_t stream_handle,
178  size_t num_bytes)
179 {
180  device::address_t allocated = 0;
181  // Note: the typed cudaMalloc also takes its size in bytes, apparently,
182  // not in number of elements
183  auto status = cuMemAllocAsync(&allocated, num_bytes, stream_handle);
184  if (is_success(status) && allocated == 0) {
185  // Can this even happen? hopefully not
186  status = static_cast<decltype(status)>(status::unknown);
187  }
188  throw_if_error_lazy(status,
189  "Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
190  " bytes of global memory on " + stream::detail_::identify(stream_handle, context_handle) );
191  return {as_pointer(allocated), num_bytes};
192 }
193 
194 } // namespace detail_
195 
209 region_t allocate(const stream_t& stream, size_t size_in_bytes);
210 
211 } // namespace async
212 #endif
213 
217 inline void free(void* ptr)
219 {
220  auto result = cuMemFree(address(ptr));
221 #if CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
222  if (result == status::success) { return; }
223 #else
224  if (result == status::success or result == status::context_is_destroyed) { return; }
225 #endif
226  throw runtime_error(result, "Freeing device memory at " + cuda::detail_::ptr_as_hex(ptr));
227 }
228 inline void free(region_t region) { free(region.start()); }
230 
231 #if CUDA_VERSION >= 11020
232 namespace async {
233 
234 namespace detail_ {
235 
236 inline void free(
237  context::handle_t context_handle,
238  stream::handle_t stream_handle,
239  void* allocated_region_start)
240 {
241  auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
242  throw_if_error_lazy(status,
243  "Failed scheduling an asynchronous freeing of the global memory region starting at "
244  + cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
245  + stream::detail_::identify(stream_handle, context_handle) );
246 }
247 
248 } // namespace detail_
249 
257 void free(const stream_t& stream, void* region_start);
259 
260 inline void free(const stream_t& stream, region_t region)
261 {
262  free(stream, region.data());
263 }
265 
266 } // namespace async
267 #endif
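// Usage sketch (illustrative only, requires CUDA 11.2 or later; `stream` is assumed to be an
// existing cuda::stream_t):
//
//   auto region = cuda::memory::device::async::allocate(stream, 1024);
//   // ... enqueue work on `stream` which uses region.start() ...
//   cuda::memory::device::async::free(stream, region);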
268 
269 
283 inline region_t allocate(const context_t& context, size_t size_in_bytes);
284 
298 inline region_t allocate(const device_t& device, size_t size_in_bytes);
299 
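// Usage sketch (illustrative only; `dev` is assumed to be an existing cuda::device_t):
//
//   auto region = cuda::memory::device::allocate(dev, 1024);
//   cuda::memory::device::zero(region);
//   // ... use region.start() as a kernel argument, copy into it, etc. ...
//   cuda::memory::device::free(region);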
300 namespace detail_ {
301 
302 // Note: Allocates _in the current context_! No current context => failure!
303 struct allocator {
304  void* operator()(size_t num_bytes) const { return detail_::allocate_in_current_context(num_bytes).start(); }
305 };
306 struct deleter {
307  void operator()(void* ptr) const { cuda::memory::device::free(ptr); }
308 };
309 
310 } // namespace detail_
311 
312 
324 template <typename T>
325 void typed_set(T* start, const T& value, size_t num_elements);
326 
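// For example (sketch; `dev_floats` is assumed to point to device memory holding at least
// 1000 floats):
//
//   cuda::memory::device::typed_set<float>(dev_floats, 3.14f, 1000);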
334 
340 inline void set(void* start, int byte_value, size_t num_bytes)
341 {
342  return typed_set<unsigned char>(static_cast<unsigned char*>(start), (unsigned char) byte_value, num_bytes);
343 }
344 
348 inline void set(region_t region, int byte_value)
349 {
350  set(region.start(), byte_value, region.size());
351 }
353 
357 
361 inline void zero(void* start, size_t num_bytes)
362 {
363  set(start, 0, num_bytes);
364 }
365 
371 inline void zero(region_t region)
372 {
373  zero(region.start(), region.size());
374 }
376 
383 template <typename T>
384 inline void zero(T* ptr)
385 {
386  zero(ptr, sizeof(T));
387 }
388 
389 } // namespace device
390 
404 
409 void copy(void *destination, const void *source, size_t num_bytes);
410 
415 inline void copy(void* destination, const_region_t source)
416 {
417  return copy(destination, source.start(), source.size());
418 }
419 
427 inline void copy(region_t destination, const_region_t source)
428 {
429 #ifndef NDEBUG
430  if (destination.size() < source.size()) {
431  throw ::std::logic_error("Can't copy a larger region into a smaller one");
432  }
433 #endif
434  return copy(destination.start(), source);
435 }
436 
442 template <typename T, size_t N>
443 inline void copy(region_t destination, const T(&source)[N])
444 {
445 #ifndef NDEBUG
446  if (destination.size() < sizeof(T) * N) {
447  throw ::std::logic_error("Source size exceeds destination size");
448  }
449 #endif
450  return copy(destination.start(), source, sizeof(T) * N);
451 }
452 
458 template <typename T, size_t N>
459 inline void copy(T(&destination)[N], const_region_t source)
460 {
461 #ifndef NDEBUG
462  size_t required_size = N * sizeof(T);
463  if (source.size() != required_size) {
464  throw ::std::invalid_argument(
465  "Attempt to copy a region of " + ::std::to_string(source.size()) +
466  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
467  }
468 #endif
469  return copy(destination, source.start(), sizeof(T) * N);
470 }
471 
472 template <typename T, size_t N>
473 inline void copy(void* destination, T (&source)[N])
474 {
475  return copy(destination, source, sizeof(T) * N);
476 }
477 
483 template <typename T, size_t N>
484 inline void copy(T(&destination)[N], T* source)
485 {
486  return copy(destination, source, sizeof(T) * N);
487 }
488 
495 inline void copy(region_t destination, void* source, size_t num_bytes)
496 {
497 #ifndef NDEBUG
498  if (destination.size() < num_bytes) {
499  throw ::std::logic_error("Number of bytes to copy exceeds destination size");
500  }
501 #endif
502  return copy(destination.start(), source, num_bytes);
503 }
504 
505 inline void copy(region_t destination, void* source)
506 {
507  return copy(destination, source, destination.size());
508 }
510 
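// Usage sketch (illustrative; `dev_region` is assumed to be a device-side region of at least
// sizeof(host_buf) bytes):
//
//   float host_buf[256] = {};
//   cuda::memory::copy(dev_region, host_buf);                             // host -> device
//   cuda::memory::copy(host_buf, dev_region.start(), sizeof(host_buf));   // device -> host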
522 void set(void* ptr, int byte_value, size_t num_bytes);
523 
534 inline void set(region_t region, int byte_value)
535 {
536  return set(region.start(), byte_value, region.size());
537 }
538 
545 inline void zero(region_t region)
546 {
547  return set(region, 0);
548 }
549 
556 inline void zero(void* ptr, size_t num_bytes)
557 {
558  return set(ptr, 0, num_bytes);
559 }
560 
568 template <typename T>
569 inline void zero(T* ptr)
570 {
571  zero(ptr, sizeof(T));
572 }
573 
574 namespace detail_ {
575 
576 template<dimensionality_t NumDimensions>
577 struct base_copy_params;
578 
579 template<>
580 struct base_copy_params<2> {
581  using intra_context_type = CUDA_MEMCPY2D;
582  using type = intra_context_type; // Why is there no inter-context type, CUDA_MEMCPY2D_PEER ?
583 };
584 
585 template<>
586 struct base_copy_params<3> {
587  using type = CUDA_MEMCPY3D_PEER;
588  using intra_context_type = CUDA_MEMCPY3D;
589 };
590 
591 // Note these, by default, support inter-context
592 template<dimensionality_t NumDimensions>
593 using base_copy_params_t = typename base_copy_params<NumDimensions>::type;
594 
595 
596 enum class endpoint_t {
597  source, destination
598 };
599 
600 template<dimensionality_t NumDimensions>
601 struct copy_parameters_t : base_copy_params_t<NumDimensions> {
602  // TODO: Perhaps use proxies?
603 
604  using intra_context_type = typename base_copy_params<NumDimensions>::intra_context_type;
605 
606  using dimensions_type = array::dimensions_t<NumDimensions>;
607 
608  template<typename T>
609  void set_endpoint(endpoint_t endpoint, const cuda::array_t<T, NumDimensions> &array);
610 
611  template<typename T>
612  void set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<NumDimensions> dimensions);
613 
614  template<typename T>
615  void set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr,
616  array::dimensions_t<NumDimensions> dimensions);
617 
618  // TODO: Perhaps we should have a dimensioned offset type?
619  template<typename T>
620  void set_offset(endpoint_t endpoint, dimensions_type offset);
621 
622  template<typename T>
623  void clear_offset(endpoint_t endpoint)
624  { set_offset<T>(endpoint, dimensions_type::zero()); }
625 
626  template<typename T>
627  void set_extent(dimensions_type extent);
628  // Sets how much is being copied, as opposed to the sizes of the endpoints, which may be larger
629 
630  void clear_rest();
631  // Clear any dummy fields which are required to be set to 0. Note that important fields,
632  // which you have not set explicitly, will _not_ be cleared by this method.
633 
634 };
635 
636 template<>
637 template<typename T>
638 void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, const cuda::array_t<T, 2> &array)
639 {
640  (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY;
641  (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get();
642  // Can't set the endpoint context - the basic data structure doesn't support that!
643 }
644 
645 template<>
646 template<typename T>
647 void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, const cuda::array_t<T, 3> &array)
648 {
649  (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = CU_MEMORYTYPE_ARRAY;
650  (endpoint == endpoint_t::source ? srcArray : dstArray) = array.get();
651  (endpoint == endpoint_t::source ? srcContext : dstContext) = array.context_handle();
652 }
653 
654 template<>
655 template<typename T>
656 inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr,
657  array::dimensions_t<2> dimensions)
658 {
659  if (context_handle != context::detail_::none) {
660  throw cuda::runtime_error(
661  cuda::status::named_t::not_supported,
662  "Inter-context copying of 2D arrays is not supported by the CUDA driver");
663  }
664  set_endpoint<T>(endpoint, ptr, dimensions);
665 }
666 
667 template<>
668 template<typename T>
669 inline void copy_parameters_t<2>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<2> dimensions)
670 {
671  auto memory_type = memory::type_of(ptr);
672  if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) {
673  (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr);
674  } else {
675  if (endpoint == endpoint_t::source) { srcHost = ptr; }
676  else { dstHost = ptr; }
677  }
678  (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T);
679  (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type;
680  // Can't set the endpoint context - the basic data structure doesn't support that!
681 }
682 
683 template<>
684 template<typename T>
685 inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, context::handle_t context_handle, T *ptr,
686  array::dimensions_t<3> dimensions)
687 {
688  cuda::memory::pointer_t<void> wrapped{ptr};
689  auto memory_type = memory::type_of(ptr);
690  if (memory_type == memory::type_t::unified_ or memory_type == type_t::device_) {
691  (endpoint == endpoint_t::source ? srcDevice : dstDevice) = device::address(ptr);
692  } else {
693  if (endpoint == endpoint_t::source) { srcHost = ptr; }
694  else { dstHost = ptr; }
695  }
696  (endpoint == endpoint_t::source ? srcPitch : dstPitch) = dimensions.width * sizeof(T);
697  (endpoint == endpoint_t::source ? srcHeight : dstHeight) = dimensions.height;
698  (endpoint == endpoint_t::source ? srcMemoryType : dstMemoryType) = (CUmemorytype) memory_type;
699  (endpoint == endpoint_t::source ? srcContext : dstContext) = context_handle;
700 }
701 
702 template<>
703 template<typename T>
704 inline void copy_parameters_t<3>::set_endpoint(endpoint_t endpoint, T *ptr, array::dimensions_t<3> dimensions)
705 {
706  set_endpoint<T>(endpoint, context::detail_::none, ptr, dimensions);
707 }
708 
709 template<>
710 inline void copy_parameters_t<2>::clear_rest()
711 {}
712 
713 template<>
714 inline void copy_parameters_t<3>::clear_rest()
715 {
716  srcLOD = 0;
717  dstLOD = 0;
718 }
719 
720 template<>
721 template<typename T>
722 inline void copy_parameters_t<2>::set_extent(dimensions_type extent)
723 {
724  WidthInBytes = extent.width * sizeof(T);
725  Height = extent.height;
726 }
727 
728 template<>
729 template<typename T>
730 void copy_parameters_t<3>::set_extent(dimensions_type extent)
731 {
732  WidthInBytes = extent.width * sizeof(T);
733  Height = extent.height;
734  Depth = extent.depth;
735 }
736 
737 template<>
738 template<typename T>
739 void copy_parameters_t<3>::set_offset(endpoint_t endpoint, dimensions_type offset)
740 {
741  (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T);
742  (endpoint == endpoint_t::source ? srcY : dstY) = offset.height;
743  (endpoint == endpoint_t::source ? srcZ : dstZ) = offset.depth;
744 }
745 
746 template<>
747 template<typename T>
748 void copy_parameters_t<2>::set_offset(endpoint_t endpoint, dimensions_type offset)
749 {
750  (endpoint == endpoint_t::source ? srcXInBytes : dstXInBytes) = offset.width * sizeof(T);
751  (endpoint == endpoint_t::source ? srcY : dstY) = offset.height;
752 }
753 
754 void set_endpoint(endpoint_t endpoint, void *src);
755 
756 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params)
757 {
758  // Note this _must_ be an intra-context copy, as inter-context is not supported
759  // and there's no indication of context in the relevant data structures
760  return cuMemcpy2D(&params);
761 }
762 
763 inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params)
764 {
765  if (params.srcContext == params.dstContext) {
766  auto *intra_context_params = reinterpret_cast<base_copy_params<3>::intra_context_type *>(&params);
767  return cuMemcpy3D(intra_context_params);
768  }
769  return cuMemcpy3DPeer(&params);
770 }
771 
772 template<dimensionality_t NumDimensions>
773 status_t multidim_copy(context::handle_t context_handle, copy_parameters_t<NumDimensions> params)
774 {
775  context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{context_handle};
776  return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params);
777 }
778 
779 } // namespace detail_
780 
791 template<typename T, dimensionality_t NumDimensions>
792 void copy(const array_t<T, NumDimensions>& destination, const T *source)
793 {
794  detail_::copy_parameters_t<NumDimensions> params{};
795  auto dims = destination.dimensions();
796  params.template clear_offset<T>(detail_::endpoint_t::source);
797  params.template clear_offset<T>(detail_::endpoint_t::destination);
798  params.template set_extent<T>(dims);
799  params.clear_rest();
800  params.set_endpoint(detail_::endpoint_t::source, const_cast<T*>(source), dims);
801  params.set_endpoint(detail_::endpoint_t::destination, destination);
802  auto status = detail_::multidim_copy<NumDimensions>(destination.context_handle(), params);
803  throw_if_error(status, "Copying from a regular memory region into a CUDA array");
804 }
815 template <typename T, dimensionality_t NumDimensions>
816 void copy(T *destination, const array_t<T, NumDimensions>& source)
817 {
818  detail_::copy_parameters_t<NumDimensions> params{};
819  auto dims = source.dimensions();
820  params.template clear_offset<T>(detail_::endpoint_t::source);
821  params.template clear_offset<T>(detail_::endpoint_t::destination);
822  params.template set_extent<T>(source.dimensions());
823  params.clear_rest();
824  params.set_endpoint(detail_::endpoint_t::source, source);
825  params.template set_endpoint<T>(detail_::endpoint_t::destination, destination, dims);
826  params.dstPitch = params.srcPitch = dims.width * sizeof(T);
827  auto status = detail_::multidim_copy<NumDimensions>(source.context_handle(), params);
828  throw_if_error(status, "Copying from a CUDA array into a regular memory region");
829 }
830 
831 template <typename T, dimensionality_t NumDimensions>
832 void copy(const array_t<T, NumDimensions>& destination, const array_t<T, NumDimensions>& source)
833 {
834  detail_::copy_parameters_t<NumDimensions> params{};
835  auto dims = source.dimensions();
836  params.template clear_offset<T>(detail_::endpoint_t::source);
837  params.template clear_offset<T>(detail_::endpoint_t::destination);
838  params.template set_extent<T>(source.dimensions());
839  params.clear_rest();
840  params.set_endpoint(detail_::endpoint_t::source, source);
841  params.set_endpoint(detail_::endpoint_t::destination, destination);
842  params.dstPitch = params.srcPitch = dims.width * sizeof(T);
843  auto status = //(source.context() == destination.context()) ?
844  detail_::multidim_copy<NumDimensions>(source.context_handle(), params);
845  throw_if_error_lazy(status, "Copying from a CUDA array into another CUDA array");
846 }
847 
848 template <typename T, dimensionality_t NumDimensions>
849 void copy(region_t destination, const array_t<T, NumDimensions>& source)
850 {
851  if (destination.size() < source.size_bytes()) {
852  throw ::std::logic_error("Attempt to copy an array into a memory region too small to hold the copy");
853  }
854  copy(destination.start(), source);
855 }
856 
857 template <typename T, dimensionality_t NumDimensions>
858 void copy(const array_t<T, NumDimensions>& destination, const_region_t source)
859 {
860  if (destination.size_bytes() < source.size()) {
861  throw ::std::logic_error("Attempt to copy into an array from a source region larger than the array's size");
862  }
863  copy(destination, source.start());
864 }
865 
874 template <typename T>
875 void copy_single(T* destination, const T* source)
876 {
877  copy(destination, source, sizeof(T));
878 }
879 
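// Usage sketch (illustrative; `arr` is assumed to be an existing cuda::array_t<float, 2> and
// `host_buf` a host-side buffer with at least arr.size() floats):
//
//   cuda::memory::copy(arr, host_buf);   // plain memory -> CUDA array
//   cuda::memory::copy(host_buf, arr);   // CUDA array   -> plain memory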
880 namespace async {
881 
882 namespace detail_ {
883 
897 
906 inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
907 {
908  auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);
909 
910  // TODO: Determine whether it was from host to device, device to host etc and
911  // add this information to the error string
912  throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
913 }
914 
922 inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
923 {
924 #ifndef NDEBUG
925  if (destination.size() < source.size()) {
926  throw ::std::logic_error("Source size exceeds destination size");
927  }
928 #endif
929  copy(destination.start(), source.start(), source.size(), stream_handle);
930 }
932 
933 using memory::detail_::copy_parameters_t;
934 
935 inline status_t multidim_copy_in_current_context(
936  ::std::integral_constant<dimensionality_t, 2>,
937  copy_parameters_t<2> params,
938  stream::handle_t stream_handle)
939 {
940  // Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters
941  // structure holds no information about contexts.
942  return cuMemcpy2DAsync(&params, stream_handle);
943 }
944 
945 inline status_t multidim_copy_in_current_context(
946  ::std::integral_constant<dimensionality_t, 3>,
947  copy_parameters_t<3> params,
948  stream::handle_t stream_handle)
949 {
950  if (params.srcContext == params.dstContext) {
951  using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
952  auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
953  return cuMemcpy3DAsync(intra_context_params, stream_handle);
954  }
955  return cuMemcpy3DPeerAsync(&params, stream_handle);
956 
957 }
958 
959 template<dimensionality_t NumDimensions>
960 status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle) {
961  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
962 }
963 
964 // Note: Assumes the stream handle is for a stream in the current context
965 template<dimensionality_t NumDimensions>
966 status_t multidim_copy(
967  context::handle_t context_handle,
968  copy_parameters_t<NumDimensions> params,
969  stream::handle_t stream_handle)
970 {
971  context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle);
972  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
973 }
974 
975 
976 // Assumes the array and the stream share the same context, and that the destination is
977 // accessible from that context (e.g. allocated within it, or being managed memory, etc.)
978 template <typename T, dimensionality_t NumDimensions>
979 void copy(T *destination, const array_t<T, NumDimensions>& source, stream::handle_t stream_handle)
980 {
981  using memory::detail_::endpoint_t;
982  auto dims = source.dimensions();
983  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
984  detail_::copy_parameters_t<NumDimensions> params{};
985  params.template clear_offset<T>(endpoint_t::source);
986  params.template clear_offset<T>(endpoint_t::destination);
987  params.template set_extent<T>(dims);
988  params.clear_rest();
989  params.set_endpoint(endpoint_t::source, source);
990  params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
991  params.dstPitch = dims.width * sizeof(T);
992  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
993  throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
994 }
995 
996 
997 template <typename T, dimensionality_t NumDimensions>
998 void copy(const array_t<T, NumDimensions>& destination, const T* source, stream::handle_t stream_handle)
999 {
1000  using memory::detail_::endpoint_t;
1001  auto dims = destination.dimensions();
1002  //auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
1003  detail_::copy_parameters_t<NumDimensions> params{};
1004  params.template clear_offset<T>(endpoint_t::source);
1005  params.template clear_offset<T>(endpoint_t::destination);
1006  params.template set_extent<T>(destination.dimensions());
1007  params.srcPitch = dims.width * sizeof(T);
1008  params.clear_rest();
1009  params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
1010  params.set_endpoint(endpoint_t::destination, destination);
1011  auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
1012  throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
1013 }
1014 
1029 template <typename T>
1030 void copy_single(T& destination, const T& source, stream::handle_t stream_handle)
1031 {
1032  copy(&destination, &source, sizeof(T), stream_handle);
1033 }
1034 
1035 } // namespace detail_
1036 
1055 void copy(void* destination, void const* source, size_t num_bytes, const stream_t& stream);
1057 
1058 inline void copy(void* destination, const_region_t source, size_t num_bytes, const stream_t& stream)
1059 {
1060 #ifndef NDEBUG
1061  if (source.size() < num_bytes) {
1062  throw ::std::logic_error("Attempt to copy more than the source region's size");
1063  }
1064 #endif
1065  copy(destination, source.start(), num_bytes, stream);
1066 }
1067 
1068 inline void copy(region_t destination, const_region_t source, size_t num_bytes, const stream_t& stream)
1069 {
1070 #ifndef NDEBUG
1071  if (destination.size() < num_bytes) {
1072  throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
1073  }
1074 #endif
1075  copy(destination.start(), source.start(), num_bytes, stream);
1076 }
1077 
1078 inline void copy(void* destination, const_region_t source, const stream_t& stream)
1079 {
1080  copy(destination, source, source.size(), stream);
1081 }
1082 
1083 inline void copy(region_t destination, const_region_t source, const stream_t& stream)
1084 {
1085  copy(destination, source, source.size(), stream);
1086 }
1087 
1088 inline void copy(region_t destination, void* source, const stream_t& stream)
1089 {
1090  return copy(destination.start(), source, destination.size(), stream);
1091 }
1092 
1093 
1097 template <typename T, size_t N>
1098 inline void copy(region_t destination, const T(&source)[N], const stream_t& stream)
1099 {
1100 #ifndef NDEBUG
1101  if (destination.size() < sizeof(T) * N) {
1102  throw ::std::logic_error("Source size exceeds destination size");
1103  }
1104 #endif
1105  return copy(destination.start(), source, sizeof(T) * N, stream);
1106 }
1107 
1108 inline void copy(region_t destination, void* source, size_t num_bytes, const stream_t& stream)
1109 {
1110 #ifndef NDEBUG
1111  if (destination.size() < num_bytes) {
1112  throw ::std::logic_error("Number of bytes to copy exceeds destination size");
1113  }
1114 #endif
1115  return copy(destination.start(), source, num_bytes, stream);
1116 }
1118 
1128 template <typename T, dimensionality_t NumDimensions>
1129 void copy(array_t<T, NumDimensions>& destination, const T* source, const stream_t& stream);
1130 
1131 template <typename T, dimensionality_t NumDimensions>
1132 void copy(array_t<T, NumDimensions>& destination, const_region_t source, const stream_t& stream)
1133 {
1134 #ifndef NDEBUG
1135  size_t required_size = destination.size() * sizeof(T);
1136  if (source.size() != required_size) {
1137  throw ::std::invalid_argument(
1138  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1139  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
1140  }
1141 #endif
1142  copy(destination, source.start(), stream);
1143 }
1144 
1154 template <typename T, dimensionality_t NumDimensions>
1155 void copy(T* destination, const array_t<T, NumDimensions>& source, const stream_t& stream);
1156 
1157 template <typename T, dimensionality_t NumDimensions>
1158 void copy(region_t destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
1159 {
1160 #ifndef NDEBUG
1161  size_t required_size = source.size() * sizeof(T);
1162  if (destination.size() < required_size) {
1163  throw ::std::invalid_argument(
1164  "Attempt to copy " + ::std::to_string(required_size) + " bytes from an array into a "
1165  "region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
1166  }
1167 #endif
1168  copy(destination.start(), source, stream);
1169 }
1170 
1176 template <typename T, size_t N>
1177 inline void copy(T(&destination)[N], T* source, const stream_t& stream)
1178 {
1179  return copy(destination, source, sizeof(T) * N, stream);
1180 }
1181 
1187 template <typename T, size_t N>
1188 inline void copy(T(&destination)[N], const_region_t source, const stream_t& stream)
1189 {
1190 #ifndef NDEBUG
1191  size_t required_size = N * sizeof(T);
1192  if (source.size() != required_size) {
1193  throw ::std::invalid_argument(
1194  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1195  " bytes into an array of size " + ::std::to_string(required_size) + " bytes");
1196  }
1197 #endif
1198  return copy(destination, source.start(), sizeof(T) * N, stream);
1199 }
1200 
1201 
1213 template <typename T>
1214 void copy_single(T& destination, const T& source, const stream_t& stream);
1215 
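// Usage sketch (illustrative; `stream` is an existing cuda::stream_t, `dev_region` a device-side
// region, and `pinned_buf` a float* into pinned host memory of at least dev_region.size() bytes):
//
//   cuda::memory::async::copy(dev_region, pinned_buf, stream);   // host -> device, stream-ordered
//   // ... later, synchronize `stream` before reusing or freeing pinned_buf ...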
1216 } // namespace async
1217 
1218 namespace device {
1219 
1220 namespace async {
1221 
1222 namespace detail_ {
1223 
1224 inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
1225 {
1226  // TODO: Double-check that this call doesn't require setting the current device
1227  auto result = cuMemsetD8Async(address(start), (unsigned char) byte_value, num_bytes, stream_handle);
1228  throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer");
1229 }
1230 
1231 inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
1232 {
1233  set(region.start(), byte_value, region.size(), stream_handle);
1234 }
1235 
1236 inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
1237 {
1238  set(start, 0, num_bytes, stream_handle);
1239 }
1240 
1241 inline void zero(region_t region, stream::handle_t stream_handle)
1242 {
1243  zero(region.start(), region.size(), stream_handle);
1244 }
1245 
1246 // TODO: Drop this in favor of <algorithm>-like functions under `cuda::`.
1247 template <typename T>
1248 inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
1249 {
1250  static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
1251  static_assert(
1252  sizeof(T) == 1 or sizeof(T) == 2 or
1253  sizeof(T) == 4,
1254  "Unsupported type size - only sizes 1, 2 and 4 are supported");
1255  // TODO: Consider checking for alignment when compiling without NDEBUG
1256  status_t result = static_cast<status_t>(cuda::status::success);
1257  switch(sizeof(T)) {
1258  case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
1259  case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
1260  case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
1261  }
1262  throw_if_error_lazy(result, "Setting global device memory bytes");
1263 }
1264 
1265 } // namespace detail_
1266 
1267 
1280 template <typename T>
1281 void typed_set(T* start, const T& value, size_t num_elements, const stream_t& stream);
1282 
1294 inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream)
1295 {
1296  return typed_set<unsigned char>(static_cast<unsigned char*>(start), (unsigned char) byte_value, num_bytes, stream);
1297 }
1298 
1302 void zero(void* start, size_t num_bytes, const stream_t& stream);
1303 
1314 template <typename T>
1315 inline void zero(T* ptr, const stream_t& stream)
1316 {
1317  zero(ptr, sizeof(T), stream);
1318 }
1319 
1320 } // namespace async
1321 
1322 
1323 } // namespace device
1324 
1325 namespace inter_context {
1326 
1327 namespace detail_ {
1328 
1329 inline void copy(
1330  void * destination_address,
1331  context::handle_t destination_context,
1332  const void * source_address,
1333  context::handle_t source_context,
1334  size_t num_bytes)
1335 {
1336  auto status = cuMemcpyPeer(
1337  reinterpret_cast<device::address_t>(destination_address),
1338  destination_context,
1339  reinterpret_cast<device::address_t>(source_address),
1340  source_context, num_bytes);
1341  throw_if_error_lazy(status,
1342  ::std::string("Failed copying data between devices: From address ")
1343  + cuda::detail_::ptr_as_hex(source_address) + " in "
1344  + context::detail_::identify(source_context) + " to address "
1345  + cuda::detail_::ptr_as_hex(destination_address) + " in "
1346  + context::detail_::identify(destination_context) );
1347 }
1348 
1349 } // namespace detail_
1350 
1351 void copy(
1352  void * destination,
1353  const context_t& destination_context,
1354  const void * source_address,
1355  const context_t& source_context,
1356  size_t num_bytes);
1357 
1358 inline void copy(
1359  void * destination,
1360  const context_t& destination_context,
1361  const_region_t source,
1362  const context_t& source_context)
1363 {
1364  copy(destination, destination_context, source.start(), source_context, source.size());
1365 }
1366 
1367 inline void copy(
1368  region_t destination,
1369  const context_t& destination_context,
1370  const_region_t source,
1371  const context_t& source_context)
1372 {
1373 #ifndef NDEBUG
1374  if (destination.size() < source.size()) {
1375  throw ::std::invalid_argument(
1376  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1377  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1378  }
1379 #endif
1380  copy(destination.start(), destination_context, source, source_context);
1381 }
1382 
1383 template <typename T, dimensionality_t NumDimensions>
1384 inline void copy(
1385  array_t<T, NumDimensions> destination,
1386  array_t<T, NumDimensions> source)
1387 {
1388  // for arrays, a single mechanism handles both intra- and inter-context copying
1389  return memory::copy(destination, source);
1390 }
1391 
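// Usage sketch (illustrative; `dst_ctx` and `src_ctx` are existing cuda::context_t objects, with
// `dst_ptr` and `src_ptr` pointing to device memory allocated within each, respectively):
//
//   cuda::memory::inter_context::copy(dst_ptr, dst_ctx, src_ptr, src_ctx, num_bytes);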
1392 namespace async {
1393 
1394 namespace detail_ {
1395 
1396 inline void copy(
1397  void *destination,
1398  context::handle_t destination_context_handle,
1399  const void *source,
1400  context::handle_t source_context_handle,
1401  size_t num_bytes,
1402  stream::handle_t stream_handle)
1403 {
1404  auto result = cuMemcpyPeerAsync(
1405  device::address(destination),
1406  destination_context_handle,
1407  device::address(source),
1408  source_context_handle,
1409  num_bytes, stream_handle);
1410 
1411  // TODO: Determine whether it was from host to device, device to host etc and
1412  // add this information to the error string
1413  throw_if_error_lazy(result, "Scheduling an inter-context memory copy from "
1414  + context::detail_::identify(source_context_handle) + " to "
1415  + context::detail_::identify(destination_context_handle) + " on "
1416  + stream::detail_::identify(stream_handle));
1417 }
1418 
1426 inline void copy(
1427  region_t destination,
1428  context::handle_t destination_context_handle,
1429  const_region_t source,
1430  context::handle_t source_context_handle,
1431  stream::handle_t stream_handle)
1432 {
1433 #ifndef NDEBUG
1434  if (destination.size() < source.size()) {
1435  throw ::std::logic_error("Can't copy a larger region into a smaller one");
1436  }
1437 #endif
1438  copy(destination.start(), destination_context_handle, source.start(), source_context_handle, source.size(),
1439  stream_handle);
1440 }
1442 
1443 } // namespace detail_
1444 
1445 void copy(
1446  void * destination_address,
1447  context_t destination_context,
1448  const void * source_address,
1449  context_t source_context,
1450  size_t num_bytes,
1451  const stream_t& stream);
1452 
1453 void copy(
1454  void * destination,
1455  context_t destination_context,
1456  const_region_t source,
1457  context_t source_context,
1458  const stream_t& stream);
1459 
1460 inline void copy(
1461  region_t destination,
1462  context_t destination_context,
1463  const_region_t source,
1464  context_t source_context,
1465  const stream_t& stream);
1466 
1467 template <typename T, dimensionality_t NumDimensions>
1468 inline void copy(
1469  array_t<T, NumDimensions> destination,
1470  array_t<T, NumDimensions> source,
1471  const stream_t& stream)
1472 {
1473  // for arrays, a single mechanism handles both intra- and inter-context copying
1474  return memory::async::copy(destination, source, stream);
1475 }
1476 
1477 
1478 } // namespace async
1479 
1480 } // namespace inter_context
1481 
1487 namespace host {
1488 
1510 void* allocate(
1511  size_t size_in_bytes,
1512  allocation_options options);
1513 
1514 
1515 inline void* allocate(
1516  size_t size_in_bytes,
1517  portability_across_contexts portability = portability_across_contexts(false),
1518  cpu_write_combining cpu_wc = cpu_write_combining(false))
1519 {
1520  return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } );
1521 }
1522 
1523 inline void* allocate(size_t size_in_bytes, cpu_write_combining cpu_wc)
1524 {
1525  return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} );
1526 }
1527 
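// Usage sketch (illustrative):
//
//   void* staging = cuda::memory::host::allocate(4096);   // pinned, non-portable, no write-combining
//   // ... use `staging` as the host side of (asynchronous) copies ...
//   cuda::memory::host::free(staging);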
1533 inline void free(void* host_ptr)
1534 {
1535  auto result = cuMemFreeHost(host_ptr);
1536 #if CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
1537  if (result == status::success) { return; }
1538 #else
1539  if (result == status::success or result == status::context_is_destroyed) { return; }
1540 #endif
1541  throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1542 }
1543 
1544 inline void free(region_t region) { return free(region.data()); }
1545 
1546 namespace detail_ {
1547 
1548 struct allocator {
1549  void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes); }
1550 };
1551 struct deleter {
1552  void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
1553 };
1554 
1555 
1567 inline void register_(const void *ptr, size_t size, unsigned flags)
1568 {
1569  auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
1570  throw_if_error_lazy(result,
1571  "Could not register and page-lock the region of " + ::std::to_string(size) +
1572  " bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr));
1573 }
1574 
1575 inline void register_(const_region_t region, unsigned flags)
1576 {
1577  register_(region.start(), region.size(), flags);
1578 }
1579 
1580 } // namespace detail_
1581 
1587 enum mapped_io_space : bool {
1588  is_mapped_io_space = true,
1589  is_not_mapped_io_space = false
1590 };
1591 
1597 enum map_into_device_memory : bool {
1598  map_into_device_memory = true,
1599  do_not_map_into_device_memory = false
1600 };
1601 
1607 enum accessibility_on_all_devices : bool {
1608  is_accessible_on_all_devices = true,
1609  is_not_accessible_on_all_devices = false
1610 };
1611 
1612 
1613 // Can't use register(), since that's a reserved word
1614 inline void register_(const void *ptr, size_t size,
1615  bool register_mapped_io_space,
1616  bool map_into_device_space,
1617  bool make_device_side_accesible_to_all)
1618 {
1619  detail_::register_(
1620  ptr, size,
1621  (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1622  | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1623  | (make_device_side_accesible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1624  );
1625 }
1626 
1627 inline void register_(
1628  const_region_t region,
1629  bool register_mapped_io_space,
1630  bool map_into_device_space,
1631  bool make_device_side_accesible_to_all)
1632 {
1633  register_(
1634  region.start(),
1635  region.size(),
1636  register_mapped_io_space,
1637  map_into_device_space,
1638  make_device_side_accesible_to_all);
1639 }
1640 
1641 
1642 inline void register_(void const *ptr, size_t size)
1643 {
1644  unsigned no_flags_set { 0 };
1645  detail_::register_(ptr, size, no_flags_set);
1646 }
1647 
1648 inline void register_(const_region_t region)
1649 {
1650  register_(region.start(), region.size());
1651 }
1652 
1653 // the CUDA API calls this "unregister", but that's semantically
1654 // inaccurate. The registration is not undone, rolled back, it's
1655 // just ended
1656 inline void deregister(const void *ptr)
1657 {
1658  auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1659  throw_if_error_lazy(result,
1660  "Could not unregister the memory segment starting at address *a");
1661 }
1662 
1663 inline void deregister(const_region_t region)
1664 {
1665  deregister(region.start());
1666 }
1667 
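// Usage sketch (illustrative; `buf` is an existing host-side allocation of `num_bytes` bytes):
//
//   cuda::memory::host::register_(buf, num_bytes);   // page-lock an already-allocated buffer
//   // ... perform (asynchronous) copies involving buf ...
//   cuda::memory::host::deregister(buf);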
1678 inline void set(void* start, int byte_value, size_t num_bytes)
1679 {
1680  ::std::memset(start, byte_value, num_bytes);
1681  // TODO: Error handling?
1682 }
1683 
1684 inline void zero(void* start, size_t num_bytes)
1685 {
1686  set(start, 0, num_bytes);
1687 }
1688 
1689 template <typename T>
1690 inline void zero(T* ptr)
1691 {
1692  zero(ptr, sizeof(T));
1693 }
1694 
1695 
1696 } // namespace host
1697 
1714 namespace managed {
1715 
1716 struct const_region_t;
1717 
1718 namespace detail_ {
1719 
1720 using advice_t = CUmem_advise;
1721 
1722 template <typename T>
1723 inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute);
1724 
1725 inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id);
1726 // inline void advise(managed::const_region_t region, advice_t attribute);
1727 
1728 template <typename T>
1729 struct base_region_t : public memory::detail_::base_region_t<T> {
1730  using parent = memory::detail_::base_region_t<T>;
1731  using parent::parent;
1732 
1733  bool is_read_mostly() const
1734  {
1735  return get_scalar_range_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1736  }
1737 
1738  void designate_read_mostly() const
1739  {
1740  set_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1741  }
1742 
1743  void undesignate_read_mostly() const
1744  {
1745  unset_range_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
1746  }
1747 
1748  device_t preferred_location() const;
1749  void set_preferred_location(device_t& device) const;
1750  void clear_preferred_location() const;
1751 
1752  // TODO: Consider using a field proxy
1753 };
1754 
1755 } // namespace detail_
1756 
1757 struct region_t : public detail_::base_region_t<void> {
1758  using base_region_t<void>::base_region_t;
1759  operator memory::region_t() { return memory::region_t{ start(), size() }; }
1760 };
1761 
1762 struct const_region_t : public detail_::base_region_t<void const> {
1763  using base_region_t<void const>::base_region_t;
1764  const_region_t(const region_t& r) : detail_::base_region_t<void const>(r.start(), r.size()) {}
1765 };
1766 
1767 void advise_expected_access_by(managed::const_region_t region, device_t& device);
1768 void advise_no_access_expected_by(managed::const_region_t region, device_t& device);
1769 
1770 template <typename Allocator = ::std::allocator<cuda::device_t> >
1771  typename ::std::vector<device_t, Allocator> accessors(managed::const_region_t region, const Allocator& allocator = Allocator() );
1772 
1773 namespace detail_ {
1774 
1775 template <typename T>
1776 inline T get_scalar_range_attribute(managed::const_region_t region, range_attribute_t attribute)
1777 {
1778  uint32_t attribute_value { 0 };
1779  auto result = cuMemRangeGetAttribute(
1780  &attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size());
1781  throw_if_error_lazy(result,
1782  "Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
1783  return static_cast<T>(attribute_value);
1784 }
1785 
1786 // CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's
1787 // not called cuMemRangeSetAttribute, and uses a different enum.
1788 inline void advise(managed::const_region_t region, advice_t advice, cuda::device::id_t device_id)
1789 {
1790  auto result = cuMemAdvise(device::address(region.start()), region.size(), advice, device_id);
1791  throw_if_error_lazy(result, "Setting an attribute for a managed memory range at "
1792  + cuda::detail_::ptr_as_hex(region.start()));
1793 }
1794 
1795 // inline void set_range_attribute(managed::const_region_t region, range_attribute_t attribute, cuda::device::handle_t device_id)
1796 
1797 inline advice_t as_advice(range_attribute_t attribute, bool set)
1798 {
1799  switch (attribute) {
1800  case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1801  return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1802  case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1803  return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1804  case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1805  return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1806  default:
1807  throw ::std::invalid_argument(
1808  "CUDA memory range attribute does not correspond to any range advice value");
1809  }
1810 }
1811 
1812 inline void set_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute, cuda::device::id_t device_id)
1813 {
1814  static constexpr const bool set { true };
1815  advise(region, as_advice(settable_attribute, set), device_id);
1816 }
1817 
1818 inline void unset_range_attribute(managed::const_region_t region, range_attribute_t settable_attribute)
1819 {
1820  static constexpr const bool unset { false };
1821  static constexpr const cuda::device::id_t dummy_device_id { 0 };
1822  advise(region, as_advice(settable_attribute, unset), dummy_device_id);
1823 }
1824 
1825 } // namespace detail_
1826 
1827 
1828 enum class attachment_t : unsigned {
1829  global = CU_MEM_ATTACH_GLOBAL,
1830  host = CU_MEM_ATTACH_HOST,
1831  single_stream = CU_MEM_ATTACH_SINGLE,
1832  };
1833 
1834 
1835 namespace detail_ {
1836 
1837 inline region_t allocate_in_current_context(
1838  size_t num_bytes,
1839  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
1840 {
1841  device::address_t allocated = 0;
1842  auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
1843  attachment_t::global : attachment_t::host;
1844  // This is necessary because managed allocation requires at least one (primary)
1845  // context to have been constructed. We could theoretically check what our current
1846  // context is etc., but that would be brittle, since someone can managed-allocate,
1847  // then change contexts, then de-allocate, and we can't be certain that whoever
1848  // called us will call free
1849  cuda::device::primary_context::detail_::increase_refcount(cuda::device::default_device_id);
1850 
1851  // Note: Despite the templating by T, the size is still in bytes,
1852  // not in number of T's
1853  auto status = cuMemAllocManaged(&allocated, num_bytes, (unsigned) flags);
1854  if (is_success(status) && allocated == 0) {
1855  // Can this even happen? hopefully not
1856  status = (status_t) status::unknown;
1857  }
1858  throw_if_error_lazy(status, "Failed allocating "
1859  + ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
1860  return {as_pointer(allocated), num_bytes};
1861 }
1862 
1868 inline void free(void* ptr)
1870 {
1871  auto result = cuMemFree(device::address(ptr));
1872  cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id);
1873  throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
1874 }
1875 inline void free(region_t region)
1876 {
1877  free(region.start());
1878 }
1880 
1881 template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
1882 struct allocator {
1883  // Allocates in the current context!
1884  void* operator()(size_t num_bytes) const
1885  {
1886  return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
1887  }
1888 };
1889 
1890 struct deleter {
1891  void operator()(void* ptr) const { memory::device::free(ptr); }
1892 };
1893 
1894 inline region_t allocate(
1895  context::handle_t context_handle,
1896  size_t num_bytes,
1897  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
1898 {
1899  context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle);
1900  return allocate_in_current_context(num_bytes, initial_visibility);
1901 }
1902 
1903 } // namespace detail_
1904 
1918 inline region_t allocate(
1919  const context_t& context,
1920  size_t num_bytes,
1921  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
1922 
1936 inline region_t allocate(
1937  device_t device,
1938  size_t num_bytes,
1939  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
1940 
1950 region_t allocate(size_t num_bytes);
1951 
1957 inline void free(void* managed_ptr)
1958 {
1959  auto result = cuMemFree(device::address(managed_ptr));
1960  throw_if_error_lazy(result,
1961  "Freeing managed memory (host and device regions) at address "
1962  + cuda::detail_::ptr_as_hex(managed_ptr));
1963 }
1964 
1965 inline void free(region_t region)
1966 {
1967  free(region.start());
1968 }
1969 
1970 namespace advice {
1971 
1972 enum kind_t {
1973  read_mostly = CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY,
1974  preferred_location = CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION,
1975  accessor = CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY,
1976  // Note: CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION is never set
1977 };
1978 
1979 namespace detail_ {
1980 
1981 inline void set(const_region_t region, kind_t advice, cuda::device::id_t device_id)
1982 {
1983  auto result = cuMemAdvise(device::address(region.start()), region.size(), (managed::detail_::advice_t) advice, device_id);
1984  throw_if_error_lazy(result, "Setting advice on a (managed) memory region at "
1985  + cuda::detail_::ptr_as_hex(region.start()) + " w.r.t. " + cuda::device::detail_::identify(device_id));
1986 }
1987 
1988 } // namespace detail_
1989 
1990 void set(const_region_t region, kind_t advice, const device_t& device);
1991 
1992 } // namespace advice
1993 
1994 namespace async {
1995 
1996 namespace detail_ {
1997 
1998 inline void prefetch(
1999  const_region_t region,
2000  cuda::device::id_t destination,
2001  stream::handle_t source_stream_handle)
2002 {
2003  auto result = cuMemPrefetchAsync(device::address(region.start()), region.size(), destination, source_stream_handle);
2004  throw_if_error_lazy(result,
2005  "Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
2006  + cuda::detail_::ptr_as_hex(region.start()) + " to " + (
2007  (destination == CU_DEVICE_CPU) ? "the host" : cuda::device::detail_::identify(destination)) );
2008 }
2009 
2010 } // namespace detail_
2011 
2017 void prefetch(
2018  const_region_t region,
2019  const cuda::device_t& destination,
2020  const stream_t& stream);
2021 
2026 void prefetch_to_host(
2027  const_region_t region,
2028  const stream_t& stream);
2029 
2030 } // namespace async
2031 
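// Usage sketch (illustrative; `dev` is an existing cuda::device_t and `stream` an existing
// cuda::stream_t):
//
//   auto region = cuda::memory::managed::allocate(dev, 1 << 20);
//   // ... the host writes into region.start() ...
//   cuda::memory::managed::async::prefetch(region, dev, stream);  // hint: migrate pages to `dev`
//   // ... launch kernels on `stream` which read from the region ...
//   cuda::memory::managed::free(region);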
2032 } // namespace managed
2033 
2034 namespace mapped {
2035 
2040 template <typename T>
2041 inline T* device_side_pointer_for(T* host_memory_ptr)
2042 {
2043  device::address_t device_side_ptr;
2044  auto get_device_pointer_flags = 0u; // must be 0; see the documentation for cuMemHostGetDevicePointer
2045  auto status = cuMemHostGetDevicePointer(
2046  &device_side_ptr,
2047  host_memory_ptr,
2048  get_device_pointer_flags);
2049  throw_if_error_lazy(status,
2050  "Failed obtaining the device-side pointer for host-memory pointer "
2051  + cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
2052  return as_pointer(device_side_ptr);
2053 }
2054 
2055 namespace detail_ {
2056 
2066 inline region_pair allocate_in_current_context(
2067  context::handle_t current_context_handle,
2068  size_t size_in_bytes,
2069  allocation_options options)
2070 {
2071  region_pair allocated {};
2072  // The default initialization is unnecessary, but let's play it safe
2073  allocated.size_in_bytes = size_in_bytes;
2074  auto flags = CU_MEMHOSTALLOC_DEVICEMAP |
2075  cuda::memory::detail_::make_cuda_host_alloc_flags(options);
2076  auto status = cuMemHostAlloc(&allocated.host_side, size_in_bytes, flags);
2077  if (is_success(status) && (allocated.host_side == nullptr)) {
2078  // Can this even happen? hopefully not
2079  status = (status_t) status::named_t::unknown;
2080  }
2081  throw_if_error_lazy(status,
2082  "Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
2083  + " bytes of global memory in " + context::detail_::identify(current_context_handle));
2084  allocated.device_side = device_side_pointer_for(allocated.host_side);
2085  return allocated;
2086 }
2087 
2088 inline region_pair allocate(
2089  context::handle_t context_handle,
2090  size_t size_in_bytes,
2091  allocation_options options)
2092 {
2093  context::current::detail_::scoped_override_t set_context_for_this_scope(context_handle);
2094  return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
2095 }
2096 
2097 inline void free(void* host_side_pair)
2098 {
2099  auto result = cuMemFreeHost(host_side_pair);
2100  throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address "
2101  + cuda::detail_::ptr_as_hex(host_side_pair));
2102 }
2103 
2104 } // namespace detail_
2105 
2115 region_pair allocate(
2116  cuda::context_t& context,
2117  size_t size_in_bytes,
2118  allocation_options options);
2119 
2128 region_pair allocate(
2129  cuda::device_t& device,
2130  size_t size_in_bytes,
2131  allocation_options options);
2132 
2133 
2140 inline void free(region_pair pair)
2141 {
2142  detail_::free(pair.host_side);
2143 }
2144 
2151 inline void free_region_pair_of(void* ptr)
2152 {
2153  // TODO: What if the pointer is not part of a mapped region pair?
2154  // We could check this...
2155  void* host_side_ptr;
2156  auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr));
2157  throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer "
2158  + cuda::detail_::ptr_as_hex(ptr));
2159  detail_::free(host_side_ptr);
2160 }
2161 
2173 inline bool is_part_of_a_region_pair(const void* ptr)
2174 {
2175  auto wrapped_ptr = pointer_t<const void> { ptr };
2176  return wrapped_ptr.other_side_of_region_pair().get() != nullptr;
2177 }
2178 
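// Usage sketch (illustrative; `dev` is an existing cuda::device_t):
//
//   auto pair = cuda::memory::mapped::allocate(dev, 4096, cuda::memory::allocation_options{});
//   // the host writes through pair.host_side; kernels access the same memory via pair.device_side
//   cuda::memory::mapped::free(pair);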
2179 } // namespace mapped
2180 
2181 } // namespace memory
2182 
2183 namespace symbol {
2191 template <typename T>
2192 inline memory::region_t locate(T&& symbol)
2193 {
2194  void *start;
2195  size_t symbol_size;
2196  auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2197  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
2198  api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
2199  throw_if_error_lazy(api_call_result, "Could not determine the size of the symbol at device address "
2200  + cuda::detail_::ptr_as_hex(start));
2201  return { start, symbol_size };
2202 }
2203 
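// Usage sketch (illustrative; `my_device_global` is assumed to be a __device__ or __constant__
// global variable visible in this translation unit):
//
//   auto region = cuda::symbol::locate(my_device_global);
//   cuda::memory::zero(region);   // e.g. clear the symbol's backing storage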
2204 } // namespace symbol
2205 
2206 } // namespace cuda
2207 
2208 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_