cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
memory.hpp
Go to the documentation of this file.
1 
25 #pragma once
26 #ifndef CUDA_API_WRAPPERS_MEMORY_HPP_
27 #define CUDA_API_WRAPPERS_MEMORY_HPP_
28 
29 #include "copy_parameters.hpp"
30 #include "array.hpp"
31 #include "constants.hpp"
32 #include "current_device.hpp"
33 #include "error.hpp"
34 #include "pointer.hpp"
35 #include "current_context.hpp"
36 #include "detail/unique_span.hpp"
37 
38 // The following is needed for cudaGetSymbolAddress, cudaGetSymbolSize
39 #include <cuda_runtime.h>
40 
41 #include <memory>
42 #include <cstring> // for ::std::memset
43 #include <vector>
44 #include <utility>
45 
46 #include "memory_pool.hpp"
47 
48 namespace cuda {
49 
51 class device_t;
52 class context_t;
53 class stream_t;
54 class module_t;
56 
57 namespace memory {
58 
/// Whether page-locked host memory allocated with this option is accessible
/// ("portable") from all CUDA contexts, or only the allocating context.
enum class portability_across_contexts : bool {
	isnt_portable = false,
	is_portable = true,
};
68 
/// Whether to allocate pinned host memory as write-combined (fast for the CPU
/// to write and the GPU to read, slow for the CPU to read back).
/// NOTE(review): this is an unscoped enum while the sibling option type above is
/// an `enum class` — presumably intentional for brevity at call sites; confirm.
enum cpu_write_combining : bool {
	without_wc = false,
	with_wc = true,
};
88 
96 
99 };
100 
101 namespace detail_ {
102 
/**
 * Compile-time validation that @tparam T is suitable for raw typed-buffer
 * allocation: trivially constructible, trivially copyable, and — when
 * @tparam CheckConstructibility is set — also trivially destructible.
 *
 * NOTE(review): the flag name says "Constructibility" but it gates the
 * destructibility assertion (as the message text indicates) — confirm intent.
 */
template <typename T, bool CheckConstructibility = false>
inline void check_allocation_type() noexcept
{
	// Fixed typo in the diagnostic: "constructive" -> "constructible"
	static_assert(::std::is_trivially_constructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-constructible type");
	static_assert(not CheckConstructibility or ::std::is_trivially_destructible<T>::value,
		"Attempt to create a typed buffer of a non-trivially-destructible type "
		"without allowing for its destruction");
	static_assert(::std::is_trivially_copyable<T>::value,
		"Attempt to create a typed buffer of a non-trivially-copyable type");
}
114 
115 inline unsigned make_cuda_host_alloc_flags(allocation_options options)
116 {
117  return
118  (options.portability == portability_across_contexts::is_portable ? CU_MEMHOSTALLOC_PORTABLE : 0) |
119  (options.write_combining == cpu_write_combining::with_wc ? CU_MEMHOSTALLOC_WRITECOMBINED : 0);
120 }
121 
122 } // namespace detail_
123 
131 namespace mapped {
132 
133 // TODO: Perhaps make this an array of size 2 and use aspects to index it?
134 
/// A pair of typed spans over the two sides — host and device — of a single
/// mapped host-and-device allocation.
template <typename T>
struct span_pair_t {
	span<T> host_side, device_side;

	/// Decompose into a plain pair of typed spans
	constexpr operator ::std::pair<span<T>, span<T>>() const { return { host_side, device_side }; }
	/// Decompose into a pair of untyped memory regions (relies on span-to-region
	/// conversion; presumably lossy of the element type — confirm)
	constexpr operator ::std::pair<region_t, region_t>() const { return { host_side, device_side }; }
};
152 
164 
166  template <typename T>
167  constexpr span_pair_t<T> as_spans() const
168  {
169  return { host_side.as_span<T>(), device_side.as_span<T>() };
170  }
171 };
172 
173 } // namespace mapped
174 
176 namespace device {
177 
178 namespace detail_ {
179 
// Allocates global device memory within the current context.
// On CUDA 11.2+, if a stream handle is provided, the allocation is
// stream-ordered (cuMemAllocAsync); otherwise it is a synchronous cuMemAlloc.
// Throws on failure; a success status with a null result is normalized to
// status::unknown so callers never receive a null region.
#if CUDA_VERSION >= 11020
inline cuda::memory::region_t allocate_in_current_context(
	size_t num_bytes, optional<stream::handle_t> stream_handle = {})
#else
inline cuda::memory::region_t allocate_in_current_context(size_t num_bytes)
#endif
{
#if CUDA_VERSION >= 11020
	if (stream_handle) {
		device::address_t allocated = 0;
		// Note: the typed cudaMalloc also takes its size in bytes, apparently,
		// not in number of elements
		auto status = cuMemAllocAsync(&allocated, num_bytes, *stream_handle);
		if (is_success(status) && allocated == 0) {
			// Can this even happen? hopefully not
			status = static_cast<decltype(status)>(status::unknown);
		}
		throw_if_error_lazy(status,
			"Failed scheduling an asynchronous allocation of " + ::std::to_string(num_bytes) +
			" bytes of global memory on " + stream::detail_::identify(*stream_handle, context::current::detail_::get_handle()) );
		return {as_pointer(allocated), num_bytes};
	}
#endif
	device::address_t allocated = 0;
	auto status = cuMemAlloc(&allocated, num_bytes);
	if (is_success(status) && allocated == 0) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::unknown);
	}
	throw_if_error_lazy(status, "Failed allocating " + ::std::to_string(num_bytes) +
		" bytes of global memory on the current CUDA device");
	return {as_pointer(allocated), num_bytes};
}
218 
// Allocates device-global memory within the specified context: temporarily
// makes that context current, then delegates to allocate_in_current_context().
#if CUDA_VERSION >= 11020
inline region_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes,
	optional<stream::handle_t> stream_handle = {})
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return allocate_in_current_context(size_in_bytes, stream_handle);
}
#else
inline region_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return allocate_in_current_context(size_in_bytes);
}
#endif
237 
// Schedules a stream-ordered free (cuMemFreeAsync, CUDA 11.2+) of a region
// previously allocated with the stream-ordered allocator.
#if CUDA_VERSION >= 11020
inline void free_on_stream(
	void* allocated_region_start,
	stream::handle_t stream_handle)
{
	auto status = cuMemFreeAsync(device::address(allocated_region_start), stream_handle);
	throw_if_error_lazy(status,
		"Failed scheduling an asynchronous freeing of the global memory region starting at "
		+ cuda::detail_::ptr_as_hex(allocated_region_start) + " on "
		+ stream::detail_::identify(stream_handle));
}
#endif // CUDA_VERSION >= 11020
250 
// Synchronously frees device-global memory. The context handle parameter is
// used only for error reporting — the actual cuMemFree acts on the pointer.
// Unless CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT is defined, freeing memory
// whose context has already been destroyed is silently tolerated.
inline void free_in_current_context(
	context::handle_t current_context_handle,
	void* allocated_region_start)
{
	auto result = cuMemFree(address(allocated_region_start));
	if (result == status::success) { return; }
#ifndef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
	if (result == status::context_is_destroyed) { return; }
#endif
	throw runtime_error(result, "Freeing device memory at "
		+ cuda::detail_::ptr_as_hex(allocated_region_start)
		+ " in " + context::detail_::identify(current_context_handle));
}
264 
265 } // namespace detail_
266 
268 #if CUDA_VERSION >= 11020
269 inline void free(void* region_start, optional_ref<const stream_t> stream = {});
270 #else
271 inline void free(void* ptr);
272 #endif
273 
// Region-based convenience overload of device::free(); on CUDA 11.2+ the free
// may optionally be stream-ordered.
#if CUDA_VERSION >= 11020
inline void free(region_t region, optional_ref<const stream_t> stream = {})
{
	free(region.start(), stream);
}
#else
inline void free(region_t region)
{
	free(region.start());
}
#endif
287 
288 #if CUDA_VERSION >= 11020
289 
302 region_t allocate(size_t size_in_bytes, optional_ref<const stream_t> stream);
303 #endif
304 
318 inline region_t allocate(const context_t& context, size_t size_in_bytes);
319 
333 inline region_t allocate(const device_t& device, size_t size_in_bytes);
334 
335 namespace detail_ {
336 
337 // Note: Allocates _in the current context_! No current context => failure!
// Note: Allocates _in the current context_! No current context => failure!
struct allocator {
	// Returns the raw start address of a freshly-allocated device region
	void* operator()(size_t num_bytes) const {
		return detail_::allocate_in_current_context(num_bytes).start();
	}
};
343 
// Deleter matching the allocator above; usable e.g. with unique-pointer-like types
struct deleter {
	void operator()(void* ptr) const { cuda::memory::device::free(ptr); }
};
347 
348 } // namespace detail_
349 
362 template <typename T>
363 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream = {});
364 
387 inline void set(void* start, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {})
388 {
389  return typed_set<unsigned char>(
390  static_cast<unsigned char*>(start),
391  static_cast<unsigned char>(byte_value),
392  num_bytes,
393  stream);
394 }
395 
405 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
406 {
407  set(region.start(), byte_value, region.size(), stream);
408 }
409 
418 inline void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream = {})
419 {
420  set(start, 0, num_bytes, stream);
421 }
422 
430 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
431 {
432  zero(region.start(), region.size(), stream);
433 }
434 
442 template <typename T>
443 inline void zero(T* ptr, optional_ref<const stream_t> stream = {})
444 {
445  zero(ptr, sizeof(T), stream);
446 }
447 
448 } // namespace device
449 
451 namespace detail_ {
452 
454 
// Schedules an async copy of num_bytes on the given stream; the driver infers
// the direction (H2D/D2H/D2D) from the unified-virtual-address-space pointers.
inline void copy(void* destination, const void* source, size_t num_bytes, stream::handle_t stream_handle)
{
	auto result = cuMemcpyAsync(device::address(destination), device::address(source), num_bytes, stream_handle);

	// TODO: Determine whether it was from host to device, device to host etc and
	// add this information to the error string
	throw_if_error_lazy(result, "Scheduling a memory copy on " + stream::detail_::identify(stream_handle));
}
474 
// Region-based variant: copies source.size() bytes, with a debug-only check
// that the destination region is large enough.
inline void copy(region_t destination, const_region_t source, stream::handle_t stream_handle)
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::logic_error("Source size exceeds destination size");
	}
#endif
	copy(destination.start(), source.start(), source.size(), stream_handle);
}
492 
494 
// 2D copy within the current context, using the tag-dispatch idiom on the
// dimensionality. Returns the raw driver status rather than throwing.
inline status_t multidim_copy_in_current_context(
	::std::integral_constant<dimensionality_t, 2>,
	copy_parameters_t<2> params,
	optional<stream::handle_t> stream_handle)
{
	// Must be an intra-context copy, because CUDA does not support 2D inter-context copies and the copy parameters
	// structure holds no information about contexts.
	//
	// Note: The stream handle, even if present, might be the null handle; for now
	// we distinguish between using the null stream handle - the default stream's -
	// and using the synchronous API
	return stream_handle ?
		cuMemcpy2DAsync(&params, *stream_handle) :
		cuMemcpy2D(&params);
}
510 
// 3D copy within the current context. Intra-context copies reinterpret the
// parameters as the (layout-compatible) intra-context structure; inter-context
// copies go through the peer-copy driver calls. Returns raw driver status.
inline status_t multidim_copy_in_current_context(
	::std::integral_constant<dimensionality_t, 3>,
	copy_parameters_t<3> params,
	optional<stream::handle_t> stream_handle)
{
	if (params.srcContext == params.dstContext) {
		// TODO: Should we check it's also the current context?
		using intra_context_type = memory::detail_::base_copy_params<3>::intra_context_type;
		auto* intra_context_params = reinterpret_cast<intra_context_type *>(&params);
		return stream_handle ?
			cuMemcpy3DAsync(intra_context_params, *stream_handle) :
			cuMemcpy3D(intra_context_params);
	}
	return stream_handle ?
		cuMemcpy3DPeerAsync(&params, *stream_handle) :
		cuMemcpy3DPeer(&params);
}
528 
// Tag-dispatches to the 2D or 3D implementation above based on NumDimensions.
template<dimensionality_t NumDimensions>
status_t multidim_copy_in_current_context(copy_parameters_t<NumDimensions> params, optional<stream::handle_t> stream_handle) {
	return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
533 
534 // Note: Assumes the stream handle is for a stream in the current context
535 template<dimensionality_t NumDimensions>
536 status_t multidim_copy(
537  context::handle_t context_handle,
539  optional<stream::handle_t> stream_handle)
540 {
541  CAW_SET_SCOPE_CONTEXT(context_handle);
542  return multidim_copy_in_current_context(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
543 }
544 
545 // Assumes the array and the stream share the same context, and that the destination is
546 // accessible from that context (e.g. allocated within it, or being managed memory, etc.)
// Assumes the array and the stream share the same context, and that the destination is
// accessible from that context (e.g. allocated within it, or being managed memory, etc.)
template <typename T, dimensionality_t NumDimensions>
void copy(T *destination, const array_t<T, NumDimensions>& source, optional<stream::handle_t> stream_handle)
{
	using memory::endpoint_t;
	auto dims = source.dimensions();
	//auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
	// Build a full copy-parameters structure: array source, raw-pointer
	// destination with the same extents, default pitches.
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source);
	params.set_endpoint(endpoint_t::destination, const_cast<T*>(destination), dims);
	params.set_default_pitches();
	params.clear_rest();
	auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
	throw_if_error(status, "Scheduling an asynchronous copy from an array into a regular memory region");
}
564 
565 
// Mirror of the above: copies from a raw-pointer source into a CUDA array,
// using the destination array's dimensions for the extents.
template <typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, const T* source, optional<stream::handle_t> stream_handle)
{
	using memory::endpoint_t;
	auto dims = destination.dimensions();
	//auto params = make_multidim_copy_params(destination, const_cast<T*>(source), destination.dimensions());
	auto params = copy_parameters_t<NumDimensions>{};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, const_cast<T*>(source), dims);
	params.set_endpoint(endpoint_t::destination, destination);
	params.set_default_pitches();
	params.clear_rest();
	auto status = multidim_copy_in_current_context<NumDimensions>(params, stream_handle);
	throw_if_error(status, "Scheduling an asynchronous copy from regular memory into an array");
}
583 
// Copies a single value of type T; asynchronous when a stream handle is given.
template <typename T>
void copy_single(T* destination, const T* source, optional<stream::handle_t> stream_handle)
{
	copy(destination, source, sizeof(T), stream_handle);
}
603 
604 } // namespace detail_
605 
616 
// Copies all N elements of a C array into a typed span, with a debug-only
// capacity check (span sizes are in elements here, matching N).
template <typename T, size_t N>
inline void copy(span<T> destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < N) {
		throw ::std::logic_error("Source size exceeds destination size");
	}
#endif
	return copy(destination.data(), source, sizeof(T) * N, stream);
}
636 
646 template <typename T, size_t N>
647 void copy(c_array<T,N>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
648 {
649 #ifndef NDEBUG
650  if (source.size() > N) {
651  throw ::std::invalid_argument(
652  "Attempt to copy a span of " + ::std::to_string(source.size()) +
653  " elements into an array of " + ::std::to_string(N) + " elements");
654  }
655 #endif
656  return copy(destination, source.start(), sizeof(T) * N, stream);
657 }
658 
// Copies all N elements of a C array to an arbitrary (presumably
// device-accessible) destination address; no size check is possible here.
template <typename T, size_t N>
inline void copy(void* destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
{
	return copy(destination, source, sizeof(T) * N, stream);
}
673 
// Copies N elements from a raw pointer into a C array.
// NOTE(review): `source` is only read from — it could likely be `const T*`;
// confirm no caller relies on the non-const overload resolution.
template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, T* source, optional_ref<const stream_t> stream = {})
{
	return copy(destination, source, sizeof(T) * N, stream);
}
696 
698 
711 void set(void* ptr, int byte_value, size_t num_bytes, optional_ref<const stream_t> stream = {});
712 
724 inline void set(region_t region, int byte_value, optional_ref<const stream_t> stream = {})
725 {
726  return set(region.start(), byte_value, region.size(), stream);
727 }
728 
736 inline void zero(region_t region, optional_ref<const stream_t> stream = {})
737 {
738  return set(region, 0, stream);
739 }
740 
749 inline void zero(void* ptr, size_t num_bytes, optional_ref<const stream_t> stream = {})
750 {
751  return set(ptr, 0, num_bytes, stream);
752 }
753 
761 template <typename T>
762 inline void zero(T* ptr)
763 {
764  zero(ptr, sizeof(T));
765 }
766 
767 namespace detail_ {
768 
// 2D copy that works even with no current context: falls back to pushing the
// current device's primary context for the duration of the call.
inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 2> two, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
{
	// TODO: Move this logic into the scoped ensurer class
	auto context_handle = context::current::detail_::get_handle();
	if (context_handle != context::detail_::none) {
		return detail_::multidim_copy_in_current_context(two, params, stream_handle);
	}
	auto current_device_id = cuda::device::current::detail_::get_id();
	context_handle = cuda::device::primary_context::detail_::obtain_and_increase_refcount(current_device_id);
	context::current::detail_::push(context_handle);
	// Note this _must_ be an intra-context copy, as inter-context is not supported
	// and there's no indication of context in the relevant data structures
	auto status = detail_::multidim_copy_in_current_context(two, params, stream_handle);
	context::current::detail_::pop();
	cuda::device::primary_context::detail_::decrease_refcount(current_device_id);
	return status;
}
786 
// 2D copy performed with the given context temporarily made current.
inline status_t multidim_copy(context::handle_t context_handle, ::std::integral_constant<dimensionality_t, 2>, copy_parameters_t<2> params, optional<stream::handle_t> stream_handle)
{
	context::current::detail_::scoped_override_t context_for_this_scope(context_handle);
	return multidim_copy(::std::integral_constant<dimensionality_t, 2>{}, params, stream_handle);
}
792 
// 3D copy: intra-context copies ensure the source/destination context is
// current; inter-context copies use the driver's peer-copy calls, which carry
// their own context information.
inline status_t multidim_copy(::std::integral_constant<dimensionality_t, 3>, copy_parameters_t<3> params, optional<stream::handle_t> stream_handle)
{
	if (params.srcContext == params.dstContext) {
		context::current::detail_::scoped_ensurer_t ensure_context_for_this_scope{params.srcContext};
		return detail_::multidim_copy_in_current_context(params, stream_handle);
	}
	return stream_handle ?
		cuMemcpy3DPeerAsync(&params, *stream_handle) :
		cuMemcpy3DPeer(&params);
}
803 
// Tag-dispatch entry point for the above overloads.
// NOTE(review): takes a bare stream::handle_t while the in-current-context
// dispatcher takes an optional<> — confirm whether this asymmetry is intended.
template<dimensionality_t NumDimensions>
status_t multidim_copy(copy_parameters_t<NumDimensions> params, stream::handle_t stream_handle)
{
	return multidim_copy(::std::integral_constant<dimensionality_t, NumDimensions>{}, params, stream_handle);
}
809 
810 
811 } // namespace detail_
812 
823 template<dimensionality_t NumDimensions>
824 void copy(copy_parameters_t<NumDimensions> params, optional_ref<const stream_t> stream = {});
825 
// Copies from a raw pointer in a specified context into a CUDA array, using
// the destination array's dimensions for the extents.
template<typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, const context_t& source_context, const T *source, optional_ref<const stream_t> stream = {})
{
	auto dims = destination.dimensions();
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offsets();
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source_context.handle(), const_cast<T*>(source), dims);
	params.set_endpoint(endpoint_t::destination, destination);
	// NOTE(review): sibling overloads also call set_default_pitches() before
	// clear_rest() — confirm whether its omission here is intentional.
	params.clear_rest();
	copy(params, stream);
}
850 
869 template <typename T, dimensionality_t NumDimensions>
870 void copy(array_t<T, NumDimensions>& destination, const T* source, optional_ref<const stream_t> stream = {});
871 
// Copies a span's elements into a CUDA array, with a debug-only check that the
// array has room for them (both sizes in elements).
template<typename T, dimensionality_t NumDimensions>
void copy(const array_t<T, NumDimensions>& destination, span<T const> source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a span of " + ::std::to_string(source.size()) +
			" elements into a CUDA array of " + ::std::to_string(destination.size()) + " elements");
	}
#endif
	copy(destination, source.data(), stream);
}
892 
// Copies a CUDA array into a raw-pointer destination residing in (or
// accessible from) the specified context.
template <typename T, dimensionality_t NumDimensions>
void copy(const context_t& context, T *destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
{
	auto dims = source.dimensions();
	auto params = copy_parameters_t<NumDimensions> {};
	params.clear_offset(endpoint_t::source);
	params.clear_offset(endpoint_t::destination);
	params.template set_extent<T>(dims);
	params.set_endpoint(endpoint_t::source, source);
	params.template set_endpoint<T>(endpoint_t::destination, context.handle(), destination, dims);
	params.set_default_pitches();
	params.clear_rest();
	copy(params, stream);
}
917 
936 template <typename T, dimensionality_t NumDimensions>
937 void copy(T* destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {});
938 
939 
// Copies a CUDA array into a typed span, with a debug-only capacity check
// (both sizes in elements).
template <typename T, dimensionality_t NumDimensions>
void copy(span<T> destination, const array_t<T, NumDimensions>& source, optional_ref <const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a CUDA array of " + ::std::to_string(source.size()) +
			" elements into a span of " + ::std::to_string(destination.size()) + " elements");
	}
#endif
	copy(destination.data(), source, stream);
}
959 
967 template <typename T, dimensionality_t NumDimensions>
968 void copy(const array_t<T, NumDimensions>& destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream)
969 {
970  auto dims = source.dimensions();
971  auto params = copy_parameters_t<NumDimensions> {};
972  params.clear_offset(endpoint_t::source);
973  params.clear_offset(endpoint_t::destination);
974  params.template set_extent<T>(dims);
975  params.set_endpoint(endpoint_t::source, source);
976  params.set_endpoint(endpoint_t::destination, destination);
977  params.set_default_pitches();
978  params.clear_rest();
979  auto status = //(source.context() == destination.context()) ?
980  detail_::multidim_copy<NumDimensions>(source.context_handle(), params, stream);
981  throw_if_error_lazy(status, "Copying from a CUDA array into a regular memory region");
982 }
983 
// Copies a CUDA array into an untyped memory region, with a debug-only check
// comparing byte counts on both sides.
template <typename T, dimensionality_t NumDimensions>
void copy(region_t destination, const array_t<T, NumDimensions>& source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < source.size_bytes()) {
		throw ::std::invalid_argument(
			"Attempt to copy " + ::std::to_string(source.size_bytes()) + " bytes from an array into a "
			"region of smaller size (" + ::std::to_string(destination.size()) + " bytes)");
	}
#endif
	copy(destination.start(), source, stream);
}
1013 
// Copies an untyped memory region into a CUDA array, with a debug-only check
// comparing byte counts on both sides.
template <typename T, dimensionality_t NumDimensions>
void copy(array_t<T, NumDimensions>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size_bytes() < source.size()) {
		throw ::std::invalid_argument(
			"Attempt to copy a region of " + ::std::to_string(source.size()) +
			" bytes into an array of size " + ::std::to_string(destination.size_bytes()) + " bytes");
	}
#endif
	copy(destination, static_cast<T const*>(source.start()), stream);
}
1039 
1056 template <typename T>
1057 void copy_single(T* destination, const T* source, optional_ref<const stream_t> stream = {});
1058 
1077 void copy(void* destination, void const* source, size_t num_bytes, optional_ref<const stream_t> stream = {});
1078 
1079 
// Copies an untyped region into a C array; always copies the array's full
// N * sizeof(T) bytes, so the debug check demands an exact size match
// (strict equality appears intentional — confirm).
template <typename T, size_t N>
inline void copy(c_array<T,N>& destination, const_region_t source, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	size_t required_size = N * sizeof(T);
	if (source.size() != required_size) {
		throw ::std::invalid_argument(
			"Attempt to copy a region of " + ::std::to_string(source.size()) +
			" bytes into an array of size " + ::std::to_string(required_size) + " bytes");
	}
#endif
	return copy(&(destination[0]), source.start(), sizeof(T) * N, stream);
}
1112 
1136 template <typename T, size_t N>
1137 inline void copy(region_t destination, c_array<const T,N> const& source, optional_ref<const stream_t> stream = {})
1138 {
1139 #ifndef NDEBUG
1140  if (destination.size() < N) {
1141  throw ::std::logic_error("Source size exceeds destination size");
1142  }
1143 #endif
1144  return copy(destination.start(), source, sizeof(T) * N, stream);
1145 }
1146 
1147 
// Copies num_bytes bytes between two untyped regions, with a debug-only check
// against the destination's capacity (the source's size is checked by a
// sibling overload when relevant).
inline void copy(region_t destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < num_bytes) {
		throw ::std::logic_error("Attempt to copy beyond the end of the destination region");
	}
#endif
	copy(destination.start(), source.start(), num_bytes, stream);
}
1168 
1169 
1186 inline void copy(region_t destination, const_region_t source, optional_ref<const stream_t> stream = {})
1187 {
1188  copy(destination, source, source.size(), stream);
1189 }
1190 
1191 
// Fills the destination region from an unsized source pointer; the copy length
// is taken from the destination. NOTE(review): `source` is only read — it could
// likely be `const void*`; confirm overload-resolution impact before changing.
inline void copy(region_t destination, void* source, optional_ref<const stream_t> stream = {})
{
	return copy(destination.start(), source, destination.size(), stream);
}
1213 
// Copies num_bytes from an unsized source pointer into a region, with a
// debug-only destination-capacity check.
inline void copy(region_t destination, void* source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (destination.size() < num_bytes) {
		throw ::std::logic_error("Number of bytes to copy exceeds destination size");
	}
#endif
	return copy(destination.start(), source, num_bytes, stream);
}
1240 
// Copies num_bytes from a sized source region to an unsized destination
// pointer, with a debug-only source-extent check.
inline void copy(void* destination, const_region_t source, size_t num_bytes, optional_ref<const stream_t> stream = {})
{
#ifndef NDEBUG
	if (source.size() < num_bytes) {
		throw ::std::logic_error("Attempt to copy more than the source region's size");
	}
#endif
	copy(destination, source.start(), num_bytes, stream);
}
1269 
1284 inline void copy(void* destination, const_region_t source, optional_ref<const stream_t> stream = {})
1285 {
1286  copy(destination, source, source.size(), stream);
1287 }
1288 
1289 namespace device {
1290 
1291 namespace detail_ {
1292 
// Schedules a byte-wise memset of device memory on the given stream.
inline void set(void* start, int byte_value, size_t num_bytes, stream::handle_t stream_handle)
{
	// TODO: Double-check that this call doesn't require setting the current device
	auto result = cuMemsetD8Async(address(start), static_cast<unsigned char>(byte_value), num_bytes, stream_handle);
	throw_if_error_lazy(result, "asynchronously memsetting an on-device buffer");
}
1299 
1300 
1301 inline void set(region_t region, int byte_value, stream::handle_t stream_handle)
1302 {
1303  set(region.start(), byte_value, region.size(), stream_handle);
1304 }
1305 
1306 inline void zero(void* start, size_t num_bytes, stream::handle_t stream_handle)
1307 {
1308  set(start, 0, num_bytes, stream_handle);
1309 }
1310 
1311 inline void zero(region_t region, stream::handle_t stream_handle)
1312 {
1313  zero(region.start(), region.size(), stream_handle);
1314 }
1315 
1316 // TODO: Drop this in favor of <algorithm>-like functions under `cuda::`.
1317 template <typename T>
1318 inline void typed_set(T* start, const T& value, size_t num_elements, stream::handle_t stream_handle)
1319 {
1320  static_assert(::std::is_trivially_copyable<T>::value, "Non-trivially-copyable types cannot be used for setting memory");
1321  static_assert(
1322  sizeof(T) == 1 or sizeof(T) == 2 or
1323  sizeof(T) == 4 or sizeof(T) == 8,
1324  "Unsupported type size - only sizes 1, 2 and 4 are supported");
1325  // TODO: Consider checking for alignment when compiling without NDEBUG
1326  status_t result = static_cast<status_t>(cuda::status::success);
1327  switch(sizeof(T)) {
1328  case(1): result = cuMemsetD8Async (address(start), reinterpret_cast<const ::std::uint8_t& >(value), num_elements, stream_handle); break;
1329  case(2): result = cuMemsetD16Async(address(start), reinterpret_cast<const ::std::uint16_t&>(value), num_elements, stream_handle); break;
1330  case(4): result = cuMemsetD32Async(address(start), reinterpret_cast<const ::std::uint32_t&>(value), num_elements, stream_handle); break;
1331  }
1332  throw_if_error_lazy(result, "Setting global device memory bytes");
1333 }
1334 
1335 } // namespace detail_
1336 
1337 
1349 template <typename T>
1350 void typed_set(T* start, const T& value, size_t num_elements, optional_ref<const stream_t> stream);
1351 
1360 void zero(void* start, size_t num_bytes, optional_ref<const stream_t> stream);
1361 
1362 } // namespace device
1363 
1364 namespace inter_context {
1365 
1366 void copy(
1367  void * destination,
1368  const context_t& destination_context,
1369  const void * source_address,
1370  const context_t& source_context,
1371  size_t num_bytes,
1372  optional_ref<const stream_t> stream);
1373 
1374 /*
1375 inline void copy(
1376  region_t destination,
1377  const context_t& destination_context,
1378  const_region_t source,
1379  const context_t& source_context,
1380  optional_ref<const stream_t> stream)
1381 {
1382 #ifndef NDEBUG
1383  if (destination.size() < source.size()) {
1384  throw ::std::invalid_argument(
1385  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1386  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1387  }
1388 #endif
1389  copy(destination.start(), destination_context, source, source_context, stream);
1390 }
1391 */
1392 
1393 
1394 /*
1395 
1396 template <typename T, dimensionality_t NumDimensions>
1397 inline void copy(
1398  array_t<T, NumDimensions> destination,
1399  array_t<T, NumDimensions> source,
1400  optional_ref<const stream_t> stream)
1401 {
1402  // for arrays, a single mechanism handles both intra- and inter-context copying
1403  return memory::copy(destination, source, stream);
1404 }
1405 */
1406 
1407 namespace detail_ {
1408 
1432 } // namespace detail_
1433 
1435 void copy(
1436  void * destination_address,
1437  const context_t& destination_context,
1438  const void * source_address,
1439  const context_t& source_context,
1440  size_t num_bytes,
1441  optional_ref<const stream_t> stream);
1442 
1444 inline void copy(
1445  void * destination,
1446  const context_t& destination_context,
1447  const_region_t source,
1448  const context_t& source_context,
1449  optional_ref<const stream_t> stream)
1450 {
1451  copy(destination, destination_context, source.start(), source_context, source.size(), stream);
1452 }
1453 
1455 inline void copy(
1456  region_t destination,
1457  const context_t& destination_context,
1458  const void* source,
1459  const context_t& source_context,
1460  optional_ref<const stream_t> stream)
1461 {
1462  copy(destination.start(), destination_context, source, source_context, destination.size(), stream);
1463 }
1464 
1466 inline void copy(
1467  region_t destination,
1468  const context_t& destination_context,
1469  const_region_t source,
1470  const context_t& source_context,
1471  optional_ref<const stream_t> stream)
1472 {
1473 #ifndef NDEBUG
1474  if (destination.size() < destination.size()) {
1475  throw ::std::invalid_argument(
1476  "Attempt to copy a region of " + ::std::to_string(source.size()) +
1477  " bytes into a region of size " + ::std::to_string(destination.size()) + " bytes");
1478  }
1479 #endif
1480  copy(destination.start(), destination_context, source, source_context, stream);
1481 }
1482 
1484 template <typename T, dimensionality_t NumDimensions>
1485 inline void copy(
1486  array_t<T, NumDimensions> destination,
1488  optional_ref<const stream_t> stream)
1489 {
1490  // for arrays, a single mechanism handles both intra- and inter-context copying
1491  return memory::copy(destination, source, stream);
1492 }
1493 
1494 } // namespace inter_context
1495 
1498 namespace host {
1499 
1500 namespace detail_ {
1501 
1502 // Even though the pinned memory should not in principle be associated in principle with a context or a device, in
1503 // practice it needs to be registered somewhere - and that somewhere is a context. Passing a context does not mean
1504 // the allocation will have special affinity to the device terms of better performance etc.
1505 inline region_t allocate(
1506  context::handle_t context_handle,
1507  size_t size_in_bytes,
1508  allocation_options options);
1509 
1510 } // namespace detail_
1511 
1529 region_t allocate(size_t size_in_bytes, allocation_options options);
1530 
1543  size_t size_in_bytes,
1545  cpu_write_combining cpu_wc = cpu_write_combining(false))
1546 {
1547  return allocate(size_in_bytes, allocation_options{ portability, cpu_wc } );
1548 }
1549 
1551 inline region_t allocate(size_t size_in_bytes, cpu_write_combining cpu_wc)
1552 {
1553  return allocate(size_in_bytes, allocation_options{ portability_across_contexts(false), cpu_write_combining(cpu_wc)} );
1554 }
1555 
1563 inline void free(void* host_ptr)
1564 {
1565  auto result = cuMemFreeHost(host_ptr);
1566 #ifdef CAW_THROW_ON_FREE_IN_DESTROYED_CONTEXT
1567  if (result == status::success) { return; }
1568 #else
1569  if (result == status::success or result == status::context_is_destroyed) { return; }
1570 #endif
1571  throw runtime_error(result, "Freeing pinned host memory at " + cuda::detail_::ptr_as_hex(host_ptr));
1572 }
1573 
1579 inline void free(region_t region) { return free(region.data()); }
1580 
1581 namespace detail_ {
1582 
// Functor which allocates pinned host memory via host::allocate() and yields
// the raw pointer (e.g. for use as a generic allocation policy)
struct allocator {
	void* operator()(size_t num_bytes) const { return cuda::memory::host::allocate(num_bytes).data(); }
};
// Functor which releases pinned host memory previously obtained via host::allocate()
struct deleter {
	void operator()(void* ptr) const { cuda::memory::host::free(ptr); }
};
1589 
// Raw wrapper for cuMemHostRegister: page-locks the host-memory stretch
// [ptr, ptr+size) with the given raw driver flags; throws on failure
inline void register_(const void *ptr, size_t size, unsigned flags)
{
	// const_cast: the driver call takes a non-const pointer even though the
	// memory contents are not modified by registration
	auto result = cuMemHostRegister(const_cast<void *>(ptr), size, flags);
	throw_if_error_lazy(result,
		"Could not register and page-lock the region of " + ::std::to_string(size) +
		" bytes of host memory at " + cuda::detail_::ptr_as_hex(ptr) +
		" with flags " + cuda::detail_::as_hex(flags));
}
1609 
// Register (page-lock) an entire host-memory region with the given raw driver flags
inline void register_(const_region_t region, unsigned flags)
{
	register_(region.start(), region.size(), flags);
}
1614 
1615 } // namespace detail_
1616 
1623 enum mapped_io_space : bool {
1624  is_mapped_io_space = true,
1625  is_not_mapped_io_space = false
1626 };
1627 
1635  map_into_device_memory = true,
1636  do_not_map_into_device_memory = false
1637 };
1638 
1647 };
1648 
1675 inline void register_(const void *ptr, size_t size,
1676  bool register_mapped_io_space,
1677  bool map_into_device_space,
1678  bool make_device_side_accessible_to_all
1679 #if CUDA_VERSION >= 11010
1680  , bool considered_read_only_by_device
1681 #endif // CUDA_VERSION >= 11010
1682  )
1683 {
1685  ptr, size,
1686  (register_mapped_io_space ? CU_MEMHOSTREGISTER_IOMEMORY : 0)
1687  | (map_into_device_space ? CU_MEMHOSTREGISTER_DEVICEMAP : 0)
1688  | (make_device_side_accessible_to_all ? CU_MEMHOSTREGISTER_PORTABLE : 0)
1689 #if CUDA_VERSION >= 11010
1690  | (considered_read_only_by_device ? CU_MEMHOSTREGISTER_READ_ONLY : 0)
1691 #endif // CUDA_VERSION >= 11010
1692  );
1693 }
1694 
/// Register (page-lock) a pre-allocated host-memory region with the CUDA
/// driver; see the pointer-and-size overload for the meaning of each flag.
inline void register_(
	const_region_t region,
	bool register_mapped_io_space,
	bool map_into_device_space,
	bool make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
	, bool considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
	)
{
	register_(
		region.start(),
		region.size(),
		register_mapped_io_space,
		map_into_device_space,
		make_device_side_accessible_to_all
#if CUDA_VERSION >= 11010
		, considered_read_only_by_device
#endif // CUDA_VERSION >= 11010
	);
}
1741 
1756 inline void register_(void const *ptr, size_t size)
1757 {
1758  unsigned no_flags_set { 0 };
1759  detail_::register_(ptr, size, no_flags_set);
1760 }
1761 
/// Register (page-lock) a host-memory region with the CUDA driver, with no
/// special registration flags
inline void register_(const_region_t region)
{
	register_(region.start(), region.size());
}
1779 
1787 inline void deregister(const void *ptr)
1788 {
1789  auto result = cuMemHostUnregister(const_cast<void *>(ptr));
1790  throw_if_error_lazy(result,
1791  "Could not unregister the memory segment starting at address *a");
1792 }
1793 
/// Have the CUDA driver "forget" about a previously-registered memory region
inline void deregister(const_region_t region)
{
	deregister(region.start());
}
1799 
1806 
/// Sets all bytes in a stretch of host-side memory to a single value
/// (thin wrapper over ::std::memset)
inline void set(void* start, int byte_value, size_t num_bytes)
{
	::std::memset(start, byte_value, num_bytes);
	// TODO: Error handling?
}

/// Sets all bytes of a host-side memory region to a single value
inline void set(region_t region, int byte_value)
{
	// NOTE(review): this delegates to the generic memory::set (with no stream)
	// rather than to host::set above - presumably to dispatch on the region's
	// actual memory type; confirm this is intentional
	memory::set(region.start(), byte_value, region.size(), nullopt);
}
1826 
/// Zero-out a stretch of host-side memory
inline void zero(void* start, size_t num_bytes)
{
	set(start, 0, num_bytes);
}

/// Zero-out a host-side memory region
inline void zero(region_t region)
{
	host::set(region, 0);
}

/// Zero-out a single (host-side) object of type T, given a pointer to it
template <typename T>
inline void zero(T* ptr)
{
	zero(ptr, sizeof(T));
}
1859 
1860 
1861 } // namespace host
1862 
1863 namespace managed {
1864 
1865 namespace range {
1866 
1867 namespace detail_ {
1868 
1869 using attribute_t = CUmem_range_attribute;
1870 using advice_t = CUmem_advise;
1871 
// Retrieve a single scalar attribute of a managed memory range; the driver
// writes a 4-byte value, which is then cast to the requested type T
template <typename T>
inline T get_scalar_attribute(const_region_t region, attribute_t attribute)
{
	uint32_t attribute_value { 0 };
	auto result = cuMemRangeGetAttribute(
		&attribute_value, sizeof(attribute_value), attribute, device::address(region.start()), region.size());
	throw_if_error_lazy(result,
		"Obtaining an attribute for a managed memory range at " + cuda::detail_::ptr_as_hex(region.start()));
	return static_cast<T>(attribute_value);
}
1882 
// CUDA's range "advice" is simply a way to set the attributes of a range; unfortunately that's
// not called cuMemRangeSetAttribute, and uses a different enum.
//
// Before CUDA 13.0, cuMemAdvise only accepted a device ID rather than a full
// location structure, so only device-type locations can be targeted there.
inline void advise(const_region_t region, advice_t advice, location_t location)
{
	auto address = device::address(region.start());
#if CUDA_VERSION >= 13000
	auto result = cuMemAdvise(address, region.size(), advice, location);
#else
	if (location.type != CU_MEM_LOCATION_TYPE_DEVICE) {
		throw runtime_error(status::named_t::not_supported,
			"Advising on memory other than on CUDA devices is not supported before CUDA 13.0");
	}
	auto result = cuMemAdvise(address, region.size(), advice, location.id);
#endif
	throw_if_error_lazy(result, "Setting an attribute for a managed memory range at "
		+ cuda::detail_::ptr_as_hex(region.start()) + " in " + cuda::memory::detail_::identify(location));
}
1900 
// Convenience overload: advise with the target given as a numeric device ID
inline void advise(const_region_t region, advice_t advice, cuda::device::id_t device_id)
{
	advise(region, advice, pool::detail_::create_mem_location(device_id));
}
1905 
1906 inline advice_t as_advice(attribute_t attribute, bool set)
1907 {
1908  switch (attribute) {
1909  case CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY:
1910  return set ? CU_MEM_ADVISE_SET_READ_MOSTLY : CU_MEM_ADVISE_UNSET_READ_MOSTLY;
1911  case CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION:
1912  return set ? CU_MEM_ADVISE_SET_PREFERRED_LOCATION : CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION;
1913  case CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY:
1914  return set ? CU_MEM_ADVISE_SET_ACCESSED_BY : CU_MEM_ADVISE_UNSET_ACCESSED_BY;
1915  default:
1916  throw ::std::invalid_argument(
1917  "CUDA memory range attribute does not correspond to any range advice value");
1918  }
1919 }
1920 
// Turn a settable range attribute "on" for a region, with respect to the given device
inline void set_attribute(const_region_t region, attribute_t settable_attribute, cuda::device::id_t device_id)
{
	static constexpr const bool set { true };
	advise(region, as_advice(settable_attribute, set), device_id);
}

// Turn a settable range attribute "on" for a region; a dummy device ID is
// passed - presumably irrelevant for the advice values used this way (confirm)
inline void set_attribute(const_region_t region, attribute_t settable_attribute)
{
	static constexpr const bool set { true };
	static constexpr const cuda::device::id_t dummy_device_id { 0 };
	advise(region, as_advice(settable_attribute, set), dummy_device_id);
}

// Turn a settable range attribute "off" for a region; see set_attribute
inline void unset_attribute(const_region_t region, attribute_t settable_attribute)
{
	static constexpr const bool unset { false };
	static constexpr const cuda::device::id_t dummy_device_id { 0 };
	advise(region, as_advice(settable_attribute, unset), dummy_device_id);
}
1940 
1941 } // namespace detail_
1942 
1943 } // namespace range
1944 
1945 namespace detail_ {
1946 
// Mixin adding managed-memory-specific queries and controls to a generic
// region type (instantiated below with memory::region_t / const_region_t)
template <typename GenericRegion>
struct region_helper : public GenericRegion {
	using GenericRegion::GenericRegion;

	/// Is the read-mostly attribute currently set for this range?
	bool is_read_mostly() const
	{
		return range::detail_::get_scalar_attribute<bool>(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	/// Set the read-mostly attribute for this range
	void designate_read_mostly() const
	{
		range::detail_::set_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	/// Clear the read-mostly attribute for this range
	void undesignate_read_mostly() const
	{
		range::detail_::unset_attribute(*this, CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY);
	}

	// Defined out-of-line, as they require the full device_t definition
	device_t preferred_location() const;
	void set_preferred_location(device_t& device) const;
	void clear_preferred_location() const;
};
1970 
1971 } // namespace detail_
1972 
1974 using region_t = detail_::region_helper<memory::region_t>;
1976 using const_region_t = detail_::region_helper<memory::const_region_t>;
1977 
1979 void advise_expected_access_by(const_region_t region, device_t& device);
1980 
1983 
1985 template <typename Allocator = ::std::allocator<cuda::device_t> >
1986 ::std::vector<device_t, Allocator> expected_accessors(const_region_t region, const Allocator& allocator = Allocator() );
1987 
1989 enum class attachment_t : unsigned {
1990  global = CU_MEM_ATTACH_GLOBAL,
1991  host = CU_MEM_ATTACH_HOST,
1992  single_stream = CU_MEM_ATTACH_SINGLE,
1993  };
1994 
1995 namespace detail_ {
1996 
// Allocate a managed-memory region in the current context, attached either
// globally (visible to all devices) or to the host only, per initial_visibility.
// Throws on allocation failure; the returned region owns num_bytes bytes.
inline managed::region_t allocate_in_current_context(
	size_t num_bytes,
	initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
{
	device::address_t allocated = 0;
	auto flags = (initial_visibility == initial_visibility_t::to_all_devices) ?
		attachment_t::global : attachment_t::host;
	// This is necessary because managed allocation requires at least one (primary)
	// context to have been constructed. We could theoretically check what our current
	// context is etc., but that would be brittle, since someone can managed-allocate,
	// then change contexts, then de-allocate, and we can't be certain that whoever
	// called us will call free
	cuda::device::primary_context::detail_::increase_refcount(cuda::device::default_device_id);

	// Note: Despite the templating by T, the size is still in bytes,
	// not in number of T's
	auto status = cuMemAllocManaged(&allocated, num_bytes, static_cast<unsigned>(flags));
	if (is_success(status) && allocated == 0) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::unknown);
	}
	throw_if_error_lazy(status, "Failed allocating "
		+ ::std::to_string(num_bytes) + " bytes of managed CUDA memory");
	return {as_pointer(allocated), num_bytes};
}
2022 
// Free managed memory allocated via allocate_in_current_context(), also
// releasing the primary-context reference taken at allocation time
inline void free(void* ptr)
{
	auto result = cuMemFree(device::address(ptr));
	// Balances the increase_refcount() in allocate_in_current_context()
	cuda::device::primary_context::detail_::decrease_refcount(cuda::device::default_device_id);
	throw_if_error_lazy(result, "Freeing managed memory at " + cuda::detail_::ptr_as_hex(ptr));
}

// Region-based variant of the above
inline void free(managed::region_t region)
{
	free(region.start());
}
2040 
// Allocation functor for managed memory with a fixed initial visibility
template <initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
struct allocator {
	// Allocates in the current context!
	void* operator()(size_t num_bytes) const
	{
		return detail_::allocate_in_current_context(num_bytes, InitialVisibility).start();
	}
};

// Deletion functor matching the managed detail_::free()
struct deleter {
	void operator()(void* ptr) const { detail_::free(ptr); }
};
2053 
2055  context::handle_t context_handle,
2056  size_t num_bytes,
2057  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices)
2058 {
2059  CAW_SET_SCOPE_CONTEXT(context_handle);
2060  return allocate_in_current_context(num_bytes, initial_visibility);
2061 }
2062 
2063 } // namespace detail_
2064 
2078 inline region_t allocate(
2079  const context_t& context,
2080  size_t num_bytes,
2081  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2082 
2096 inline region_t allocate(
2097  const device_t& device,
2098  size_t num_bytes,
2099  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2100 
2110 region_t allocate(size_t num_bytes);
2111 
// NOTE(review): unlike detail_::free(), this public overload does not
// decrease the primary-context refcount - presumably it is intended for
// memory not allocated via the refcount-taking detail_ path; confirm.
inline void free(void* managed_ptr)
{
	auto result = cuMemFree(device::address(managed_ptr));
	throw_if_error_lazy(result,
		"Freeing managed memory (host and device regions) at address "
		+ cuda::detail_::ptr_as_hex(managed_ptr));
}

// Region-based variant of the above
inline void free(region_t region)
{
	free(region.start());
}
2130 
2131 namespace detail_ {
2132 
// Enqueue an asynchronous prefetch of a managed-memory region to the given
// location, on the stream designated by the raw handle. Before CUDA 13.0, the
// driver call only takes a device ID, so a host location is mapped onto the
// special CU_DEVICE_CPU pseudo-device, and other non-device locations are rejected.
inline void prefetch(
	const_region_t region,
	cuda::memory::location_t destination,
	stream::handle_t source_stream_handle)
{
	auto address = device::address(region.start());
#if CUDA_VERSION >= 13000
	static constexpr unsigned flags { 0 };
	auto result = cuMemPrefetchAsync(address, region.size(), destination, flags, source_stream_handle);
#else
	if (destination.type == CU_MEM_LOCATION_TYPE_HOST) {
		destination = { CU_MEM_LOCATION_TYPE_DEVICE, CU_DEVICE_CPU };
	}
	if (destination.type != CU_MEM_LOCATION_TYPE_DEVICE) {
		throw runtime_error(status::named_t::not_supported,
			"Prefetching to destination types other than CUDA devices is not supported before CUDA 13.0");
	}
	auto result = cuMemPrefetchAsync(address, region.size(), destination.id, source_stream_handle);
#endif
	throw_if_error_lazy(result,
		"Prefetching " + ::std::to_string(region.size()) + " bytes of managed memory at address "
		+ cuda::detail_::ptr_as_hex(region.start()) + " to " + cuda::memory::detail_::identify(destination));
}
2156 
2157 
// Convenience overload: prefetch with the destination given as a numeric device ID
inline void prefetch(
	const_region_t region,
	cuda::device::id_t destination,
	stream::handle_t source_stream_handle)
{
	prefetch(region, pool::detail_::create_mem_location(destination), source_stream_handle);
}
2165 
2166 } // namespace detail_
2167 
2173 void prefetch(
2174  const_region_t region,
2175  const cuda::device_t& destination,
2176  const stream_t& stream);
2177 
2182 void prefetch_to_host(
2183  const_region_t region,
2184  const stream_t& stream);
2185 
2186 } // namespace managed
2187 
2188 namespace mapped {
2189 
// Given a host-side pointer into memory mapped into the device address space,
// obtain the corresponding device-side pointer; throws if the driver cannot
// produce one (e.g. the memory was not mapped)
template <typename T>
inline T* device_side_pointer_for(T* host_memory_ptr)
{
	// The driver call takes a non-const pointer, so constness is stripped;
	// the pointed-to memory is not written through this pointer here
	auto unconsted_host_mem_ptr = const_cast<typename ::std::remove_const<T>::type *>(host_memory_ptr);
	device::address_t device_side_ptr;
	auto get_device_pointer_flags = 0u; // no flags are currently defined; see the CUDA driver documentation
	auto status = cuMemHostGetDevicePointer(
		&device_side_ptr,
		unconsted_host_mem_ptr,
		get_device_pointer_flags);
	throw_if_error_lazy(status,
		"Failed obtaining the device-side pointer for host-memory pointer "
		+ cuda::detail_::ptr_as_hex(host_memory_ptr) + " supposedly mapped to device memory");
	return as_pointer(device_side_ptr);
}
2212 
2219 {
2220  return { device_side_pointer_for(region.start()), region.size() };
2221 }
2222 
2225 {
2226  return { device_side_pointer_for(region.start()), region.size() };
2227 }
2228 
2229 namespace detail_ {
2230 
// Allocate a pair of mapped memory regions - one host-side, one device-side -
// in the current context, with the given host-allocation options; throws on
// failure. The device-side region is derived from the host-side one via the
// driver's host->device pointer mapping.
inline region_pair_t allocate_in_current_context(
	context::handle_t current_context_handle,
	size_t size_in_bytes,
	allocation_options options)
{
	region_pair_t allocated {};
	// The default initialization is unnecessary, but let's play it safe
	auto flags = cuda::memory::detail_::make_cuda_host_alloc_flags(options);
	void* allocated_ptr;
	auto status = cuMemHostAlloc(&allocated_ptr, size_in_bytes, flags);
	if (is_success(status) && (allocated_ptr == nullptr)) {
		// Can this even happen? hopefully not
		status = static_cast<status_t>(status::named_t::unknown);
	}
	throw_if_error_lazy(status,
		"Failed allocating a mapped pair of memory regions of size " + ::std::to_string(size_in_bytes)
		+ " bytes of global memory in " + context::detail_::identify(current_context_handle));
	allocated.host_side = { allocated_ptr, size_in_bytes };
	allocated.device_side = device_side_region_for(allocated.host_side);
	return allocated;
}
2261 
// Allocate a mapped host/device region pair within the designated context
// (temporarily made current for the allocation)
inline region_pair_t allocate(
	context::handle_t context_handle,
	size_t size_in_bytes,
	allocation_options options)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return detail_::allocate_in_current_context(context_handle, size_in_bytes, options);
}

// Free a mapped region pair, given the host-side address of the pair
inline void free(void* host_side_pair)
{
	auto result = cuMemFreeHost(host_side_pair);
	throw_if_error_lazy(result, "Freeing a mapped memory region pair with host-side address "
		+ cuda::detail_::ptr_as_hex(host_side_pair));
}
2277 
2278 } // namespace detail_
2279 
2290  const cuda::context_t& context,
2291  size_t size_in_bytes,
2292  allocation_options options);
2293 
2303  const cuda::device_t& device,
2304  size_t size_in_bytes,
2306 
2307 
/// Free a pair of mapped memory regions (identified by its host side)
inline void free(region_pair_t pair)
{
	detail_::free(pair.host_side.data());
}
2317 
/// Free a pair of mapped memory regions, given a (device-side) pointer into
/// one of them: the host-side address is recovered via the driver's pointer
/// attributes, then the pair is freed through it
inline void free_region_pair_of(void* ptr)
{
	// TODO: What if the pointer is not part of a mapped region pair?
	// We could check this...
	void* host_side_ptr;
	auto status = cuPointerGetAttribute (&host_side_ptr, CU_POINTER_ATTRIBUTE_HOST_POINTER, memory::device::address(ptr));
	throw_if_error_lazy(status, "Failed obtaining the host-side address of supposedly-device-side pointer "
		+ cuda::detail_::ptr_as_hex(ptr));
	detail_::free(host_side_ptr);
}
2334 
2346 inline bool is_part_of_a_region_pair(const void* ptr)
2347 {
2348  auto wrapped_ptr = pointer_t<const void> { ptr };
2349  return wrapped_ptr.other_side_of_region_pair().get() != nullptr;
2350 }
2351 
2352 } // namespace mapped
2353 
2354 namespace detail_ {
// Common workhorse for the typed make_unique_span() functions: allocates
// size * sizeof(T) bytes via the given region allocator, and wraps the result
// in a unique_span whose deleter applies RawDeleter to the raw pointer.
// T must be trivially constructible and copyable (checked at compile time).
template <typename T, typename RawDeleter, typename RegionAllocator>
unique_span<T> make_convenient_type_unique_span(size_t size, RegionAllocator allocator)
{
	memory::detail_::check_allocation_type<T>();
	// Adapt the raw-pointer deleter to unique_span's span-taking deleter shape
	auto deleter = [](span<T> sp) {
		return RawDeleter{}(sp.data());
	};
	region_t allocated_region = allocator(size * sizeof(T));
	return unique_span<T>(
		allocated_region.as_span<T>(), // no constructor calls - trivial construction
		deleter // no destructor calls - trivial destruction
	);
}
2382 
2383 } // namespace detail_
2384 
2385 
2386 namespace device {
2387 
2388 namespace detail_ {
2389 
// Allocate a typed unique_span of device memory within the designated context
// (temporarily made current)
template <typename T>
unique_span<T> make_unique_span(const context::handle_t context_handle, size_t size)
{
	// Lambda indirection so the allocator has exactly the one-argument shape
	// expected by make_convenient_type_unique_span
	auto allocate_in_current_context_ = [](size_t size) { return allocate_in_current_context(size); };
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocate_in_current_context_);
}
2397 
2398 } // namespace detail_
2399 
2417 template <typename T>
2418 unique_span<T> make_unique_span(const context_t& context, size_t size);
2419 
2425 template <typename T>
2426 unique_span<T> make_unique_span(const device_t& device, size_t size);
2427 
2434 template <typename T>
2435 unique_span<T> make_unique_span(size_t size);
2436 
2437 } // namespace device
2438 
/// See device::make_unique_span(const context_t& context, size_t size)
template <typename T>
inline unique_span<T> make_unique_span(const context_t& context, size_t size)
{
	return device::make_unique_span<T>(context, size);
}

/// See device::make_unique_span(const device_t& device, size_t size)
template <typename T>
inline unique_span<T> make_unique_span(const device_t& device, size_t size)
{
	return device::make_unique_span<T>(device, size);
}
2452 
2453 namespace host {
2454 
// Allocate a typed unique_span of pinned host memory
template <typename T>
unique_span<T> make_unique_span(size_t size)
{
	// Needed because allocate() takes more arguments, with defaults
	auto allocator = [](size_t size) { return allocate(size); };
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
}
2482 
2483 } // namespace host
2484 
2485 namespace managed {
2486 
2487 namespace detail_ {
2488 
// Allocate a typed unique_span of managed memory within the designated
// context (temporarily made current), with the given initial visibility
template <typename T, initial_visibility_t InitialVisibility = initial_visibility_t::to_all_devices>
unique_span<T> make_unique_span(
	const context::handle_t context_handle,
	size_t size)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	// Lambda indirection fixing the visibility template argument, so the
	// allocator has the one-argument shape expected downstream
	auto allocator = [](size_t size) {
		return allocate_in_current_context(size, InitialVisibility);
	};
	return memory::detail_::make_convenient_type_unique_span<T, detail_::deleter>(size, allocator);
}
2500 
2501 } // namespace detail_
2502 
2525 template <typename T>
2526 unique_span<T> make_unique_span(
2527  const context_t& context,
2528  size_t size,
2529  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2530 
2536 template <typename T>
2537 unique_span<T> make_unique_span(
2538  const device_t& device,
2539  size_t size,
2540  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2541 
2548 template <typename T>
2549 unique_span<T> make_unique_span(
2550  size_t size,
2551  initial_visibility_t initial_visibility = initial_visibility_t::to_all_devices);
2552 
2553 } // namespace managed
2554 
2555 } // namespace memory
2556 
2557 namespace symbol {
2558 
2566 template <typename T>
2568 {
2569  void *start;
2570  size_t symbol_size;
2571  auto api_call_result = cudaGetSymbolAddress(&start, ::std::forward<T>(symbol));
2572  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for a symbol");
2573  api_call_result = cudaGetSymbolSize(&symbol_size, ::std::forward<T>(symbol));
2574  throw_if_error_lazy(api_call_result, "Could not locate the device memory address for the symbol at address"
2575  + cuda::detail_::ptr_as_hex(start));
2576  return { start, symbol_size };
2577 }
2578 
2579 } // namespace symbol
2580 
2581 } // namespace cuda
2582 
2583 #endif // CUDA_API_WRAPPERS_MEMORY_HPP_
void register_(const_region_t region)
Register a memory region with the CUDA driver.
Definition: memory.hpp:1775
void free_region_pair_of(void *ptr)
Free a pair of mapped memory regions using just one of them.
Definition: memory.hpp:2324
Proxy class for a CUDA stream.
Definition: stream.hpp:258
endpoint_t
Type for choosing between endpoints of copy operations.
Definition: copy_parameters.hpp:19
void prefetch_to_host(const_region_t region, const stream_t &stream)
Prefetches a region of managed memory into host memory.
Definition: memory.hpp:244
cpu_write_combining write_combining
whether or not the GPU can batch multiple writes to this area and propagate them at its convenience...
Definition: memory.hpp:98
The cuda::memory::pool_t proxy class for memory pools, and related code for creating, manipulating and allocating using memory pools.
::std::vector< device_t, Allocator > expected_accessors(const_region_t region, const Allocator &allocator=Allocator())
Definition: memory.hpp:215
unique_span< T > make_unique_span(const context_t &context, size_t size)
See device::make_unique_span(const context_t& context, size_t size)
Definition: memory.hpp:2441
Wrapper class for a CUDA context.
Definition: context.hpp:249
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
If the CUDA runtime has not been set to a specific device, this is the ID of the device it defaults t...
Definition: constants.hpp:53
is_not_accessible_on_all_devices
Definition: memory.hpp:1646
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
this_type & clear_offsets() noexcept
Clear the offsets into both the source and the destination endpoint regions.
Definition: copy_parameters.hpp:284
Owning wrapper for CUDA 2D and 3D arrays.
Definition: array.hpp:29
void typed_set(T *start, const T &value, size_t num_elements, optional_ref< const stream_t > stream={})
Sets consecutive elements of a region of memory to a fixed value of some width.
Definition: memory.hpp:391
CUmemLocation location_t
Used in a limited number of API functions which can relate both to CUDA device memory and system memo...
Definition: types.hpp:555
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
::std::size_t size_bytes() const noexcept
Overall size in bytes of the elements of the array, over all dimensions.
Definition: array.hpp:255
portability_across_contexts portability
whether or not the allocated region can be used in different CUDA contexts.
Definition: memory.hpp:95
constexpr span_pair_t< T > as_spans() const
Definition: memory.hpp:167
void advise_expected_access_by(const_region_t region, device_t &device)
Advice the CUDA driver that device is expected to access region.
Definition: memory.hpp:204
void throw_if_error(status_t status, const ::std::string &message) noexcept(false)
Do nothing...
Definition: error.hpp:346
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
T * get() const
Definition: pointer.hpp:139
cpu_write_combining
A memory allocation setting: Should the allocated memory be configured as write-combined, i.e.
Definition: memory.hpp:84
memory::region_t host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:163
void set(void *start, int byte_value, size_t num_bytes)
Sets all bytes in a stretch of host-side memory to a single value.
Definition: memory.hpp:1813
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
The cuda::memory::copy_parameters_t class template and related definitions.
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
options accepted by CUDA&#39;s allocator of memory with a host-side aspect (host-only or managed memory)...
Definition: memory.hpp:93
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
::std::size_t size() const noexcept
Overall number of elements in the array, over all dimensions.
Definition: array.hpp:252
span< T > host_side
The two regions mapped to each other by the CUDA driver; they must be identical in size...
Definition: memory.hpp:145
pointer_t other_side_of_region_pair() const
Definition: pointer.hpp:208
A pair of memory regions, one in system (=host) memory and one on a CUDA device&#39;s memory - mapped to ...
Definition: memory.hpp:160
Contains a proxy class for CUDA arrays - GPU memory with 2-D or 3-D locality and hardware support for...
map_into_device_memory
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1634
void deregister(const_region_t region)
Have the CUDA driver "forget" about a region of memory which was previously registered with it...
Definition: memory.hpp:1795
accessibility_on_all_devices
Whether the allocated host-side memory should be recognized as pinned memory by all CUDA contexts...
Definition: memory.hpp:1644
A convenience wrapper around a raw pointer "known" to the CUDA runtime and which thus has various kin...
Definition: pointer.hpp:131
memory::region_t locate(T &&symbol)
Locates a CUDA symbol in global or constant device memory.
Definition: memory.hpp:2567
this_type & clear_offset(endpoint_t endpoint) noexcept
Set the copy operation to use the multi-dimensional region of the specified endpoint without skipping...
Definition: copy_parameters.hpp:278
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
region_pair_t allocate(const cuda::device_t &device, size_t size_in_bytes, allocation_options options=allocation_options{})
Allocate a memory region on the host, which is also mapped to a memory region in the global memory of...
Definition: memory.hpp:276
A builder-ish subclass template around the basic 2D or 3D copy parameters which CUDA&#39;s complex copyin...
Definition: copy_parameters.hpp:68
Wrappers for getting and setting CUDA&#39;s choice of which device is &#39;current&#39;.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
mapped_io_space
Whether or not the registration of the host-side pointer should map it into the CUDA address space fo...
Definition: memory.hpp:1623
void free(void *host_ptr)
Frees a region of pinned host memory which was allocated with one of the pinned host memory allocatio...
Definition: memory.hpp:1563
CUstream handle_t
The CUDA driver&#39;s raw handle for streams.
Definition: types.hpp:236
A wrapper class for host and/or device pointers, allowing easy access to CUDA&#39;s pointer attributes...
void * as_pointer(device::address_t address) noexcept
Definition: types.hpp:702
void set(void *ptr, int byte_value, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets a number of bytes in memory to a fixed value.
Definition: memory.hpp:418
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
A pair of memory spans, one in device-global memory and one in host/system memory, mapped to it.
Definition: memory.hpp:142
const_region_t device_side_region_for(const_region_t region)
Get the memory region mapped to a given host-side region.
Definition: memory.hpp:2224
void free(region_pair_t pair)
Free a pair of mapped memory regions.
Definition: memory.hpp:2313
void advise_no_access_expected_by(const_region_t region, device_t &device)
Advice the CUDA driver that device is not expected to access region.
Definition: memory.hpp:209
this_type & set_endpoint(endpoint_t endpoint, const cuda::array_t< T, NumDimensions > &array) noexcept
Set one of the copy endpoints to a CUDA array.
CUdeviceptr address_t
The numeric type which can represent the range of memory addresses on a CUDA device.
Definition: types.hpp:674
T * device_side_pointer_for(T *host_memory_ptr)
Obtain a pointer in the device-side memory space (= address range) given given a host-side pointer ma...
Definition: memory.hpp:2198
portability_across_contexts
A memory allocation setting: Can the allocated memory be used in other CUDA driver contexts (in addit...
Definition: memory.hpp:64
void copy_single(T *destination, const T *source, optional_ref< const stream_t > stream={})
Synchronously copies a single (typed) value between two memory locations.
Definition: memory.hpp:71
is_accessible_on_all_devices
Definition: memory.hpp:1645
void prefetch(const_region_t region, const cuda::device_t &destination, const stream_t &stream)
Prefetches a region of managed memory to a specific device, so it can later be used there without wai...
Definition: memory.hpp:236
region_t allocate(size_t size_in_bytes, allocation_options options)
Allocates pinned host memory.
Definition: memory.hpp:340
Wrapper class for a CUDA device.
Definition: device.hpp:135
void zero(region_t region, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:736
initial_visibility_t
The choices of which categories CUDA devices must a managed memory region be visible to...
Definition: types.hpp:755
constexpr bool is_success(status_t status)
Determine whether the API call returning the specified status had succeeded.
Definition: error.hpp:214
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
bool is_part_of_a_region_pair(const void *ptr)
Determine whether a given stretch of memory was allocated as part of a mapped pair of host and device...
Definition: memory.hpp:2346