cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
stream.hpp
Go to the documentation of this file.
1 
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_STREAM_HPP_
11 #define CUDA_API_WRAPPERS_STREAM_HPP_
12 
13 #include "current_context.hpp"
14 #include "current_device.hpp"
15 #include "error.hpp"
16 #include "kernel_launch.hpp"
17 #include "memory.hpp"
18 #include "miscellany.hpp"
19 #include "types.hpp"
20 
21 #if CUDA_VERSION >= 10000
23 #endif // CUDA_VERSION >= 10000
24 
25 #include <string>
26 #include <memory>
27 #include <utility>
28 #include <tuple>
29 #include <algorithm>
30 
31 namespace cuda {
32 
34 class device_t;
35 class event_t;
36 class stream_t;
38 
39 namespace memory {
40 
41 class pool_t;
42 
43 } // namespace memory
44 
45 namespace stream {
46 
// Aliases for the boolean "does the new stream synchronize with the default
// stream?" parameter of stream-creation functions (e.g. the second argument
// to create_on_current_device())
enum : bool {
	implicitly_synchronizes_with_default_stream = true,
	no_implicit_synchronization_with_default_stream = false,
	sync = implicitly_synchronizes_with_default_stream,
	async = no_implicit_synchronization_with_default_stream,
	blocking = sync,
	nonblocking = async,
};
56 
62 enum wait_condition_t : unsigned {
63  greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ,
64  geq = CU_STREAM_WAIT_VALUE_GEQ,
65 
66  equality = CU_STREAM_WAIT_VALUE_EQ,
67  equals = CU_STREAM_WAIT_VALUE_EQ,
68 
69  nonzero_after_applying_bitmask = CU_STREAM_WAIT_VALUE_AND,
70  one_bits_overlap = CU_STREAM_WAIT_VALUE_AND,
71  bitwise_and = CU_STREAM_WAIT_VALUE_AND,
72 
73  zero_bits_overlap = CU_STREAM_WAIT_VALUE_NOR,
74  bitwise_nor = CU_STREAM_WAIT_VALUE_NOR,
75 } ;
76 
77 
78 #if CUDA_VERSION >= 11000
79 
83 enum synchronization_policy_t : typename ::std::underlying_type<CUsynchronizationPolicy>::type {
87  automatic = CU_SYNC_POLICY_AUTO,
88 
97  spin = CU_SYNC_POLICY_SPIN,
98 
108  yield = CU_SYNC_POLICY_YIELD,
109 
116  block = CU_SYNC_POLICY_BLOCKING_SYNC
117 };
118 #endif // CUDA_VERSION >= 11000
119 
120 namespace detail_ {
121 
122 ::std::string identify(const stream_t& stream);
123 
124 inline handle_t create_raw_in_current_context(
125  bool synchronizes_with_default_stream,
127 )
128 {
129  const unsigned int flags = (synchronizes_with_default_stream == sync) ?
130  CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
131  handle_t new_stream_handle;
132  auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
133  throw_if_error_lazy(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
134  return new_stream_handle;
135 }
136 
// Destroy the given stream within the given context, returning the raw driver
// status instead of throwing - suitable for use from destructors
inline status_t destroy_nothrow(handle_t handle, context::handle_t context_handle)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return cuStreamDestroy(handle);
}
142 
// Destroy the given stream, throwing a cuda::runtime_error on failure; the
// device id is only used for composing the error message
inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id)
{
	auto status = destroy_nothrow(handle, context_handle);
	throw_if_error_lazy(status, "Failed destroying " + identify(handle, context_handle, device_id));
}
148 
149 #if CUDA_VERSION >= 9020
150 inline context::handle_t context_handle_of(stream::handle_t stream_handle)
151 {
152  context::handle_t handle;
153  auto result = cuStreamGetCtx(stream_handle, &handle);
154  throw_if_error_lazy(result, "Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
155  return handle;
156 }
157 #endif // CUDA_VERSION >= 9020
158 
159 
169 inline device::id_t device_id_of(stream::handle_t stream_handle);
170 
171 inline void record_event_in_current_context(
172  device::id_t current_device_id,
173  context::handle_t current_context_handle_,
174  stream::handle_t stream_handle,
175  event::handle_t event_handle);
176 
177 template <typename Function>
178 void enqueue_function_call(const stream_t& stream, Function function, void * argument);
179 
180 } // namespace detail_
181 
203 stream_t wrap(
204  device::id_t device_id,
205  context::handle_t context_handle,
206  handle_t stream_handle,
207  bool take_ownership = false,
208  bool hold_pc_refcount_unit = false) noexcept;
209 
210 namespace detail_ {
211 
212 // Providing the same signature to multiple CUDA driver calls, to allow
213 // uniform templated use of all of them
214 template<typename T>
215 CUresult wait_on_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);
216 
217 // Providing the same signature to multiple CUDA driver calls, to allow
218 // uniform templated use of all of them
219 template<typename T>
220 CUresult write_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);
221 
222 } // namespace detail_
223 
224 #if CUDA_VERSION >= 10000
225 namespace capture {
226 
227 inline state_t state(const stream_t& stream);
228 
235 void begin(const cuda::stream_t& stream, stream::capture::mode_t mode = cuda::stream::capture::mode_t::global);
236 graph::template_t end(const cuda::stream_t& stream);
237 
238 } // namespace capture
239 
/// Determine whether the given stream is currently in the midst of capturing
/// scheduled work (into a graph)
inline bool is_capturing(const stream_t& stream)
{
	return is_capturing(stream::capture::state(stream));
}
244 
245 #endif // CUDA_VERSION >= 10000
246 } // namespace stream
247 
248 inline void synchronize(const stream_t& stream);
249 
258 class stream_t {
259 
260 public: // type definitions
261 
	/// Named values for the "synchronizes with the default stream" boolean
	/// creation parameter
	enum : bool {
	doesnt_synchronizes_with_default_stream = false,
	does_synchronize_with_default_stream = true,
	};
266 
267 public: // const getters
269  stream::handle_t handle() const noexcept { return handle_; }
270 
272  context::handle_t context_handle() const noexcept { return context_handle_; }
273 
275  device::id_t device_id() const noexcept { return device_id_; }
276 
278  device_t device() const noexcept;
279 
281  context_t context() const noexcept;
282 
284  bool is_owning() const noexcept { return owning_; }
285 
286 public: // other non-mutators
287 
294  {
295  unsigned int flags;
296  auto status = cuStreamGetFlags(handle_, &flags);
297  // Could have used the equivalent Driver API call,
298  // cuStreamGetFlags(handle_, &flags);
299  throw_if_error_lazy(status, "Failed obtaining flags for a stream in "
300  + context::detail_::identify(context_handle_, device_id_));
301  return flags & CU_STREAM_NON_BLOCKING;
302  }
303 
307  {
308  int the_priority;
309  auto status = cuStreamGetPriority(handle_, &the_priority);
310  // Could have used the equivalent Runtime API call:
311  // cuStreamGetPriority(handle_, &the_priority);
312  throw_if_error_lazy(status, "Failed obtaining priority for a stream in "
313  + context::detail_::identify(context_handle_, device_id_));
314  return the_priority;
315  }
316 
	/// Determines whether all work previously scheduled on this stream has
	/// concluded: returns false when the stream's queue has been fully drained,
	/// true when work is still pending or executing; any driver status other
	/// than success / not-ready is turned into an exception.
	bool has_work_remaining() const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		auto status = cuStreamQuery(handle_);
		// Could have used the equivalent runtime API call:
		// cudaStreamQuery(handle_);
		switch(status) {
		case CUDA_SUCCESS:
			return false;
		case CUDA_ERROR_NOT_READY:
			return true;
		default:
			throw cuda::runtime_error(static_cast<cuda::status::named_t>(status),
				"unexpected stream status for " + stream::detail_::identify(handle_, device_id_));
		}
	}
343 
350  bool is_clear() const { return !has_work_remaining(); }
351 
356  bool query() const { return is_clear(); }
357 
358 public: // mutators
359 
367  class enqueue_t {
368  protected:
369  const stream_t& associated_stream;
370 
371  public:
373  enqueue_t(const stream_t& stream) : associated_stream(stream) {}
375 
387  template<typename KernelFunction, typename... KernelParameters>
389  const KernelFunction& kernel_function,
390  launch_configuration_t launch_configuration,
391  KernelParameters &&... parameters) const
392  {
393  return cuda::enqueue_launch(
394  kernel_function,
395  associated_stream,
396  launch_configuration,
397  ::std::forward<KernelParameters>(parameters)...);
398  }
399 
414  const kernel_t& kernel,
415  launch_configuration_t launch_configuration,
416  span<const void*> marshalled_arguments) const
417  {
418  cuda::launch_type_erased(kernel, associated_stream, launch_configuration, marshalled_arguments);
419  }
420 
421 #if CUDA_VERSION >= 10000
422 
429  void graph_launch(const graph::instance_t& graph_instance) const;
430 #endif // CUDA_VERSION >= 10000
431 
	/// Schedule a copy of @p num_bytes bytes from @p source to @p destination on
	/// this stream; either pointer may refer to host or device memory.
	void copy(void *destination, const void *source, size_t num_bytes) const
	{
		// CUDA doesn't seem to need us to be in the stream's context to enqueue the copy;
		// however, unfortunately, it does require us to be in _some_ context.
		context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
		memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
	}
448 
	/// Schedule a copy of @p num_bytes from region @p source to @p destination;
	/// in debug builds, verifies the source region is large enough
	void copy(void* destination, memory::const_region_t source, size_t num_bytes) const
	{
#ifndef NDEBUG
		if (source.size() < num_bytes) {
			throw ::std::logic_error("Attempt to copy more than the source region's size");
		}
#endif
		copy(destination, source.start(), num_bytes);
	}
459 
465  void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
466  {
467  copy(destination.start(), source, num_bytes);
468  }
469 
	/// Schedule a copy of the entire @p source region into @p destination
	void copy(memory::region_t destination, memory::const_region_t source) const
	{
		copy(destination, source, source.size());
	}

	/// Schedule a copy of the entire @p source region to the memory starting
	/// at @p destination
	void copy(void* destination, memory::const_region_t source) const
	{
		copy(destination, source, source.size());
	}
481 
483 
	/// Schedule setting each of the @p num_bytes bytes starting at @p start to
	/// @p byte_value, on this stream (via the device-memory set facility)
	void memset(void *start, int byte_value, size_t num_bytes) const
	{
		// Is it necessary to set the device? I wonder.
		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
		memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
	}

	/// Schedule setting every byte of @p region to @p byte_value, on this stream
	void memset(memory::region_t region, int byte_value) const
	{
		memset(region.data(), byte_value, region.size());
	}

	/// Schedule zeroing-out the @p num_bytes bytes starting at @p start,
	/// on this stream
	void memzero(void *start, size_t num_bytes) const
	{
		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
		memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
	}

	/// Schedule zeroing-out all bytes of @p region, on this stream
	void memzero(memory::region_t region) const
	{
		memzero(region.data(), region.size());
	}
529 
542  event_t& event(event_t& existing_event) const;
543 
556  event_t event(
557  bool uses_blocking_sync = event::sync_by_busy_waiting,
558  bool records_timing = event::do_record_timings,
560 
561 # if CUDA_VERSION >= 10000
562 
	/// Schedule the invocation, on the host, of @p function with @p argument,
	/// once all previously-scheduled work on this stream has concluded
	/// (requires CUDA 10.0+)
	template <typename Argument>
	void host_function_call(void (*function)(Argument*), Argument* argument) const
	{
		// I hope you like function declaration punning :-)
		// (the typed function pointer is reinterpreted as the driver's
		// void(void*) callback signature)
		stream::detail_::enqueue_function_call(
			associated_stream, reinterpret_cast<stream::callback_t>(function), argument);
	}
#endif

	private:
	// Static trampoline with the driver's host-function signature: recovers the
	// Invokable from the type-erased pointer and invokes it
	template <typename Invokable>
	static void CUDA_CB stream_launched_invoker(void* type_erased_invokable) {
		auto invokable = reinterpret_cast<Invokable*>(type_erased_invokable);
		(*invokable)();
	}

	public:
	/// Schedule the host-side invocation of @p invokable once all
	/// previously-scheduled work on this stream has concluded.
	/// @note the invokable is passed by address, so it must remain alive until
	/// the scheduled invocation actually runs
	template <typename Invokable>
	void host_invokable(Invokable& invokable) const
	{
		auto type_erased_invoker = reinterpret_cast<stream::callback_t>(stream_launched_invoker<Invokable>);
		stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
	}
593 
594 #if CUDA_VERSION >= 11020
595 
	/// Schedule an allocation of device-side memory on this stream
	/// (requires CUDA 11.2+, stream-ordered memory allocation)
	///
	/// @returns the region which will have been allocated once the scheduled
	/// operation executes
	memory::region_t allocate(size_t num_bytes) const
	{
		return memory::device::allocate(num_bytes, associated_stream);
	}

	/// Schedule an allocation of @p num_bytes from the given memory pool,
	/// on this stream
	memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes) const;

	/// Schedule freeing the memory starting at @p region_start, after all
	/// previously-scheduled work on this stream
	void free(void* region_start) const
	{
		memory::device::free(region_start, associated_stream);
	}

	/// Schedule freeing the memory of @p region; see free(void*)
	void free(memory::region_t region) const
	{
		memory::device::free(region, associated_stream);
	}
629 #endif // CUDA_VERSION >= 11020
630 
652  const void* managed_region_start,
653  memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
654  {
655  CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
656  // This fixed value is required by the CUDA Runtime API,
657  // to indicate that the entire memory region, rather than a part of it, will be
658  // attached to this stream
659  constexpr const size_t length = 0;
660  auto flags = static_cast<unsigned>(attachment);
661  auto status = cuStreamAttachMemAsync(
662  associated_stream.handle_, memory::device::address(managed_region_start), length, flags);
663  // Could have used the equivalent Driver API call cuStreamAttachMemAsync
664  throw_if_error_lazy(status, "Failed scheduling an attachment of a managed memory region on "
665  + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
666  associated_stream.device_id_));
667  }
668 
689  memory::region_t region,
690  memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
691  {
692  attach_managed_region(region.start(), attachment);
693  }
694 
707  void wait(const event_t& event_) const;
708 
720  template <typename T>
721  void set_single_value(T* __restrict__ ptr, T value, bool with_memory_barrier = true) const
722  {
723  static_assert(
724  ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
725  "Unsupported type for stream value wait."
726  );
727  unsigned flags = with_memory_barrier ?
728  CU_STREAM_WRITE_VALUE_DEFAULT :
729  CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
730  auto result = static_cast<status_t>(
731  stream::detail_::write_value(associated_stream.handle_, memory::device::address(ptr), value, flags));
732  throw_if_error_lazy(result, "Failed scheduling a write to global memory on "
733  + stream::detail_::identify(associated_stream.handle_,associated_stream.context_handle_,
734  + associated_stream.device_id_));
735  }
736 
751  template <typename T>
752  void wait(const T* address, stream::wait_condition_t condition, T value, bool with_memory_barrier = false) const
753  {
754  static_assert(
755  ::std::is_same<T,int32_t>::value or ::std::is_same<T,int64_t>::value,
756  "Unsupported type for stream value wait."
757  );
758  unsigned flags = static_cast<unsigned>(condition) |
759  (with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
760  auto result = static_cast<status_t>(
761  stream::detail_::wait_on_value(associated_stream.handle_, address, value, flags));
762  throw_if_error_lazy(result,
763  "Failed scheduling a wait on global memory address on "
764  + stream::detail_::identify(
765  associated_stream.handle_,
766  associated_stream.context_handle_,
767  associated_stream.device_id_) );
768  }
769 
	/// Enqueue a flush of outstanding remote writes, as a single-operation batch
	/// (CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES); see the CUDA driver documentation
	/// of cuStreamBatchMemOp for the exact semantics
	void flush_remote_writes() const
	{
		CUstreamBatchMemOpParams op_params;
		op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
		op_params.flushRemoteWrites.flags = 0;
		static const unsigned count = 1;
		static const unsigned flags = 0;
		// Let's cross our fingers and assume nothing else needs to be set here...
		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
		throw_if_error_lazy(status, "scheduling a flush-remote-writes memory operation as a 1-op batch");
	}
785 
#if CUDA_VERSION >= 11070
	/// Enqueue a memory barrier (CU_STREAM_MEM_OP_BARRIER) with the given scope,
	/// as a single-operation batch; requires CUDA 11.7+
	void memory_barrier(memory::barrier_scope_t scope) const
	{
		CUstreamBatchMemOpParams op_params;
		op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
		op_params.memoryBarrier.flags = static_cast<unsigned>(scope);
		static const unsigned count = 1;
		static const unsigned flags = 0;
		// Let's cross our fingers and assume nothing else needs to be set here...
		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
		throw_if_error_lazy(status, "scheduling a memory barrier operation as a 1-op batch");
	}
#endif
799 
813  template <typename Iterator>
814  void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
815  {
816  static_assert(
817  ::std::is_same<typename ::std::iterator_traits<Iterator>::value_type, CUstreamBatchMemOpParams>::value,
818  "Only accepting iterator pairs for the CUDA-driver-API memory operation descriptor,"
819  " CUstreamBatchMemOpParams, as the value type");
820  auto num_ops = ::std::distance(ops_begin, ops_end);
821  if (::std::is_same<typename ::std::remove_const<decltype(ops_begin)>::type, CUstreamBatchMemOpParams* >::value,
822  "Only accepting containers of the CUDA-driver-API memory operation descriptor, CUstreamBatchMemOpParams")
823  {
824  auto ops_ptr = reinterpret_cast<const CUstreamBatchMemOpParams*>(ops_begin);
825  cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_ptr);
826  }
827  else {
828  auto ops_uptr = ::std::unique_ptr<CUstreamBatchMemOpParams[]>(new CUstreamBatchMemOpParams[num_ops]);
829  ::std::copy(ops_begin, ops_end, ops_uptr.get());
830  cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_uptr.get());
831  }
832  }
833 
	/// Enqueue a container-full of single-value memory operations in one batch;
	/// see the iterator-pair overload for details
	template <typename Container>
	void single_value_operations_batch(const Container& single_value_ops) const
	{
		return single_value_operations_batch(single_value_ops.begin(), single_value_ops.end());
	}
842 
843  }; // class enqueue_t
844 
	/// Block or busy-wait until all previously-scheduled work on this stream
	/// has been completed (delegates to the free-standing cuda::synchronize())
	void synchronize() const
	{
		cuda::synchronize(*this);
	}
853 
854 #if CUDA_VERSION >= 11000
	/// Obtain the synchronization policy attribute of this stream, i.e. how
	/// host threads are to wait for its work to conclude; requires CUDA 11.0+
	stream::synchronization_policy_t synchronization_policy() const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		CUstreamAttrValue wrapped_result{};
		auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
		throw_if_error_lazy(status, ::std::string("Obtaining the synchronization policy of ") + stream::detail_::identify(*this));
		return static_cast<stream::synchronization_policy_t>(wrapped_result.syncPolicy);
	}

	/// Set the synchronization policy attribute of this stream; requires CUDA 11.0+
	void set_synchronization_policy(stream::synchronization_policy_t policy) const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		CUstreamAttrValue wrapped_value{};
		wrapped_value.syncPolicy = static_cast<CUsynchronizationPolicy>(policy);
		auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
		throw_if_error_lazy(status, ::std::string("Setting the synchronization policy of ") + stream::detail_::identify(*this));
	}
872 #endif
873 
874  // TODO: Create a dummy capture object, then we could have capture.start(), capture.stop(), capture.status(),
875  // and perhaps a capture_() which takes a lambda. Also offer a
876  // cuda::stream::capture(const stream_t& stream, F f) template!
877 
878 
879 #if CUDA_VERSION >= 10000
880 
	/// Begin capturing work scheduled on this stream into a CUDA graph, rather
	/// than executing it; requires CUDA 10.0+
	void begin_capture(stream::capture::mode_t mode = cuda::stream::capture::mode_t::global) const
	{
		stream::capture::begin(*this, mode);
	}

	/// Determine whether this stream is currently capturing scheduled work
	bool is_capturing() const { return stream::is_capturing(*this); }

	/// Conclude an ongoing capture on this stream
	///
	/// @returns the graph template constructed from the captured work
	graph::template_t end_capture() const
	{
		return stream::capture::end(*this);
	}
905 #endif // CUDA_VERSION >= 10000
906 
907 protected: // constructor
908 
	// Primary constructor; protected - wrapper instances are obtained via
	// stream::wrap() or the stream::create() functions. When take_ownership is
	// true, the destructor will destroy the raw stream; when
	// hold_primary_context_refcount_unit is true, the destructor will release
	// one primary-context refcount unit for the device.
	stream_t(
		device::id_t device_id,
		context::handle_t context_handle,
		stream::handle_t stream_handle,
		bool take_ownership = false,
		bool hold_primary_context_refcount_unit = false) noexcept
	:
		device_id_(device_id),
		context_handle_(context_handle),
		handle_(stream_handle),
		owning_(take_ownership),
		holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
	{ }
922 
923 public: // constructors and destructor
924 
925  // Streams cannot be copied, despite our allowing non-owning class instances.
926  // The reason is that we might inadvertently copy of an owning stream, creating
927  // a non-owning stream and letting the original owning stream go out of scope -
928  // thus destructing the object, and destroying the underlying CUDA object.
929  // Essentially, that is like passing a reference to a local variable - which we
930  // may not do.
931  stream_t(const stream_t& other) = delete;
932 
	/// Move constructor: takes over ownership (and any primary-context refcount
	/// responsibility) from @p other, leaving it non-owning
	stream_t(stream_t&& other) noexcept :
		stream_t(other.device_id_, other.context_handle_, other.handle_, other.owning_, other.holds_pc_refcount_unit_)
	{
		other.owning_ = false;
		other.holds_pc_refcount_unit_ = false;
	}
939 
	/// Destructor: destroys the wrapped raw stream if this instance owns it, and
	/// releases a primary-context refcount unit if this instance holds one
	~stream_t() DESTRUCTOR_EXCEPTION_SPEC
	{
		if (owning_) {
#if THROW_IN_DESTRUCTORS
			stream::detail_::destroy(handle_, context_handle_, device_id_);
#else
			// destruction failures are swallowed: throwing is disallowed here
			stream::detail_::destroy_nothrow(handle_, context_handle_);
#endif
		}
		if (holds_pc_refcount_unit_) {
			device::primary_context::detail_::decrease_refcount_in_dtor(device_id_);
		}
	}
953 
954 public: // operators
955 
956  stream_t& operator=(const stream_t& other) = delete;
957  stream_t& operator=(stream_t&& other) noexcept
958  {
959  ::std::swap(device_id_, other.device_id_);
960  ::std::swap(context_handle_, other.context_handle_);
961  ::std::swap(handle_, other.handle_);
962  ::std::swap(owning_, other.owning_);
963  ::std::swap(holds_pc_refcount_unit_, holds_pc_refcount_unit_);
964  return *this;
965  }
966 
967 public: // friendship
968 
969  friend stream_t stream::wrap(
970  device::id_t device_id,
971  context::handle_t context_handle,
972  stream::handle_t stream_handle,
973  bool take_ownership,
974  bool hold_pc_refcount_unit) noexcept;
975 
	/// True when both proxies refer to the same raw stream handle within the
	/// same context; the device ids are additionally compared in debug builds
	/// only (they are determined by the context, so this is a sanity check)
	friend inline bool operator==(const stream_t& lhs, const stream_t& rhs) noexcept
	{
		return
			lhs.context_handle_ == rhs.context_handle_
#ifndef NDEBUG
			and lhs.device_id_ == rhs.device_id_
#endif
			and lhs.handle_ == rhs.handle_;
	}
990 
991 protected: // data members
992  device::id_t device_id_;
993  context::handle_t context_handle_;
994  stream::handle_t handle_;
995  bool owning_;
996  bool holds_pc_refcount_unit_;
997  // When context_handle_ is the handle of a primary context, this event may
998  // be "keeping that context alive" through the refcount - in which case
999  // it must release its refcount unit on destruction
1000 
1001 public: // data members - which only exist in lieu of namespaces
1002 
1005  const enqueue_t enqueue { *this };
1006  // The use of *this here is safe, since enqueue_t doesn't do anything with it
1007  // on its own. Any use of enqueue only happens through, well, *this - and
1008  // after construction.
1009 };
1010 
1012 inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept
1013 {
1014  return not (lhs == rhs);
1015 }
1017 
1018 namespace stream {
1019 
1021  device::id_t device_id,
1022  context::handle_t context_handle,
1023  stream::handle_t stream_handle,
1024  bool take_ownership,
1025  bool hold_pc_refcount_unit) noexcept
1026 {
1027  return { device_id, context_handle, stream_handle, take_ownership, hold_pc_refcount_unit };
1028 }
1029 
1030 namespace detail_ {
1031 
1032 inline stream_t create(
1033  device::id_t device_id,
1034  context::handle_t context_handle,
1035  bool synchronizes_with_default_stream,
1037  bool hold_pc_refcount_unit = false)
1038 {
1039  CAW_SET_SCOPE_CONTEXT(context_handle);
1040  auto new_stream_handle = cuda::stream::detail_::create_raw_in_current_context(
1041  synchronizes_with_default_stream, priority);
1042  return wrap(device_id, context_handle, new_stream_handle, do_take_ownership, hold_pc_refcount_unit);
1043 }
1044 
// Specializations of the uniform-signature declarations above, routing to the
// size-appropriate CUDA driver calls

template<>
inline CUresult wait_on_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
	return cuStreamWaitValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult wait_on_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
	return cuStreamWaitValue64(stream_handle, address, value, flags);
}


template<>
inline CUresult write_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
	return cuStreamWriteValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
	return cuStreamWriteValue64(stream_handle, address, value, flags);
}
1069 
// Schedule the invocation of a host-side function on the given stream, after
// all previously-scheduled work on it has concluded; `function` must have the
// driver's host-callback signature, and is passed `argument` when invoked
template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void* argument)
{
	CAW_SET_SCOPE_CONTEXT(stream.context_handle());

	// Pre-CUDA-10 we must use the older stream-callback mechanism rather than
	// the host-function launch mechanism.

#if CUDA_VERSION >= 10000
	auto status = cuLaunchHostFunc(stream.handle(), function, argument);
	// Could have used the equivalent Runtime API call: cudaLaunchHostFunc()
#else
	// The nVIDIA runtime API (at least up to v10.2) requires passing 0 as the flags
	// variable, see:
	// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
	static constexpr const unsigned fixed_flags { 0u };
	auto status = cuStreamAddCallback(stream.handle(), function, argument, fixed_flags);
#endif
	throw_if_error_lazy(status, "Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
}
1090 
1091 } // namespace detail_
1092 
1106 stream_t create(
1107  const device_t& device,
1108  bool synchronizes_with_default_stream,
1110 
1126 stream_t create(
1127  const context_t& context,
1128  bool synchronizes_with_default_stream,
1130  bool hold_pc_refcount_unit = false);
1132 
1133 #if CUDA_VERSION >= 10000
1134 namespace capture {
1135 
1136 inline state_t state(const stream_t& stream)
1137 {
1138  context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
1139  CUstreamCaptureStatus capture_status;
1140  auto op_status = cuStreamIsCapturing(stream.handle(), &capture_status);
1141  throw_if_error_lazy(op_status, "Failed beginning to capture on " + stream::detail_::identify(stream));
1142  return static_cast<state_t>(capture_status);
1143 }
1144 
/// Start capturing work scheduled on @p stream into a graph, using the
/// specified capture mode (wraps cuStreamBeginCapture)
inline void begin(const cuda::stream_t& stream, stream::capture::mode_t mode)
{
	context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
	auto status = cuStreamBeginCapture(stream.handle(), static_cast<CUstreamCaptureMode>(mode));
	throw_if_error_lazy(status, "Failed beginning to capture on " + stream::detail_::identify(stream));
}
1151 
1152 } // namespace capture
1153 #endif // CUDA_VERSION >= 10000
1154 
1155 } // namespace stream
1156 
/// Block the calling host thread until all previously-scheduled work on
/// @p stream has concluded; throws on failure
inline void synchronize(const stream_t& stream)
{
	// Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
	// and not have trouble acting on a stream in another context - it balks at doing so under
	// certain conditions, so we must place ourselves in the stream's context.
	CAW_SET_SCOPE_CONTEXT(stream.context_handle());
	auto status = cuStreamSynchronize(stream.handle());
	throw_if_error_lazy(status, "Failed synchronizing " + stream::detail_::identify(stream));
}
1175 
1176 #if CUDA_VERSION >= 11000
1177 
1188 void copy_attributes(const stream_t& dest, const stream_t& src);
1189 #endif // CUDA_VERSION >= 11000
1190 
1191 } // namespace cuda
1192 
1193 #endif // CUDA_API_WRAPPERS_STREAM_HPP_
void memset(void *start, int byte_value, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:492
bool has_work_remaining() const
Determines whether all work on this stream has been completed.
Definition: stream.hpp:327
void copy(void *destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:450
bool query() const
An alias for is_clear() - to conform to how the CUDA runtime API names this functionality.
Definition: stream.hpp:356
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:272
Proxy class for a CUDA stream.
Definition: stream.hpp:258
void memzero(memory::region_t region) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:525
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:269
Wrapper class for a CUDA context.
Definition: context.hpp:249
void synchronize() const
Block or busy-wait until all previously-scheduled work on this stream has been completed.
Definition: stream.hpp:849
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:243
device::id_t count()
Get the number of CUDA devices usable on the system (with the current CUDA library and kernel driver)...
Definition: miscellany.hpp:63
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
void single_value_operations_batch(const Container &single_value_ops) const
Definition: stream.hpp:838
friend bool operator==(const stream_t &lhs, const stream_t &rhs) noexcept
Definition: stream.hpp:981
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
void copy(void *destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:477
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
Wrapper class for a CUDA event.
Definition: event.hpp:147
A gadget through which commands are enqueued on the stream.
Definition: stream.hpp:367
region_t allocate(const context_t &context, size_t size_in_bytes)
Allocate device-side memory on a CUDA device context.
Definition: memory.hpp:102
void wait(const T *address, stream::wait_condition_t condition, T value, bool with_memory_barrier=false) const
Wait for a value in device global memory to change so as to meet some condition.
Definition: stream.hpp:752
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
void wait(const event_t &event)
Have the calling thread wait - either busy-waiting or blocking - and return only after this event has...
Definition: event.hpp:457
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:214
void attach_managed_region(const void *managed_region_start, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:651
bool is_clear() const
The opposite of has_work_remaining()
Definition: stream.hpp:350
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
void kernel_launch(const KernelFunction &kernel_function, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:388
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:980
wait_condition_t
Kinds of conditions to apply to a value in GPU global memory when waiting on that value...
Definition: stream.hpp:62
CUstreamCallback callback_t
The CUDA driver's raw handle for a host-side callback function.
Definition: types.hpp:254
void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead o...
Definition: stream.hpp:814
bool synchronizes_with_default_stream() const
When true, work running in the created stream may run concurrently with work in stream 0 (the NULL st...
Definition: stream.hpp:293
stream::priority_t priority() const
Definition: stream.hpp:306
stream_t wrap(device::id_t device_id, context::handle_t context_handle, handle_t stream_handle, bool take_ownership=false, bool hold_pc_refcount_unit=false) noexcept
Wrap an existing stream in a stream_t instance.
Definition: stream.hpp:1020
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
void copy(memory::region_t destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:471
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Wrappers for getting and setting CUDA's choice of which device is 'current'.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
void memset(memory::region_t region, int byte_value) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:500
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:413
Miscellaneous functionality which does not fit in another file, and does not depend on the main proxy...
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:246
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
void flush_remote_writes() const
Guarantee all remote writes to the specified address are visible to subsequent operations scheduled o...
Definition: stream.hpp:774
void attach_managed_region(memory::region_t region, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:688
Can only be used by the process which created it.
Definition: constants.hpp:95
void type_erased_kernel_launch(const kernel_t &kernel, launch_configuration_t launch_configuration, span< const void *> marshalled_arguments) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:413
void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:465
void set_single_value(T *__restrict__ ptr, T value, bool with_memory_barrier=true) const
Schedule writing a single value to global device memory after all previous work has concluded...
Definition: stream.hpp:721
Wrapper class for a CUDA device.
Definition: device.hpp:135
void memzero(void *start, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:516
Fundamental CUDA-related type definitions.
void host_invokable(Invokable &invokable) const
Enqueues a host-invokable object, typically a function or closure object call.
Definition: stream.hpp:588
freestanding wrapper functions for working with CUDA&#39;s various kinds of memory spaces, arranged into a relevant namespace hierarchy.
bool is_owning() const noexcept
True if this wrapper is responsible for telling CUDA to destroy the stream upon the wrapper&#39;s own des...
Definition: stream.hpp:284
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:275
void zero(void *start, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:418
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...