10 #ifndef CUDA_API_WRAPPERS_STREAM_HPP_ 11 #define CUDA_API_WRAPPERS_STREAM_HPP_ 21 #if CUDA_VERSION >= 10000 23 #endif // CUDA_VERSION >= 10000 49 implicitly_synchronizes_with_default_stream =
true,
50 no_implicit_synchronization_with_default_stream =
false,
51 sync = implicitly_synchronizes_with_default_stream,
52 async = no_implicit_synchronization_with_default_stream,
63 greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ,
64 geq = CU_STREAM_WAIT_VALUE_GEQ,
66 equality = CU_STREAM_WAIT_VALUE_EQ,
67 equals = CU_STREAM_WAIT_VALUE_EQ,
69 nonzero_after_applying_bitmask = CU_STREAM_WAIT_VALUE_AND,
70 one_bits_overlap = CU_STREAM_WAIT_VALUE_AND,
71 bitwise_and = CU_STREAM_WAIT_VALUE_AND,
73 zero_bits_overlap = CU_STREAM_WAIT_VALUE_NOR,
74 bitwise_nor = CU_STREAM_WAIT_VALUE_NOR,
78 #if CUDA_VERSION >= 11000 83 enum synchronization_policy_t : typename ::std::underlying_type<CUsynchronizationPolicy>::type {
87 automatic = CU_SYNC_POLICY_AUTO,
97 spin = CU_SYNC_POLICY_SPIN,
108 yield = CU_SYNC_POLICY_YIELD,
116 block = CU_SYNC_POLICY_BLOCKING_SYNC
118 #endif // CUDA_VERSION >= 11000 122 ::std::string identify(
const stream_t& stream);
124 inline handle_t create_raw_in_current_context(
125 bool synchronizes_with_default_stream,
129 const unsigned int flags = (synchronizes_with_default_stream == sync) ?
130 CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
132 auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
133 throw_if_error_lazy(status,
"Failed creating a new stream in " + detail_::identify(new_stream_handle));
134 return new_stream_handle;
139 CAW_SET_SCOPE_CONTEXT(context_handle);
140 return cuStreamDestroy(handle);
145 auto status = destroy_nothrow(handle, context_handle);
146 throw_if_error_lazy(status,
"Failed destroying " + identify(handle, context_handle, device_id));
149 #if CUDA_VERSION >= 9020 153 auto result = cuStreamGetCtx(stream_handle, &handle);
154 throw_if_error_lazy(result,
"Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
157 #endif // CUDA_VERSION >= 9020 171 inline void record_event_in_current_context(
177 template <
typename Function>
178 void enqueue_function_call(
const stream_t& stream, Function
function,
void * argument);
207 bool take_ownership =
false,
208 bool hold_pc_refcount_unit =
false) noexcept;
215 CUresult wait_on_value(CUstream stream_handle, CUdeviceptr
address, T value,
unsigned int flags);
220 CUresult write_value(CUstream stream_handle, CUdeviceptr address, T value,
unsigned int flags);
224 #if CUDA_VERSION >= 10000 227 inline state_t state(
const stream_t& stream);
235 void begin(
const cuda::stream_t& stream, stream::capture::mode_t mode = cuda::stream::capture::mode_t::global);
240 inline bool is_capturing(
const stream_t& stream)
242 return is_capturing(stream::capture::state(stream));
245 #endif // CUDA_VERSION >= 10000 263 doesnt_synchronizes_with_default_stream =
false,
264 does_synchronize_with_default_stream =
true,
296 auto status = cuStreamGetFlags(handle_, &flags);
300 + context::detail_::identify(context_handle_, device_id_));
301 return flags & CU_STREAM_NON_BLOCKING;
309 auto status = cuStreamGetPriority(handle_, &the_priority);
313 + context::detail_::identify(context_handle_, device_id_));
329 CAW_SET_SCOPE_CONTEXT(context_handle_);
330 auto status = cuStreamQuery(handle_);
336 case CUDA_ERROR_NOT_READY:
340 "unexpected stream status for " + stream::detail_::identify(handle_, device_id_));
350 bool is_clear()
const {
return !has_work_remaining(); }
356 bool query()
const {
return is_clear(); }
387 template<
typename KernelFunction,
typename... KernelParameters>
389 const KernelFunction& kernel_function,
391 KernelParameters &&... parameters)
const 396 launch_configuration,
397 ::std::forward<KernelParameters>(parameters)...);
416 span<const void*> marshalled_arguments)
const 421 #if CUDA_VERSION >= 10000 429 void graph_launch(
const graph::instance_t& graph_instance)
const;
430 #endif // CUDA_VERSION >= 10000 439 void copy(
void *destination,
const void *source,
size_t num_bytes)
const 445 context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
446 memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
453 if (source.size() < num_bytes) {
454 throw ::std::logic_error(
"Attempt to copy more than the source region's size");
457 copy(destination, source.start(), num_bytes);
467 copy(destination.start(), source, num_bytes);
473 copy(destination, source, source.size());
479 copy(destination, source, source.size());
495 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
496 memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
502 memset(region.data(), byte_value, region.size());
518 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
527 memzero(region.data(), region.size());
558 bool records_timing = event::do_record_timings,
561 # if CUDA_VERSION >= 10000 569 template <
typename Argument>
570 void host_function_call(
void (*
function)(Argument*), Argument* argument)
const 573 stream::detail_::enqueue_function_call(
574 associated_stream, reinterpret_cast<stream::callback_t>(
function), argument);
579 template <
typename Invokable>
580 static void CUDA_CB stream_launched_invoker(
void* type_erased_invokable) {
581 auto invokable =
reinterpret_cast<Invokable*
>(type_erased_invokable);
587 template <
typename Invokable>
590 auto type_erased_invoker =
reinterpret_cast<stream::callback_t>(stream_launched_invoker<Invokable>);
591 stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
594 #if CUDA_VERSION >= 11020 612 memory::region_t allocate(
const memory::pool_t& pool,
size_t num_bytes)
const;
618 void free(
void* region_start)
const 629 #endif // CUDA_VERSION >= 11020 652 const void* managed_region_start,
655 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
659 constexpr
const size_t length = 0;
660 auto flags =
static_cast<unsigned>(attachment);
661 auto status = cuStreamAttachMemAsync(
664 throw_if_error_lazy(status,
"Failed scheduling an attachment of a managed memory region on " 665 + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
666 associated_stream.device_id_));
692 attach_managed_region(region.start(), attachment);
720 template <
typename T>
724 ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
725 "Unsupported type for stream value wait." 727 unsigned flags = with_memory_barrier ?
728 CU_STREAM_WRITE_VALUE_DEFAULT :
729 CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
730 auto result =
static_cast<status_t>(
733 + stream::detail_::identify(associated_stream.handle_,associated_stream.context_handle_,
734 + associated_stream.device_id_));
751 template <
typename T>
755 ::std::is_same<T,int32_t>::value or ::std::is_same<T,int64_t>::value,
756 "Unsupported type for stream value wait." 758 unsigned flags =
static_cast<unsigned>(condition) |
759 (with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
760 auto result =
static_cast<status_t>(
761 stream::detail_::wait_on_value(associated_stream.handle_, address, value, flags));
763 "Failed scheduling a wait on global memory address on " 764 + stream::detail_::identify(
765 associated_stream.handle_,
766 associated_stream.context_handle_,
767 associated_stream.device_id_) );
776 CUstreamBatchMemOpParams op_params;
777 op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
778 op_params.flushRemoteWrites.flags = 0;
779 static const unsigned count = 1;
780 static const unsigned flags = 0;
782 auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
783 throw_if_error_lazy(status,
"scheduling a flush-remote-writes memory operation as a 1-op batch");
786 #if CUDA_VERSION >= 11070 787 void memory_barrier(memory::barrier_scope_t scope)
const 789 CUstreamBatchMemOpParams op_params;
790 op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
791 op_params.memoryBarrier.flags =
static_cast<unsigned>(scope);
792 static const unsigned count = 1;
793 static const unsigned flags = 0;
795 auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
813 template <
typename Iterator>
817 ::std::is_same<typename ::std::iterator_traits<Iterator>::value_type, CUstreamBatchMemOpParams>::value,
818 "Only accepting iterator pairs for the CUDA-driver-API memory operation descriptor," 819 " CUstreamBatchMemOpParams, as the value type");
820 auto num_ops = ::std::distance(ops_begin, ops_end);
821 if (::std::is_same<typename ::std::remove_const<decltype(ops_begin)>::type, CUstreamBatchMemOpParams* >::value,
822 "Only accepting containers of the CUDA-driver-API memory operation descriptor, CUstreamBatchMemOpParams")
824 auto ops_ptr =
reinterpret_cast<const CUstreamBatchMemOpParams*
>(ops_begin);
825 cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_ptr);
828 auto ops_uptr = ::std::unique_ptr<CUstreamBatchMemOpParams[]>(
new CUstreamBatchMemOpParams[num_ops]);
829 ::std::copy(ops_begin, ops_end, ops_uptr.get());
830 cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_uptr.get());
837 template <
typename Container>
840 return single_value_operations_batch(single_value_ops.begin(), single_value_ops.end());
854 #if CUDA_VERSION >= 11000 855 stream::synchronization_policy_t synchronization_policy()
const 857 CAW_SET_SCOPE_CONTEXT(context_handle_);
858 CUstreamAttrValue wrapped_result{};
859 auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
860 throw_if_error_lazy(status, ::std::string(
"Obtaining the synchronization policy of ") + stream::detail_::identify(*
this));
861 return static_cast<stream::synchronization_policy_t
>(wrapped_result.syncPolicy);
864 void set_synchronization_policy(stream::synchronization_policy_t policy)
const 866 CAW_SET_SCOPE_CONTEXT(context_handle_);
867 CUstreamAttrValue wrapped_value{};
868 wrapped_value.syncPolicy =
static_cast<CUsynchronizationPolicy
>(policy);
869 auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
870 throw_if_error_lazy(status, ::std::string(
"Setting the synchronization policy of ") + stream::detail_::identify(*
this));
879 #if CUDA_VERSION >= 10000 885 void begin_capture(stream::capture::mode_t mode = cuda::stream::capture::mode_t::global)
const 887 stream::capture::begin(*
this, mode);
893 bool is_capturing()
const {
return stream::is_capturing(*
this); }
901 graph::template_t end_capture()
const 903 return stream::capture::end(*
this);
905 #endif // CUDA_VERSION >= 10000 913 bool take_ownership =
false,
914 bool hold_primary_context_refcount_unit =
false) noexcept
916 device_id_(device_id),
917 context_handle_(context_handle),
918 handle_(stream_handle),
919 owning_(take_ownership),
920 holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
934 stream_t(other.device_id_, other.context_handle_, other.handle_, other.owning_, other.holds_pc_refcount_unit_)
936 other.owning_ =
false;
937 other.holds_pc_refcount_unit_ =
false;
940 ~
stream_t() DESTRUCTOR_EXCEPTION_SPEC
943 #if THROW_IN_DESTRUCTORS 944 stream::detail_::destroy(handle_, context_handle_, device_id_);
946 stream::detail_::destroy_nothrow(handle_, context_handle_);
949 if (holds_pc_refcount_unit_) {
950 device::primary_context::detail_::decrease_refcount_in_dtor(device_id_);
959 ::std::swap(device_id_, other.device_id_);
960 ::std::swap(context_handle_, other.context_handle_);
961 ::std::swap(handle_, other.handle_);
962 ::std::swap(owning_, other.owning_);
963 ::std::swap(holds_pc_refcount_unit_, holds_pc_refcount_unit_);
974 bool hold_pc_refcount_unit) noexcept;
984 lhs.context_handle_ == rhs.context_handle_
986 and lhs.device_id_ == rhs.device_id_
988 and lhs.handle_ == rhs.handle_;
996 bool holds_pc_refcount_unit_;
1014 return not (lhs == rhs);
1024 bool take_ownership,
1025 bool hold_pc_refcount_unit) noexcept
1027 return { device_id, context_handle, stream_handle, take_ownership, hold_pc_refcount_unit };
1035 bool synchronizes_with_default_stream,
1037 bool hold_pc_refcount_unit =
false)
1039 CAW_SET_SCOPE_CONTEXT(context_handle);
1040 auto new_stream_handle = cuda::stream::detail_::create_raw_in_current_context(
1041 synchronizes_with_default_stream, priority);
1042 return wrap(device_id, context_handle, new_stream_handle, do_take_ownership, hold_pc_refcount_unit);
1046 inline CUresult wait_on_value<uint32_t>(CUstream stream_handle, CUdeviceptr
address, uint32_t value,
unsigned int flags)
1048 return cuStreamWaitValue32(stream_handle, address, value, flags);
1052 inline CUresult wait_on_value<uint64_t>(CUstream stream_handle, CUdeviceptr
address, uint64_t value,
unsigned int flags)
1054 return cuStreamWaitValue64(stream_handle, address, value, flags);
1059 inline CUresult write_value<uint32_t>(CUstream stream_handle, CUdeviceptr
address, uint32_t value,
unsigned int flags)
1061 return cuStreamWriteValue32(stream_handle, address, value, flags);
1065 inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr
address, uint64_t value,
unsigned int flags)
1067 return cuStreamWriteValue64(stream_handle, address, value, flags);
1070 template <
typename Function>
1071 void enqueue_function_call(
const stream_t& stream, Function
function,
void* argument)
1078 #if CUDA_VERSION >= 10000 1079 auto status = cuLaunchHostFunc(stream.
handle(),
function, argument);
1085 static constexpr
const unsigned fixed_flags { 0u };
1086 auto status = cuStreamAddCallback(stream.
handle(),
function, argument, fixed_flags);
1088 throw_if_error_lazy(status,
"Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
1108 bool synchronizes_with_default_stream,
1128 bool synchronizes_with_default_stream,
1130 bool hold_pc_refcount_unit =
false);
1133 #if CUDA_VERSION >= 10000 1136 inline state_t state(
const stream_t& stream)
1138 context::current::detail_::scoped_override_t set_context_for_this_scope(stream.
context_handle());
1139 CUstreamCaptureStatus capture_status;
1140 auto op_status = cuStreamIsCapturing(stream.
handle(), &capture_status);
1141 throw_if_error_lazy(op_status,
"Failed beginning to capture on " + stream::detail_::identify(stream));
1142 return static_cast<state_t
>(capture_status);
1145 inline void begin(
const cuda::stream_t& stream, stream::capture::mode_t mode)
1147 context::current::detail_::scoped_override_t set_context_for_this_scope(stream.
context_handle());
1148 auto status = cuStreamBeginCapture(stream.
handle(),
static_cast<CUstreamCaptureMode
>(mode));
1149 throw_if_error_lazy(status,
"Failed beginning to capture on " + stream::detail_::identify(stream));
1153 #endif // CUDA_VERSION >= 10000 1172 auto status = cuStreamSynchronize(stream.
handle());
1176 #if CUDA_VERSION >= 11000 1189 #endif // CUDA_VERSION >= 11000 1193 #endif // CUDA_API_WRAPPERS_STREAM_HPP_ void memset(void *start, int byte_value, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:492
bool has_work_remaining() const
Determines whether all work on this stream has been completed.
Definition: stream.hpp:327
void copy(void *destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:450
bool query() const
An alias for is_clear() - to conform to how the CUDA runtime API names this functionality.
Definition: stream.hpp:356
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:272
Proxy class for a CUDA stream.
Definition: stream.hpp:258
void memzero(memory::region_t region) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:525
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:269
Wrapper class for a CUDA context.
Definition: context.hpp:249
void synchronize() const
Block or busy-wait until all previously-scheduled work on this stream has been completed.
Definition: stream.hpp:849
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:243
device::id_t count()
Get the number of CUDA devices usable on the system (with the current CUDA library and kernel driver)...
Definition: miscellany.hpp:63
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
void single_value_operations_batch(const Container &single_value_ops) const
Definition: stream.hpp:838
friend bool operator==(const stream_t &lhs, const stream_t &rhs) noexcept
Definition: stream.hpp:981
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
void copy(void *destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:477
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
Wrapper class for a CUDA event.
Definition: event.hpp:147
A gadget through which commands are enqueued on the stream.
Definition: stream.hpp:367
region_t allocate(const context_t &context, size_t size_in_bytes)
Allocate device-side memory on a CUDA device context.
Definition: memory.hpp:102
void wait(const T *address, stream::wait_condition_t condition, T value, bool with_memory_barrier=false) const
Wait for a value in device global memory to change so as to meet some condition.
Definition: stream.hpp:752
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
void wait(const event_t &event)
Have the calling thread wait - either busy-waiting or blocking - and return only after this event has occurred.
Definition: event.hpp:457
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:214
void attach_managed_region(const void *managed_region_start, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:651
bool is_clear() const
The opposite of has_work_remaining()
Definition: stream.hpp:350
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
void kernel_launch(const KernelFunction &kernel_function, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:388
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:980
wait_condition_t
Kinds of conditions to apply to a value in GPU global memory when waiting on that value...
Definition: stream.hpp:62
CUstreamCallback callback_t
The CUDA driver's raw handle for a host-side callback function.
Definition: types.hpp:254
void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead o...
Definition: stream.hpp:814
bool synchronizes_with_default_stream() const
When true, work running in the created stream may run concurrently with work in stream 0 (the NULL stream).
Definition: stream.hpp:293
stream::priority_t priority() const
Definition: stream.hpp:306
stream_t wrap(device::id_t device_id, context::handle_t context_handle, handle_t stream_handle, bool take_ownership=false, bool hold_pc_refcount_unit=false) noexcept
Wrap an existing stream in a stream_t instance.
Definition: stream.hpp:1020
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unless an error has actually occurred.
Definition: error.hpp:327
void copy(memory::region_t destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:471
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Wrappers for getting and setting CUDA's choice of which device is 'current'.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
void memset(memory::region_t region, int byte_value) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:500
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:413
Miscellaneous functionality which does not fit in another file, and does not depend on the main proxy classes.
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled a priori or at runtime.
Definition: kernel.hpp:159
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:246
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay between the event's conclusion and the thread's resumption.
Definition: constants.hpp:70
void flush_remote_writes() const
Guarantee all remote writes to the specified address are visible to subsequent operations scheduled on this stream.
Definition: stream.hpp:774
void attach_managed_region(memory::region_t region, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:688
Can only be used by the process which created it.
Definition: constants.hpp:95
void type_erased_kernel_launch(const kernel_t &kernel, launch_configuration_t launch_configuration, span< const void *> marshalled_arguments) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:413
void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:465
void set_single_value(T *__restrict__ ptr, T value, bool with_memory_barrier=true) const
Schedule writing a single value to global device memory after all previous work has concluded...
Definition: stream.hpp:721
Wrapper class for a CUDA device.
Definition: device.hpp:135
void memzero(void *start, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:516
Fundamental CUDA-related type definitions.
void host_invokable(Invokable &invokable) const
Enqueues a host-invokable object, typically a function or closure object call.
Definition: stream.hpp:588
freestanding wrapper functions for working with CUDA's various kinds of memory spaces, arranged into a relevant namespace hierarchy.
bool is_owning() const noexcept
True if this wrapper is responsible for telling CUDA to destroy the stream upon the wrapper's own destruction.
Definition: stream.hpp:284
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:275
void zero(void *start, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:418
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...