10 #ifndef CUDA_API_WRAPPERS_STREAM_HPP_ 11 #define CUDA_API_WRAPPERS_STREAM_HPP_ 21 #if CUDA_VERSION >= 10000 23 #endif // CUDA_VERSION >= 10000 49 implicitly_synchronizes_with_default_stream =
true,
50 no_implicit_synchronization_with_default_stream =
false,
51 sync = implicitly_synchronizes_with_default_stream,
52 async = no_implicit_synchronization_with_default_stream,
63 greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ,
64 geq = CU_STREAM_WAIT_VALUE_GEQ,
66 equality = CU_STREAM_WAIT_VALUE_EQ,
67 equals = CU_STREAM_WAIT_VALUE_EQ,
69 nonzero_after_applying_bitmask = CU_STREAM_WAIT_VALUE_AND,
70 one_bits_overlap = CU_STREAM_WAIT_VALUE_AND,
71 bitwise_and = CU_STREAM_WAIT_VALUE_AND,
73 zero_bits_overlap = CU_STREAM_WAIT_VALUE_NOR,
74 bitwise_nor = CU_STREAM_WAIT_VALUE_NOR,
78 #if CUDA_VERSION >= 11000 83 enum synchronization_policy_t : typename ::std::underlying_type<CUsynchronizationPolicy>::type {
87 automatic = CU_SYNC_POLICY_AUTO,
97 spin = CU_SYNC_POLICY_SPIN,
108 yield = CU_SYNC_POLICY_YIELD,
116 block = CU_SYNC_POLICY_BLOCKING_SYNC
118 #endif // CUDA_VERSION >= 11000 122 ::std::string identify(
const stream_t& stream);
124 inline handle_t create_raw_in_current_context(
125 bool synchronizes_with_default_stream,
129 const unsigned int flags = (synchronizes_with_default_stream == sync) ?
130 CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
132 auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
133 throw_if_error_lazy(status,
"Failed creating a new stream in " + detail_::identify(new_stream_handle));
134 return new_stream_handle;
137 #if CUDA_VERSION >= 9020 141 auto result = cuStreamGetCtx(stream_handle, &handle);
142 throw_if_error_lazy(result,
"Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
145 #endif // CUDA_VERSION >= 9020 159 inline void record_event_in_current_context(
165 template <
typename Function>
166 void enqueue_function_call(
const stream_t& stream, Function
function,
void * argument);
195 bool take_ownership =
false,
196 bool hold_pc_refcount_unit =
false) noexcept;
203 CUresult wait_on_value(CUstream stream_handle, CUdeviceptr
address, T value,
unsigned int flags);
208 CUresult write_value(CUstream stream_handle, CUdeviceptr address, T value,
unsigned int flags);
212 #if CUDA_VERSION >= 10000 215 inline state_t state(
const stream_t& stream);
223 void begin(
const cuda::stream_t& stream, stream::capture::mode_t mode = cuda::stream::capture::mode_t::global);
228 inline bool is_capturing(
const stream_t& stream)
230 return is_capturing(stream::capture::state(stream));
233 #endif // CUDA_VERSION >= 10000 251 doesnt_synchronizes_with_default_stream =
false,
252 does_synchronize_with_default_stream =
true,
284 auto status = cuStreamGetFlags(handle_, &flags);
288 + context::detail_::identify(context_handle_, device_id_));
289 return flags & CU_STREAM_NON_BLOCKING;
297 auto status = cuStreamGetPriority(handle_, &the_priority);
301 + context::detail_::identify(context_handle_, device_id_));
317 CAW_SET_SCOPE_CONTEXT(context_handle_);
318 auto status = cuStreamQuery(handle_);
324 case CUDA_ERROR_NOT_READY:
328 "unexpected stream status for " + stream::detail_::identify(handle_, device_id_));
338 bool is_clear()
const {
return !has_work_remaining(); }
344 bool query()
const {
return is_clear(); }
375 template<
typename KernelFunction,
typename... KernelParameters>
377 const KernelFunction& kernel_function,
379 KernelParameters &&... parameters)
const 384 launch_configuration,
385 ::std::forward<KernelParameters>(parameters)...);
404 span<const void*> marshalled_arguments)
const 409 #if CUDA_VERSION >= 10000 417 void graph_launch(
const graph::instance_t& graph_instance)
const;
418 #endif // CUDA_VERSION >= 10000 427 void copy(
void *destination,
const void *source,
size_t num_bytes)
const 433 context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
434 memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
441 if (source.size() < num_bytes) {
442 throw ::std::logic_error(
"Attempt to copy more than the source region's size");
445 copy(destination, source.start(), num_bytes);
455 copy(destination.start(), source, num_bytes);
461 copy(destination, source, source.size());
467 copy(destination, source, source.size());
483 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
484 memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
490 memset(region.data(), byte_value, region.size());
506 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
515 memzero(region.data(), region.size());
546 bool records_timing = event::do_record_timings,
549 # if CUDA_VERSION >= 10000 557 template <
typename Argument>
558 void host_function_call(
void (*
function)(Argument*), Argument* argument)
const 561 stream::detail_::enqueue_function_call(
562 associated_stream, reinterpret_cast<stream::callback_t>(
function), argument);
567 template <
typename Invokable>
568 static void CUDA_CB stream_launched_invoker(
void* type_erased_invokable) {
569 auto invokable =
reinterpret_cast<Invokable*
>(type_erased_invokable);
575 template <
typename Invokable>
578 auto type_erased_invoker =
reinterpret_cast<stream::callback_t>(stream_launched_invoker<Invokable>);
579 stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
582 #if CUDA_VERSION >= 11020 601 void free(
void* region_start)
const 611 #endif // CUDA_VERSION >= 11020 634 const void* managed_region_start,
637 CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
641 constexpr
const size_t length = 0;
642 auto flags =
static_cast<unsigned>(attachment);
643 auto status = cuStreamAttachMemAsync(
646 throw_if_error_lazy(status,
"Failed scheduling an attachment of a managed memory region on " 647 + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
648 associated_stream.device_id_));
674 attach_managed_region(region.start(), attachment);
702 template <
typename T>
706 ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
707 "Unsupported type for stream value wait." 709 unsigned flags = with_memory_barrier ?
710 CU_STREAM_WRITE_VALUE_DEFAULT :
711 CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
712 auto result =
static_cast<status_t>(
715 + stream::detail_::identify(associated_stream.handle_,associated_stream.context_handle_,
716 + associated_stream.device_id_));
733 template <
typename T>
737 ::std::is_same<T,int32_t>::value or ::std::is_same<T,int64_t>::value,
738 "Unsupported type for stream value wait." 740 unsigned flags =
static_cast<unsigned>(condition) |
741 (with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
742 auto result =
static_cast<status_t>(
743 stream::detail_::wait_on_value(associated_stream.handle_, address, value, flags));
745 "Failed scheduling a wait on global memory address on " 746 + stream::detail_::identify(
747 associated_stream.handle_,
748 associated_stream.context_handle_,
749 associated_stream.device_id_) );
758 CUstreamBatchMemOpParams op_params;
759 op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
760 op_params.flushRemoteWrites.flags = 0;
761 static const unsigned count = 1;
762 static const unsigned flags = 0;
764 auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
765 throw_if_error_lazy(status,
"scheduling a flush-remote-writes memory operation as a 1-op batch");
768 #if CUDA_VERSION >= 11070 769 void memory_barrier(memory::barrier_scope_t scope)
const 771 CUstreamBatchMemOpParams op_params;
772 op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
773 op_params.memoryBarrier.flags =
static_cast<unsigned>(scope);
774 static const unsigned count = 1;
775 static const unsigned flags = 0;
777 auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
795 template <
typename Iterator>
799 ::std::is_same<typename ::std::iterator_traits<Iterator>::value_type, CUstreamBatchMemOpParams>::value,
800 "Only accepting iterator pairs for the CUDA-driver-API memory operation descriptor," 801 " CUstreamBatchMemOpParams, as the value type");
802 auto num_ops = ::std::distance(ops_begin, ops_end);
803 if (::std::is_same<typename ::std::remove_const<decltype(ops_begin)>::type, CUstreamBatchMemOpParams* >::value,
804 "Only accepting containers of the CUDA-driver-API memory operation descriptor, CUstreamBatchMemOpParams")
806 auto ops_ptr =
reinterpret_cast<const CUstreamBatchMemOpParams*
>(ops_begin);
807 cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_ptr);
810 auto ops_uptr = ::std::unique_ptr<CUstreamBatchMemOpParams[]>(
new CUstreamBatchMemOpParams[num_ops]);
811 ::std::copy(ops_begin, ops_end, ops_uptr.get());
812 cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_uptr.get());
819 template <
typename Container>
822 return single_value_operations_batch(single_value_ops.begin(), single_value_ops.end());
836 #if CUDA_VERSION >= 11000 837 stream::synchronization_policy_t synchronization_policy()
839 CAW_SET_SCOPE_CONTEXT(context_handle_);
840 CUstreamAttrValue wrapped_result{};
841 auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
842 throw_if_error_lazy(status, ::std::string(
"Obtaining the synchronization policy of ") + stream::detail_::identify(*
this));
843 return static_cast<stream::synchronization_policy_t
>(wrapped_result.syncPolicy);
846 void set_synchronization_policy(stream::synchronization_policy_t policy)
848 CAW_SET_SCOPE_CONTEXT(context_handle_);
849 CUstreamAttrValue wrapped_value{};
850 wrapped_value.syncPolicy =
static_cast<CUsynchronizationPolicy
>(policy);
851 auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
852 throw_if_error_lazy(status, ::std::string(
"Setting the synchronization policy of ") + stream::detail_::identify(*
this));
861 #if CUDA_VERSION >= 10000 867 void begin_capture(stream::capture::mode_t mode = cuda::stream::capture::mode_t::global)
const 869 stream::capture::begin(*
this, mode);
875 bool is_capturing()
const {
return stream::is_capturing(*
this); }
883 graph::template_t end_capture()
const 885 return stream::capture::end(*
this);
887 #endif // CUDA_VERSION >= 10000 895 bool take_ownership =
false,
896 bool hold_primary_context_refcount_unit =
false) noexcept
898 device_id_(device_id),
899 context_handle_(context_handle),
900 handle_(stream_handle),
901 owning_(take_ownership),
902 holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
916 stream_t(other.device_id_, other.context_handle_, other.handle_, other.owning_, other.holds_pc_refcount_unit_)
918 other.owning_ =
false;
919 other.holds_pc_refcount_unit_ =
false;
925 CAW_SET_SCOPE_CONTEXT(context_handle_);
926 cuStreamDestroy(handle_);
929 if (holds_pc_refcount_unit_) {
931 device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
935 device::primary_context::detail_::decrease_refcount(device_id_);
945 ::std::swap(device_id_, other.device_id_);
946 ::std::swap(context_handle_, other.context_handle_);
947 ::std::swap(handle_, other.handle_);
948 ::std::swap(owning_, other.owning_);
949 ::std::swap(holds_pc_refcount_unit_, holds_pc_refcount_unit_);
960 bool hold_pc_refcount_unit) noexcept;
970 lhs.context_handle_ == rhs.context_handle_
972 and lhs.device_id_ == rhs.device_id_
974 and lhs.handle_ == rhs.handle_;
982 bool holds_pc_refcount_unit_;
1000 return not (lhs == rhs);
1010 bool take_ownership,
1011 bool hold_pc_refcount_unit) noexcept
1013 return { device_id, context_handle, stream_handle, take_ownership, hold_pc_refcount_unit };
1021 bool synchronizes_with_default_stream,
1023 bool hold_pc_refcount_unit =
false)
1025 CAW_SET_SCOPE_CONTEXT(context_handle);
1026 auto new_stream_handle = cuda::stream::detail_::create_raw_in_current_context(
1027 synchronizes_with_default_stream, priority);
1028 return wrap(device_id, context_handle, new_stream_handle, do_take_ownership, hold_pc_refcount_unit);
1032 inline CUresult wait_on_value<uint32_t>(CUstream stream_handle, CUdeviceptr
address, uint32_t value,
unsigned int flags)
1034 return cuStreamWaitValue32(stream_handle, address, value, flags);
1038 inline CUresult wait_on_value<uint64_t>(CUstream stream_handle, CUdeviceptr
address, uint64_t value,
unsigned int flags)
1040 return cuStreamWaitValue64(stream_handle, address, value, flags);
1045 inline CUresult write_value<uint32_t>(CUstream stream_handle, CUdeviceptr
address, uint32_t value,
unsigned int flags)
1047 return cuStreamWriteValue32(stream_handle, address, value, flags);
1051 inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr
address, uint64_t value,
unsigned int flags)
1053 return cuStreamWriteValue64(stream_handle, address, value, flags);
1056 template <
typename Function>
1057 void enqueue_function_call(
const stream_t& stream, Function
function,
void* argument)
1064 #if CUDA_VERSION >= 10000 1065 auto status = cuLaunchHostFunc(stream.
handle(),
function, argument);
1071 static constexpr
const unsigned fixed_flags { 0u };
1072 auto status = cuStreamAddCallback(stream.
handle(),
function, argument, fixed_flags);
1074 throw_if_error_lazy(status,
"Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
1094 bool synchronizes_with_default_stream,
1114 bool synchronizes_with_default_stream,
1116 bool hold_pc_refcount_unit =
false);
1119 #if CUDA_VERSION >= 10000 1122 inline state_t state(
const stream_t& stream)
1124 context::current::detail_::scoped_override_t set_context_for_this_scope(stream.
context_handle());
1125 CUstreamCaptureStatus capture_status;
1126 auto op_status = cuStreamIsCapturing(stream.
handle(), &capture_status);
1127 throw_if_error_lazy(op_status,
"Failed beginning to capture on " + stream::detail_::identify(stream));
1128 return static_cast<state_t
>(capture_status);
1131 inline void begin(
const cuda::stream_t& stream, stream::capture::mode_t mode)
1133 context::current::detail_::scoped_override_t set_context_for_this_scope(stream.
context_handle());
1134 auto status = cuStreamBeginCapture(stream.
handle(),
static_cast<CUstreamCaptureMode
>(mode));
1135 throw_if_error_lazy(status,
"Failed beginning to capture on " + stream::detail_::identify(stream));
1139 #endif // CUDA_VERSION >= 10000 1158 auto status = cuStreamSynchronize(stream.
handle());
1162 #if CUDA_VERSION >= 11000 1175 #endif // CUDA_VERSION >= 11000 1179 #endif // CUDA_API_WRAPPERS_STREAM_HPP_ void memset(void *start, int byte_value, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:480
bool has_work_remaining() const
Determines whether all work on this stream has been completed.
Definition: stream.hpp:315
void copy(void *destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:438
bool query() const
An alias for is_clear() - to conform to how the CUDA runtime API names this functionality.
Definition: stream.hpp:344
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:260
Proxy class for a CUDA stream.
Definition: stream.hpp:246
void memzero(memory::region_t region) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:513
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:257
Wrapper class for a CUDA context.
Definition: context.hpp:244
void synchronize() const
Block or busy-wait until all previously-scheduled work on this stream has been completed.
Definition: stream.hpp:831
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:246
device::id_t count()
Get the number of CUDA devices usable on the system (with the current CUDA library and kernel driver)...
Definition: miscellany.hpp:63
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1960
void single_value_operations_batch(const Container &single_value_ops) const
Definition: stream.hpp:820
friend bool operator==(const stream_t &lhs, const stream_t &rhs) noexcept
Definition: stream.hpp:967
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
void copy(void *destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:465
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
Wrapper class for a CUDA event.
Definition: event.hpp:133
A gadget through which commands are enqueued on the stream.
Definition: stream.hpp:355
region_t allocate(const context_t &context, size_t size_in_bytes)
Allocate device-side memory on a CUDA device context.
Definition: memory.hpp:106
void wait(const T *address, stream::wait_condition_t condition, T value, bool with_memory_barrier=false) const
Wait for a value in device global memory to change so as to meet some condition.
Definition: stream.hpp:734
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
void wait(const event_t &event)
Have the calling thread wait - either busy-waiting or blocking - and return only after this event has...
Definition: event.hpp:467
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:130
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:217
void attach_managed_region(const void *managed_region_start, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:633
bool is_clear() const
The opposite of has_work_remaining()
Definition: stream.hpp:338
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:625
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
void kernel_launch(const KernelFunction &kernel_function, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:376
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:968
wait_condition_t
Kinds of conditions to apply to a value in GPU global memory when waiting on that value...
Definition: stream.hpp:62
CUstreamCallback callback_t
The CUDA driver's raw handle for a host-side callback function.
Definition: types.hpp:257
void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead o...
Definition: stream.hpp:796
bool synchronizes_with_default_stream() const
When true, work running in the created stream may run concurrently with work in stream 0 (the NULL st...
Definition: stream.hpp:281
stream::priority_t priority() const
Definition: stream.hpp:294
stream_t wrap(device::id_t device_id, context::handle_t context_handle, handle_t stream_handle, bool take_ownership=false, bool hold_pc_refcount_unit=false) noexcept
Wrap an existing stream in a stream_t instance.
Definition: stream.hpp:1006
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
void copy(memory::region_t destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:459
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Wrappers for getting and setting CUDA's choice of which device is 'current'.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1962
void memset(memory::region_t region, int byte_value) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:488
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:682
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:411
Miscellaneous functionality which does not fit in another file, and does not depend on the main proxy...
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:249
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
void flush_remote_writes() const
Guarantee all remote writes to the specified address are visible to subsequent operations scheduled o...
Definition: stream.hpp:756
void attach_managed_region(memory::region_t region, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:670
Can only be used by the process which created it.
Definition: constants.hpp:95
void type_erased_kernel_launch(const kernel_t &kernel, launch_configuration_t launch_configuration, span< const void *> marshalled_arguments) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:401
void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:453
void set_single_value(T *__restrict__ ptr, T value, bool with_memory_barrier=true) const
Schedule writing a single value to global device memory after all previous work has concluded...
Definition: stream.hpp:703
Wrapper class for a CUDA device.
Definition: device.hpp:135
void memzero(void *start, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:504
Fundamental CUDA-related type definitions.
void host_invokable(Invokable &invokable) const
Enqueues a host-invokable object, typically a function or closure object call.
Definition: stream.hpp:576
freestanding wrapper functions for working with CUDA's various kinds of memory spaces, arranged into a relevant namespace hierarchy.
bool is_owning() const noexcept
True if this wrapper is responsible for telling CUDA to destroy the stream upon the wrapper's own des...
Definition: stream.hpp:272
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:77
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:263
void zero(void *start, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:416
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1975
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...