cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
|
A gadget through which commands are enqueued on the stream. More...
#include <stream.hpp>
Public Member Functions | |
template<typename KernelFunction , typename... KernelParameters> | |
void | kernel_launch (const KernelFunction &kernel_function, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const |
Schedule a kernel launch on the associated stream. More... | |
void | type_erased_kernel_launch (const kernel_t &kernel, launch_configuration_t launch_configuration, span< const void *> marshalled_arguments) const |
Schedule a kernel launch on the associated stream. More... | |
void | memset (void *start, int byte_value, size_t num_bytes) const |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to a single fixed value. More... | |
void | memset (memory::region_t region, int byte_value) const |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to a single fixed value. More... | |
void | memzero (void *start, size_t num_bytes) const |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to zero. More... | |
void | memzero (memory::region_t region) const |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to zero. More... | |
event_t & | event (event_t &existing_event) const |
Have an event 'fire', i.e. More... | |
event_t | event (bool uses_blocking_sync=event::sync_by_busy_waiting, bool records_timing=event::do_record_timings, bool interprocess=event::not_interprocess) const |
Have an event 'fire', i.e. More... | |
template<typename Invokable > | |
void | host_invokable (Invokable &invokable) const |
Enqueues a host-invokable object, typically a function or closure object call. | |
void | attach_managed_region (const void *managed_region_start, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const |
Sets the attachment of a region of managed memory (i.e. More... | |
void | attach_managed_region (memory::region_t region, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const |
Sets the attachment of a region of managed memory (i.e. More... | |
void | wait (const event_t &event_) const |
Will pause all further activity on the stream until the specified event has occurred (i.e. More... | |
template<typename T > | |
void | set_single_value (T *__restrict__ ptr, T value, bool with_memory_barrier=true) const |
Schedule writing a single value to global device memory after all previous work has concluded. More... | |
template<typename T > | |
void | wait (const T *address, stream::wait_condition_t condition, T value, bool with_memory_barrier=false) const |
Wait for a value in device global memory to change so as to meet some condition. More... | |
void | flush_remote_writes () const |
Guarantee all remote writes are visible to subsequent operations scheduled on this stream. | |
void | copy (void *destination, const void *source, size_t num_bytes) const |
Copy operations. More... | |
void | copy (void *destination, memory::const_region_t source, size_t num_bytes) const |
Copy operations. | |
void | copy (memory::region_t destination, memory::const_region_t source, size_t num_bytes) const |
Copy operations. More... | |
void | copy (memory::region_t destination, memory::const_region_t source) const |
Copy operations. | |
void | copy (void *destination, memory::const_region_t source) const |
Copy operations. | |
template<typename Iterator > | |
void | single_value_operations_batch (Iterator ops_begin, Iterator ops_end) const |
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead of multiple enqueue calls). More... | |
template<typename Container > | |
void | single_value_operations_batch (const Container &single_value_ops) const |
A gadget through which commands are enqueued on the stream.
Example usage: my_stream.enqueue.copy(foo, bar, my_size)
|
inline |
Sets the attachment of a region of managed memory (i.e. in the address space visible on all CUDA devices and the host) in one of several supported attachment modes.
managed_region_start | a pointer to the beginning of the managed memory region. This cannot be a pointer to anywhere in the middle of an allocated region - you must pass whatever cuda::memory::managed::allocate() returned. |
The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself that it doesn't need to worry about accesses to this memory from devices other than its object of attachment, so that the driver can optimize scheduling accordingly.
|
inline |
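Sets the attachment of a region of managed memory (i.e. in the address space visible on all CUDA devices and the host) in one of several supported attachment modes.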
region | the entire managed memory region; note this must not be a sub-region; you must pass whatever the CUDA memory allocation or construction code provided you with, in full. |
The attachment is actually a commitment vis-a-vis the CUDA driver and the GPU itself that it doesn't need to worry about accesses to this memory from devices other than its object of attachment, so that the driver can optimize scheduling accordingly.
|
inline |
Copy operations.
The source and destination memory regions may be anywhere the CUDA driver can map (e.g. the device's global memory, host/system memory, the global memory of another device, constant memory etc.). Schedule a copy of one region of memory to another.
|
inline |
Copy operations.
num_bytes may be smaller than the sizes of any of the regions.
Have an event 'fire', i.e. be marked as having occurred, after all hitherto-scheduled work on this stream has been completed. Threads which are waiting on the event (via the wait method) will become available for continued execution.
existing_event | A pre-created CUDA event (for the stream's device); any existing "registration" of the event to occur elsewhere is overwritten. |
|
inline |
Have an event 'fire', i.e. be marked as having occurred, after all hitherto-scheduled work on this stream has been completed. Threads which are waiting on the event (via the wait method) will become available for continued execution.
|
inline |
Schedule a kernel launch on the associated stream.
kernel_function | A wrapper around the kernel to launch |
launch_configuration | A description of how to launch the kernel (e.g. block and grid dimensions). |
parameters | the arguments to be passed to the kernel for this launch |
|
inline |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to a single fixed value.
start | Beginning of the region to fill |
byte_value | the value with which to fill the memory region bytes |
num_bytes | size in bytes of the region to fill |
|
inline |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to a single fixed value.
region | the region of memory to fill |
byte_value | the value with which to fill the memory region bytes |
|
inline |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to zero.
start | Beginning of the region to fill |
num_bytes | size of the region to fill |
|
inline |
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to do it) to zero.
region | the region of memory to fill with zeros |
|
inline |
Schedule writing a single value to global device memory after all previous work has concluded.
T | the type of the value to schedule a setting of. Can only be a raw uint32_t or uint64_t! |
ptr | location in global device memory to set at the appropriate time. |
value | the value to write at the location ptr points to. |
with_memory_barrier | if false, allows reordering of this write operation with writes scheduled before it. |
|
inline |
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead of multiple enqueue calls).
ops_begin | beginning of a sequence of single-value operation specifications |
ops_end | end of a sequence of single-value operation specifications |
|
inline |
single_value_ops | A sequence of single-value operation specifiers to enqueue together. |
|
inline |
Schedule a kernel launch on the associated stream.
kernel | A wrapper around the kernel to launch |
launch_configuration | A description of how to launch the kernel (e.g. block and grid dimensions). |
marshalled_arguments | Pointers to arguments to be passed to the kernel for this launch |
|
inline |
Will pause all further activity on the stream until the specified event has occurred (i.e. has fired, meaning that all work scheduled before it, on the stream on which it was recorded, has been completed).
event_ | the event for whose occurrence to wait; the event would typically be recorded on another stream. |
|
inline |
Wait for a value in device global memory to change so as to meet some condition.
T | the type of the value to wait on. Can only be a raw uint32_t or uint64_t! |
address | location in global device memory whose value is checked against the condition. |
condition | the kind of condition to check against the reference value. Examples: equal to 5, greater-or-equal to 5, non-zero bitwise-and with 5 etc. |
value | the condition is checked against this reference value. Example: waiting on the value at address to be greater-or-equal to this value. |
with_memory_barrier | If true, all remote writes guaranteed to have reached the device before the wait is performed will be visible to all operations on this stream/queue scheduled after the wait. |