/**
 * @file stream.hpp
 *
 * cuda-api-wrappers: thin C++-flavored wrappers for the CUDA Runtime API.
 * A proxy class for CUDA streams, and related free-standing functions.
 */

#pragma once
#ifndef CUDA_API_WRAPPERS_STREAM_HPP_
#define CUDA_API_WRAPPERS_STREAM_HPP_

#include "current_context.hpp"
#include "current_device.hpp"
#include "error.hpp"
#include "kernel_launch.hpp"
#include "memory.hpp"
#include "miscellany.hpp"
#include "types.hpp"

#if CUDA_VERSION >= 10000
#endif // CUDA_VERSION >= 10000

#include <string>
#include <memory>
#include <utility>
#include <tuple>
#include <algorithm>

namespace cuda {

class device_t;
class event_t;
class stream_t;

namespace memory {

class pool_t;

} // namespace memory

namespace stream {

// Use this for the second argument to create_on_current_device()
enum : bool {
    implicitly_synchronizes_with_default_stream = true,
    no_implicit_synchronization_with_default_stream = false,
    sync = implicitly_synchronizes_with_default_stream,
    async = no_implicit_synchronization_with_default_stream,
    blocking = sync,
    nonblocking = async,
};
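
// Usage sketch (illustrative): these named values serve as the boolean
// second argument of the stream-creation functions declared further below,
// making call sites self-documenting. Assuming `dev` is a valid device_t:
//
//   auto work_queue = cuda::stream::create(dev, cuda::stream::async);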

/// Kinds of conditions to apply to a value in GPU global memory when waiting
/// on that value, e.g. with stream_t::enqueue_t::wait()
enum wait_condition_t : unsigned {
    greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ,
    geq = CU_STREAM_WAIT_VALUE_GEQ,

    equality = CU_STREAM_WAIT_VALUE_EQ,
    equals = CU_STREAM_WAIT_VALUE_EQ,

    nonzero_after_applying_bitmask = CU_STREAM_WAIT_VALUE_AND,
    one_bits_overlap = CU_STREAM_WAIT_VALUE_AND,
    bitwise_and = CU_STREAM_WAIT_VALUE_AND,

    zero_bits_overlap = CU_STREAM_WAIT_VALUE_NOR,
    bitwise_nor = CU_STREAM_WAIT_VALUE_NOR,
};
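
// Usage sketch (illustrative): a wait condition is passed to
// stream_t::enqueue_t::wait(), defined further below. Assuming `stream` is
// a stream_t and `flag` points to device-accessible memory:
//
//   stream.enqueue.wait(flag, cuda::stream::wait_condition_t::equals, uint32_t{1});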

#if CUDA_VERSION >= 11000

/// Possible synchronization behaviors of a host thread when it synchronizes
/// with a stream
enum synchronization_policy_t : typename ::std::underlying_type<CUsynchronizationPolicy>::type {
    /// Let the CUDA driver choose a heuristic for the synchronizing host thread
    automatic = CU_SYNC_POLICY_AUTO,

    /// Have the synchronizing host thread busy-wait (spin) until the stream's
    /// work concludes
    spin = CU_SYNC_POLICY_SPIN,

    /// Have the synchronizing host thread yield its time-slice to other
    /// threads while waiting
    yield = CU_SYNC_POLICY_YIELD,

    /// Have the synchronizing host thread block on a synchronization primitive
    /// until the stream's work concludes
    block = CU_SYNC_POLICY_BLOCKING_SYNC
};
#endif // CUDA_VERSION >= 11000

namespace detail_ {

::std::string identify(const stream_t& stream);

inline handle_t create_raw_in_current_context(
    bool synchronizes_with_default_stream,
    priority_t priority = stream::default_priority)
{
    const unsigned int flags = (synchronizes_with_default_stream == sync) ?
        CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
    handle_t new_stream_handle;
    auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
    throw_if_error_lazy(status, "Failed creating a new stream in the current context");
    return new_stream_handle;
}

#if CUDA_VERSION >= 9020
inline context::handle_t context_handle_of(stream::handle_t stream_handle)
{
    context::handle_t handle;
    auto result = cuStreamGetCtx(stream_handle, &handle);
    throw_if_error_lazy(result, "Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
    return handle;
}
#endif // CUDA_VERSION >= 9020

inline device::id_t device_id_of(stream::handle_t stream_handle);

inline void record_event_in_current_context(
    device::id_t current_device_id,
    context::handle_t current_context_handle_,
    stream::handle_t stream_handle,
    event::handle_t event_handle);

template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void * argument);

} // namespace detail_

/// Wrap an existing stream in a stream_t instance
stream_t wrap(
    device::id_t device_id,
    context::handle_t context_handle,
    handle_t stream_handle,
    bool take_ownership = false,
    bool hold_pc_refcount_unit = false) noexcept;
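
// Usage sketch (illustrative): adopting a raw stream handle obtained
// elsewhere, without taking ownership - so that destruction of the wrapper
// will not destroy the underlying stream. `dev_id`, `ctx_handle` and
// `raw_stream` are assumed to be valid values obtained separately:
//
//   auto stream = cuda::stream::wrap(dev_id, ctx_handle, raw_stream);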

namespace detail_ {

// Providing the same signature to multiple CUDA driver calls, to allow
// uniform templated use of all of them
template<typename T>
CUresult wait_on_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);

// Providing the same signature to multiple CUDA driver calls, to allow
// uniform templated use of all of them
template<typename T>
CUresult write_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);

} // namespace detail_

#if CUDA_VERSION >= 10000
namespace capture {

inline state_t state(const stream_t& stream);

/// Begin capturing work enqueued on the stream into a graph, rather than
/// executing it
void begin(const cuda::stream_t& stream, stream::capture::mode_t mode = cuda::stream::capture::mode_t::global);

/// Conclude the capture sequence on the stream, obtaining the captured graph template
graph::template_t end(const cuda::stream_t& stream);

} // namespace capture

/// True if the stream is currently capturing enqueued work into a graph
inline bool is_capturing(const stream_t& stream)
{
    return is_capturing(stream::capture::state(stream));
}

#endif // CUDA_VERSION >= 10000
} // namespace stream

inline void synchronize(const stream_t& stream);

/// Proxy class for a CUDA stream
class stream_t {

public: // type definitions

    enum : bool {
        doesnt_synchronizes_with_default_stream = false,
        does_synchronize_with_default_stream = true,
    };

public: // const getters

    /// The raw CUDA handle for the stream which this class wraps
    stream::handle_t handle() const noexcept { return handle_; }

    /// The raw CUDA handle for the context in which the represented stream is defined
    context::handle_t context_handle() const noexcept { return context_handle_; }

    /// The raw CUDA ID for the device w.r.t. which the stream is defined
    device::id_t device_id() const noexcept { return device_id_; }

    /// The device w.r.t. which the stream is defined
    device_t device() const noexcept;

    /// The context in which this stream was defined
    context_t context() const noexcept;

    /// True if this wrapper is responsible for telling CUDA to destroy the
    /// stream upon the wrapper's own destruction
    bool is_owning() const noexcept { return owning_; }

public: // other non-mutators

    /// True if the stream synchronizes implicitly with the default (NULL)
    /// stream, i.e. if work on this stream will not run concurrently with
    /// work enqueued on stream 0
    bool synchronizes_with_default_stream() const
    {
        unsigned int flags;
        auto status = cuStreamGetFlags(handle_, &flags);
        // Could have used the equivalent Runtime API call:
        // cudaStreamGetFlags(stream, &flags);
        throw_if_error_lazy(status, "Failed obtaining flags for a stream in "
            + context::detail_::identify(context_handle_, device_id_));
        return not (flags & CU_STREAM_NON_BLOCKING);
    }

    /// The scheduling priority of this stream - with lower numeric values
    /// meaning higher priority
    stream::priority_t priority() const
    {
        int the_priority;
        auto status = cuStreamGetPriority(handle_, &the_priority);
        // Could have used the equivalent Runtime API call:
        // cudaStreamGetPriority(stream, &the_priority);
        throw_if_error_lazy(status, "Failed obtaining priority for a stream in "
            + context::detail_::identify(context_handle_, device_id_));
        return the_priority;
    }

    /// Determines whether work scheduled on this stream is still pending,
    /// i.e. has not yet been completed
    bool has_work_remaining() const
    {
        CAW_SET_SCOPE_CONTEXT(context_handle_);
        auto status = cuStreamQuery(handle_);
        // Could have used the equivalent Runtime API call:
        // cudaStreamQuery(stream);
        switch(status) {
        case CUDA_SUCCESS:
            return false;
        case CUDA_ERROR_NOT_READY:
            return true;
        default:
            throw cuda::runtime_error(static_cast<cuda::status::named_t>(status),
                "unexpected stream status for " + stream::detail_::identify(handle_, device_id_));
        }
    }

    /// The opposite of has_work_remaining()
    bool is_clear() const { return !has_work_remaining(); }

    /// An alias for is_clear() - to conform to how the CUDA Runtime API names
    /// this functionality
    bool query() const { return is_clear(); }
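
    // Usage sketch (illustrative): the status getters above support host-side
    // polling; assuming `stream` is a stream_t and do_host_work() is some
    // hypothetical host-side task, CPU and GPU work can be overlapped with:
    //
    //   while (stream.has_work_remaining()) { do_host_work(); }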

public: // mutators

    /// A gadget through which commands are enqueued on the stream
    class enqueue_t {
    protected:
        const stream_t& associated_stream;

    public:
        enqueue_t(const stream_t& stream) : associated_stream(stream) {}

        /// Schedule a kernel launch on the associated stream
        template<typename KernelFunction, typename... KernelParameters>
        void kernel_launch(
            const KernelFunction& kernel_function,
            launch_configuration_t launch_configuration,
            KernelParameters &&... parameters) const
        {
            return cuda::enqueue_launch(
                kernel_function,
                associated_stream,
                launch_configuration,
                ::std::forward<KernelParameters>(parameters)...);
        }
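
        // Usage sketch (illustrative): assuming `my_kernel` is a __global__
        // function, `config` is a launch_configuration_t and `stream` is a
        // stream_t, a launch is scheduled through the stream's `enqueue` member:
        //
        //   stream.enqueue.kernel_launch(my_kernel, config, arg1, arg2);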

        /// Schedule a kernel launch on the associated stream, with the
        /// arguments pre-marshalled into an array of (type-erased) pointers
        void type_erased_kernel_launch(
            const kernel_t& kernel,
            launch_configuration_t launch_configuration,
            span<const void*> marshalled_arguments) const
        {
            cuda::launch_type_erased(kernel, associated_stream, launch_configuration, marshalled_arguments);
        }

#if CUDA_VERSION >= 10000
        /// Schedule the execution of an instantiated execution graph on the
        /// associated stream
        void graph_launch(const graph::instance_t& graph_instance) const;
#endif // CUDA_VERSION >= 10000

        void copy(void *destination, const void *source, size_t num_bytes) const
        {
            // CUDA doesn't seem to need us to be in the stream's context to enqueue the copy;
            // however, unfortunately, it does require us to be in _some_ context.
            context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
            memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
        }

        void copy(void* destination, memory::const_region_t source, size_t num_bytes) const
        {
#ifndef NDEBUG
            if (source.size() < num_bytes) {
                throw ::std::logic_error("Attempt to copy more than the source region's size");
            }
#endif
            copy(destination, source.start(), num_bytes);
        }

        void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
        {
            copy(destination.start(), source, num_bytes);
        }

        void copy(memory::region_t destination, memory::const_region_t source) const
        {
            copy(destination, source, source.size());
        }

        void copy(void* destination, memory::const_region_t source) const
        {
            copy(destination, source, source.size());
        }
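
        // Usage sketch (illustrative): assuming `dst` is a memory::region_t
        // no smaller than `src`, a memory::const_region_t (e.g. both obtained
        // from this library's allocation functions), an asynchronous copy is
        // enqueued with:
        //
        //   stream.enqueue.copy(dst, src);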

        /// Set all bytes of a certain region in device memory (or unified
        /// memory, but using the CUDA device to do it) to a single byte value
        void memset(void *start, int byte_value, size_t num_bytes) const
        {
            // Is it necessary to set the device? I wonder.
            CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
            memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
        }

        void memset(memory::region_t region, int byte_value) const
        {
            memset(region.data(), byte_value, region.size());
        }

        /// Set all bytes of a certain region in device (or unified) memory to zero
        void memzero(void *start, size_t num_bytes) const
        {
            CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
            memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
        }

        void memzero(memory::region_t region) const
        {
            memzero(region.data(), region.size());
        }

        /// Have an existing event occur ("fire") once all previously-enqueued
        /// work on this stream has concluded
        event_t& event(event_t& existing_event) const;

        /// Create a new event, and have it occur once all previously-enqueued
        /// work on this stream has concluded
        event_t event(
            bool uses_blocking_sync = event::sync_by_busy_waiting,
            bool records_timing = event::do_record_timings,
            bool interprocess = event::not_interprocess) const;

#if CUDA_VERSION >= 10000
        /// Schedule a host function call (of the signature shown) to take
        /// place once all previously-enqueued work on this stream has concluded
        template <typename Argument>
        void host_function_call(void (*function)(Argument*), Argument* argument) const
        {
            // I hope you like function declaration punning :-)
            stream::detail_::enqueue_function_call(
                associated_stream, reinterpret_cast<stream::callback_t>(function), argument);
        }
#endif

    private:
        template <typename Invokable>
        static void CUDA_CB stream_launched_invoker(void* type_erased_invokable) {
            auto invokable = reinterpret_cast<Invokable*>(type_erased_invokable);
            (*invokable)();
        }

    public:
        /// Enqueue a host-invokable object - typically, a function or a closure
        /// (lambda) object - to be called once all previously-enqueued work on
        /// this stream has concluded
        template <typename Invokable>
        void host_invokable(Invokable& invokable) const
        {
            auto type_erased_invoker = reinterpret_cast<stream::callback_t>(stream_launched_invoker<Invokable>);
            stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
        }
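
        // Usage sketch (illustrative): enqueue a host-side closure to run once
        // all previously-enqueued work concludes. The invokable is passed by
        // reference, so it must stay alive until it actually executes:
        //
        //   auto notify = []{ ::std::puts("stream work done"); };
        //   stream.enqueue.host_invokable(notify);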

#if CUDA_VERSION >= 11020
        /// Schedule an allocation of device-side memory on the associated
        /// stream (stream-ordered allocation)
        memory::region_t allocate(size_t num_bytes) const
        {
            return memory::device::allocate(num_bytes, associated_stream);
        }

        memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes);

        /// Schedule a freeing of device-side memory on the associated stream
        void free(void* region_start) const
        {
            memory::device::free(region_start, associated_stream);
        }

        void free(memory::region_t region) const
        {
            memory::device::free(region, associated_stream);
        }
#endif // CUDA_VERSION >= 11020
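
        // Usage sketch (illustrative) of stream-ordered allocation, available
        // with CUDA 11.2 and later: the region may be used by work enqueued
        // on this same stream between the allocation and the free:
        //
        //   auto region = stream.enqueue.allocate(1024);
        //   // ... enqueue kernels reading/writing region.start() ...
        //   stream.enqueue.free(region);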

        /// Sets the attachment of a region of managed memory (i.e. in the
        /// unified virtual address space) to the associated stream
        void attach_managed_region(
            const void* managed_region_start,
            memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
        {
            CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
            // This fixed value is required by CUDA, to indicate that the
            // entire memory region, rather than a part of it, will be attached
            // to this stream
            constexpr const size_t length = 0;
            auto flags = static_cast<unsigned>(attachment);
            auto status = cuStreamAttachMemAsync(
                associated_stream.handle_, memory::device::address(managed_region_start), length, flags);
            // Could have used the equivalent Runtime API call, cudaStreamAttachMemAsync()
            throw_if_error_lazy(status, "Failed scheduling an attachment of a managed memory region on "
                + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
                    associated_stream.device_id_));
        }

        /// Sets the attachment of a region of managed memory to the associated stream
        void attach_managed_region(
            memory::region_t region,
            memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
        {
            attach_managed_region(region.start(), attachment);
        }

        /// Will pause all further work enqueued on this stream until the
        /// specified event has occurred
        void wait(const event_t& event_) const;

        /// Schedule writing a single value to global device memory after all
        /// previous work on this stream has concluded
        template <typename T>
        void set_single_value(T* __restrict__ ptr, T value, bool with_memory_barrier = true) const
        {
            static_assert(
                ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
                "Unsupported type for stream value write."
            );
            unsigned flags = with_memory_barrier ?
                CU_STREAM_WRITE_VALUE_DEFAULT :
                CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
            auto result = static_cast<status_t>(
                stream::detail_::write_value(associated_stream.handle_, memory::device::address(ptr), value, flags));
            throw_if_error_lazy(result, "Failed scheduling a write to global memory on "
                + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
                    associated_stream.device_id_));
        }

        /// Wait for a value in device global memory to change so as to meet
        /// some condition
        template <typename T>
        void wait(const T* address, stream::wait_condition_t condition, T value, bool with_memory_barrier = false) const
        {
            static_assert(
                ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
                "Unsupported type for stream value wait."
            );
            unsigned flags = static_cast<unsigned>(condition) |
                (with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
            auto result = static_cast<status_t>(
                stream::detail_::wait_on_value(associated_stream.handle_, memory::device::address(address), value, flags));
            throw_if_error_lazy(result,
                "Failed scheduling a wait on global memory address on "
                + stream::detail_::identify(
                    associated_stream.handle_,
                    associated_stream.context_handle_,
                    associated_stream.device_id_));
        }
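
        // Usage sketch (illustrative) of a producer-consumer handoff: assuming
        // `writer` and `reader` are stream_t's and `flag` points to
        // device-accessible uint32_t storage, the reader stream pauses until
        // the writer stream signals:
        //
        //   writer.enqueue.set_single_value(flag, uint32_t{1});
        //   reader.enqueue.wait(flag, cuda::stream::wait_condition_t::equals, uint32_t{1});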

        /// Guarantee all remote writes to the specified address are visible
        /// to subsequent operations scheduled on this stream
        void flush_remote_writes() const
        {
            CUstreamBatchMemOpParams op_params;
            op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
            op_params.flushRemoteWrites.flags = 0;
            static const unsigned count = 1;
            static const unsigned flags = 0;
            // Let's cross our fingers and assume nothing else needs to be set here...
            auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
            throw_if_error_lazy(status, "scheduling a flush-remote-writes memory operation as a 1-op batch");
        }

#if CUDA_VERSION >= 11070
        void memory_barrier(memory::barrier_scope_t scope) const
        {
            CUstreamBatchMemOpParams op_params;
            op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
            op_params.memoryBarrier.flags = static_cast<unsigned>(scope);
            static const unsigned count = 1;
            static const unsigned flags = 0;
            // Let's cross our fingers and assume nothing else needs to be set here...
            auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
            throw_if_error_lazy(status, "scheduling a memory barrier operation as a 1-op batch");
        }
#endif

        /// Enqueue multiple single-value write, wait and flush operations to
        /// the device, as a batch (avoiding the overhead of separate enqueue calls)
        template <typename Iterator>
        void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
        {
            static_assert(
                ::std::is_same<typename ::std::iterator_traits<Iterator>::value_type, CUstreamBatchMemOpParams>::value,
                "Only accepting iterator pairs for the CUDA-driver-API memory operation descriptor,"
                " CUstreamBatchMemOpParams, as the value type");
            auto num_ops = static_cast<unsigned>(::std::distance(ops_begin, ops_end));
            static const unsigned batch_flags = 0;
            if (::std::is_same<typename ::std::remove_const<decltype(ops_begin)>::type, CUstreamBatchMemOpParams*>::value)
            {
                // The iterators are raw pointers into an existing, contiguous
                // array of operation descriptors, which the driver call can use directly
                auto ops_ptr = reinterpret_cast<CUstreamBatchMemOpParams*>(ops_begin);
                auto status = cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_ptr, batch_flags);
                throw_if_error_lazy(status, "scheduling a batch of single-value memory operations");
            }
            else {
                // Otherwise, copy the descriptors into a contiguous array first
                auto ops_uptr = ::std::unique_ptr<CUstreamBatchMemOpParams[]>(new CUstreamBatchMemOpParams[num_ops]);
                ::std::copy(ops_begin, ops_end, ops_uptr.get());
                auto status = cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_uptr.get(), batch_flags);
                throw_if_error_lazy(status, "scheduling a batch of single-value memory operations");
            }
        }

        /// Enqueue a batch of single-value operations from a container of
        /// CUstreamBatchMemOpParams descriptors
        template <typename Container>
        void single_value_operations_batch(const Container& single_value_ops) const
        {
            return single_value_operations_batch(single_value_ops.begin(), single_value_ops.end());
        }

    }; // class enqueue_t

    /// Block or busy-wait until all previously-scheduled work on this stream
    /// has been completed
    void synchronize() const
    {
        cuda::synchronize(*this);
    }

#if CUDA_VERSION >= 11000
    stream::synchronization_policy_t synchronization_policy()
    {
        CAW_SET_SCOPE_CONTEXT(context_handle_);
        CUstreamAttrValue wrapped_result{};
        auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
        throw_if_error_lazy(status, ::std::string("Obtaining the synchronization policy of ") + stream::detail_::identify(*this));
        return static_cast<stream::synchronization_policy_t>(wrapped_result.syncPolicy);
    }

    void set_synchronization_policy(stream::synchronization_policy_t policy)
    {
        CAW_SET_SCOPE_CONTEXT(context_handle_);
        CUstreamAttrValue wrapped_value{};
        wrapped_value.syncPolicy = static_cast<CUsynchronizationPolicy>(policy);
        auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
        throw_if_error_lazy(status, ::std::string("Setting the synchronization policy of ") + stream::detail_::identify(*this));
    }
#endif

    // TODO: Create a dummy capture object, then we could have capture.start(), capture.stop(),
    // capture.status(), and perhaps a capture_() which takes a lambda. Also, offer a
    // cuda::stream::capture(const stream_t& stream, F f) template!

#if CUDA_VERSION >= 10000
    /// Begin capturing work enqueued on this stream into a graph, rather
    /// than executing it
    void begin_capture(stream::capture::mode_t mode = cuda::stream::capture::mode_t::global) const
    {
        stream::capture::begin(*this, mode);
    }

    /// True if this stream is currently capturing enqueued work into a graph
    bool is_capturing() const { return stream::is_capturing(*this); }

    /// Conclude the capture sequence on this stream, obtaining the captured
    /// graph template
    graph::template_t end_capture() const
    {
        return stream::capture::end(*this);
    }
#endif // CUDA_VERSION >= 10000
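
    // Usage sketch (illustrative), for CUDA 10.0 and later: work enqueued
    // between begin_capture() and end_capture() is recorded into a graph
    // template rather than executed:
    //
    //   stream.begin_capture();
    //   // ... enqueue kernels, copies etc. on `stream` ...
    //   auto graph_template = stream.end_capture();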

protected: // constructor

    stream_t(
        device::id_t device_id,
        context::handle_t context_handle,
        stream::handle_t stream_handle,
        bool take_ownership = false,
        bool hold_primary_context_refcount_unit = false) noexcept
    :
        device_id_(device_id),
        context_handle_(context_handle),
        handle_(stream_handle),
        owning_(take_ownership),
        holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
    { }

public: // constructors and destructor

    // Streams cannot be copied, despite our allowing non-owning class instances.
    // The reason is that we might inadvertently make a copy of an owning stream,
    // creating a non-owning stream and letting the original owning stream go out
    // of scope - thus destructing the object, and destroying the underlying CUDA
    // object. Essentially, that would be like passing a reference to a local
    // variable - which we may not do.
    stream_t(const stream_t& other) = delete;

    stream_t(stream_t&& other) noexcept :
        stream_t(other.device_id_, other.context_handle_, other.handle_, other.owning_, other.holds_pc_refcount_unit_)
    {
        other.owning_ = false;
        other.holds_pc_refcount_unit_ = false;
    }

    ~stream_t() noexcept(false)
    {
        if (owning_) {
            CAW_SET_SCOPE_CONTEXT(context_handle_);
            cuStreamDestroy(handle_);
        }
        // TODO: DRY
        if (holds_pc_refcount_unit_) {
#ifdef NDEBUG
            device::primary_context::detail_::decrease_refcount_nothrow(device_id_);
            // Note: "Swallowing" any potential error to avoid ::std::terminate(); also,
            // because a failure probably means the primary context is inactive already
#else
            device::primary_context::detail_::decrease_refcount(device_id_);
#endif
        }
    }

public: // operators

    stream_t& operator=(const stream_t& other) = delete;

    stream_t& operator=(stream_t&& other) noexcept
    {
        ::std::swap(device_id_, other.device_id_);
        ::std::swap(context_handle_, other.context_handle_);
        ::std::swap(handle_, other.handle_);
        ::std::swap(owning_, other.owning_);
        ::std::swap(holds_pc_refcount_unit_, other.holds_pc_refcount_unit_);
        return *this;
    }

public: // friendship

    friend stream_t stream::wrap(
        device::id_t device_id,
        context::handle_t context_handle,
        stream::handle_t stream_handle,
        bool take_ownership,
        bool hold_pc_refcount_unit) noexcept;

    friend inline bool operator==(const stream_t& lhs, const stream_t& rhs) noexcept
    {
        return
            lhs.context_handle_ == rhs.context_handle_
#ifndef NDEBUG
            and lhs.device_id_ == rhs.device_id_
#endif
            and lhs.handle_ == rhs.handle_;
    }

protected: // data members
    device::id_t device_id_;
    context::handle_t context_handle_;
    stream::handle_t handle_;
    bool owning_;
    bool holds_pc_refcount_unit_;
    // When context_handle_ is the handle of a primary context, this stream may
    // be "keeping that context alive" through the refcount - in which case
    // it must release its refcount unit on destruction

public: // data members - which only exist in lieu of namespaces

    const enqueue_t enqueue { *this };
    // The use of *this here is safe, since enqueue_t doesn't do anything with it
    // on its own. Any use of enqueue only happens through, well, *this - and
    // after construction.
};

inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept
{
    return not (lhs == rhs);
}

namespace stream {

inline stream_t wrap(
    device::id_t device_id,
    context::handle_t context_handle,
    stream::handle_t stream_handle,
    bool take_ownership,
    bool hold_pc_refcount_unit) noexcept
{
    return { device_id, context_handle, stream_handle, take_ownership, hold_pc_refcount_unit };
}

namespace detail_ {

inline stream_t create(
    device::id_t device_id,
    context::handle_t context_handle,
    bool synchronizes_with_default_stream,
    priority_t priority = stream::default_priority,
    bool hold_pc_refcount_unit = false)
{
    CAW_SET_SCOPE_CONTEXT(context_handle);
    auto new_stream_handle = cuda::stream::detail_::create_raw_in_current_context(
        synchronizes_with_default_stream, priority);
    return wrap(device_id, context_handle, new_stream_handle, do_take_ownership, hold_pc_refcount_unit);
}

template<>
inline CUresult wait_on_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
    return cuStreamWaitValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult wait_on_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
    return cuStreamWaitValue64(stream_handle, address, value, flags);
}

template<>
inline CUresult write_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
    return cuStreamWriteValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
    return cuStreamWriteValue64(stream_handle, address, value, flags);
}

template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void* argument)
{
    CAW_SET_SCOPE_CONTEXT(stream.context_handle());

    // Note: for host_invokable(), the function registered here is the static
    // stream_launched_invoker<> defined in stream_t::enqueue_t - which, when
    // run, invokes the actual invokable object we were passed.
#if CUDA_VERSION >= 10000
    auto status = cuLaunchHostFunc(stream.handle(), function, argument);
    // Could have used the equivalent Runtime API call: cudaLaunchHostFunc()
#else
    // cuStreamAddCallback requires passing 0 as its flags argument (at least
    // up to CUDA v10.2); see:
    // http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
    static constexpr const unsigned fixed_flags { 0u };
    auto status = cuStreamAddCallback(stream.handle(), function, argument, fixed_flags);
#endif
    throw_if_error_lazy(status, "Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
}

} // namespace detail_

/// Create a new stream (= work queue) on the given device
stream_t create(
    const device_t& device,
    bool synchronizes_with_default_stream,
    priority_t priority = stream::default_priority);

/// Create a new stream (= work queue) within the given context
stream_t create(
    const context_t& context,
    bool synchronizes_with_default_stream,
    priority_t priority = stream::default_priority,
    bool hold_pc_refcount_unit = false);

#if CUDA_VERSION >= 10000
namespace capture {

inline state_t state(const stream_t& stream)
{
    context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
    CUstreamCaptureStatus capture_status;
    auto op_status = cuStreamIsCapturing(stream.handle(), &capture_status);
    throw_if_error_lazy(op_status, "Failed obtaining the capture status of " + stream::detail_::identify(stream));
    return static_cast<state_t>(capture_status);
}

inline void begin(const cuda::stream_t& stream, stream::capture::mode_t mode)
{
    context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
    auto status = cuStreamBeginCapture(stream.handle(), static_cast<CUstreamCaptureMode>(mode));
    throw_if_error_lazy(status, "Failed beginning to capture on " + stream::detail_::identify(stream));
}

} // namespace capture
#endif // CUDA_VERSION >= 10000

} // namespace stream

/// Waits for all previously-scheduled work on the given stream to conclude,
/// before returning control to the calling host thread
inline void synchronize(const stream_t& stream)
{
    // Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
    // and not have trouble acting on a stream in another context - it balks at doing so under
    // certain conditions, so we must place ourselves in the stream's context.
    CAW_SET_SCOPE_CONTEXT(stream.context_handle());
    auto status = cuStreamSynchronize(stream.handle());
    throw_if_error_lazy(status, "Failed synchronizing " + stream::detail_::identify(stream));
}
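
// Usage sketch (illustrative): assuming `stream` is a stream_t, the calling
// host thread can wait for all of the stream's enqueued work with either of:
//
//   cuda::synchronize(stream);
//   stream.synchronize();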

#if CUDA_VERSION >= 11000
/// Overwrite all attributes of the destination stream with those of the source stream
void copy_attributes(const stream_t& dest, const stream_t& src);
#endif // CUDA_VERSION >= 11000

} // namespace cuda

#endif // CUDA_API_WRAPPERS_STREAM_HPP_