cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
stream.hpp
Go to the documentation of this file.
1 
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_STREAM_HPP_
11 #define CUDA_API_WRAPPERS_STREAM_HPP_
12 
13 #include "current_context.hpp"
14 #include "current_device.hpp"
15 #include "error.hpp"
16 #include "kernel_launch.hpp"
17 #include "memory.hpp"
18 #include "miscellany.hpp"
19 #include "types.hpp"
20 
21 #if CUDA_VERSION >= 10000
23 #endif // CUDA_VERSION >= 10000
24 
25 #include <string>
26 #include <memory>
27 #include <utility>
28 #include <tuple>
29 #include <algorithm>
30 
31 namespace cuda {
32 
34 class device_t;
35 class event_t;
36 class stream_t;
38 
39 namespace memory {
40 
41 class pool_t;
42 
43 } // namespace memory
44 
45 namespace stream {
46 
// Aliases for the boolean "does the new stream synchronize with the default
// stream?" parameter of stream-creation functions (e.g. the second argument
// to create_on_current_device())
enum : bool {
	implicitly_synchronizes_with_default_stream = true,
	no_implicit_synchronization_with_default_stream = false,
	sync = implicitly_synchronizes_with_default_stream,
	async = no_implicit_synchronization_with_default_stream,
	blocking = sync,
	nonblocking = async,
};
56 
62 enum wait_condition_t : unsigned {
63  greater_or_equal_to = CU_STREAM_WAIT_VALUE_GEQ,
64  geq = CU_STREAM_WAIT_VALUE_GEQ,
65 
66  equality = CU_STREAM_WAIT_VALUE_EQ,
67  equals = CU_STREAM_WAIT_VALUE_EQ,
68 
69  nonzero_after_applying_bitmask = CU_STREAM_WAIT_VALUE_AND,
70  one_bits_overlap = CU_STREAM_WAIT_VALUE_AND,
71  bitwise_and = CU_STREAM_WAIT_VALUE_AND,
72 
73  zero_bits_overlap = CU_STREAM_WAIT_VALUE_NOR,
74  bitwise_nor = CU_STREAM_WAIT_VALUE_NOR,
75 } ;
76 
77 
78 #if CUDA_VERSION >= 11000
79 
83 enum synchronization_policy_t : typename ::std::underlying_type<CUsynchronizationPolicy>::type {
87  automatic = CU_SYNC_POLICY_AUTO,
88 
97  spin = CU_SYNC_POLICY_SPIN,
98 
108  yield = CU_SYNC_POLICY_YIELD,
109 
116  block = CU_SYNC_POLICY_BLOCKING_SYNC
117 };
118 #endif // CUDA_VERSION >= 11000
119 
120 namespace detail_ {
121 
122 ::std::string identify(const stream_t& stream);
123 
124 inline handle_t create_raw_in_current_context(
125  bool synchronizes_with_default_stream,
127 )
128 {
129  const unsigned int flags = (synchronizes_with_default_stream == sync) ?
130  CU_STREAM_DEFAULT : CU_STREAM_NON_BLOCKING;
131  handle_t new_stream_handle;
132  auto status = cuStreamCreateWithPriority(&new_stream_handle, flags, priority);
133  throw_if_error_lazy(status, "Failed creating a new stream in " + detail_::identify(new_stream_handle));
134  return new_stream_handle;
135 }
136 
// Destroy the given stream within the given context, returning the raw driver
// status instead of throwing - suitable for use from destructors
inline status_t destroy_nothrow(handle_t handle, context::handle_t context_handle)
{
	CAW_SET_SCOPE_CONTEXT(context_handle);
	return cuStreamDestroy(handle);
}
142 
// Destroy the given stream, throwing a cuda::runtime_error on failure; the
// device id is only used for composing the error message
inline void destroy(handle_t handle, context::handle_t context_handle, device::id_t device_id)
{
	auto status = destroy_nothrow(handle, context_handle);
	throw_if_error_lazy(status, "Failed destroying " + identify(handle, context_handle, device_id));
}
148 
149 #if CUDA_VERSION >= 9020
150 inline context::handle_t context_handle_of(stream::handle_t stream_handle)
151 {
152  context::handle_t handle;
153  auto result = cuStreamGetCtx(stream_handle, &handle);
154  throw_if_error_lazy(result, "Failed obtaining the context of " + cuda::detail_::ptr_as_hex(stream_handle));
155  return handle;
156 }
157 #endif // CUDA_VERSION >= 9020
158 
159 
169 inline device::id_t device_id_of(stream::handle_t stream_handle);
170 
171 inline void record_event_in_current_context(
172  device::id_t current_device_id,
173  context::handle_t current_context_handle_,
174  stream::handle_t stream_handle,
175  event::handle_t event_handle);
176 
177 template <typename Function>
178 void enqueue_function_call(const stream_t& stream, Function function, void * argument);
179 
180 } // namespace detail_
181 
203 stream_t wrap(
204  device::id_t device_id,
205  context::handle_t context_handle,
206  handle_t stream_handle,
207  bool take_ownership = false,
208  bool hold_pc_refcount_unit = false) noexcept;
209 
210 namespace detail_ {
211 
212 // Providing the same signature to multiple CUDA driver calls, to allow
213 // uniform templated use of all of them
214 template<typename T>
215 CUresult wait_on_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);
216 
217 // Providing the same signature to multiple CUDA driver calls, to allow
218 // uniform templated use of all of them
219 template<typename T>
220 CUresult write_value(CUstream stream_handle, CUdeviceptr address, T value, unsigned int flags);
221 
222 } // namespace detail_
223 
224 #if CUDA_VERSION >= 10000
225 namespace capture {
226 
227 inline state_t state(const stream_t& stream);
228 
235 void begin(const cuda::stream_t& stream, stream::capture::mode_t mode = cuda::stream::capture::mode_t::global);
236 graph::template_t end(const cuda::stream_t& stream);
237 
238 } // namespace capture
239 
/// Determine whether the given stream is currently in the midst of capturing
/// scheduled work (into a graph)
inline bool is_capturing(const stream_t& stream)
{
	return is_capturing(stream::capture::state(stream));
}
244 
245 #endif // CUDA_VERSION >= 10000
246 } // namespace stream
247 
248 inline void synchronize(const stream_t& stream);
249 
258 class stream_t {
259 
260 public: // type definitions
261 
	/// Named values for the "synchronizes with the default stream" boolean
	/// creation parameter
	enum : bool {
	doesnt_synchronizes_with_default_stream = false,
	does_synchronize_with_default_stream = true,
	};
266 
267 public: // const getters
269  stream::handle_t handle() const noexcept { return handle_; }
270 
272  context::handle_t context_handle() const noexcept { return context_handle_; }
273 
275  device::id_t device_id() const noexcept { return device_id_; }
276 
278  device_t device() const noexcept;
279 
281  context_t context() const noexcept;
282 
284  bool is_owning() const noexcept { return owning_; }
285 
286 public: // other non-mutators
287 
294  {
295  unsigned int flags;
296  auto status = cuStreamGetFlags(handle_, &flags);
297  // Could have used the equivalent Driver API call,
298  // cuStreamGetFlags(handle_, &flags);
299  throw_if_error_lazy(status, "Failed obtaining flags for a stream in "
300  + context::detail_::identify(context_handle_, device_id_));
301  return flags & CU_STREAM_NON_BLOCKING;
302  }
303 
307  {
308  int the_priority;
309  auto status = cuStreamGetPriority(handle_, &the_priority);
310  // Could have used the equivalent Runtime API call:
311  // cuStreamGetPriority(handle_, &the_priority);
312  throw_if_error_lazy(status, "Failed obtaining priority for a stream in "
313  + context::detail_::identify(context_handle_, device_id_));
314  return the_priority;
315  }
316 
	/// Determines whether all work previously scheduled on this stream has
	/// concluded: returns false when the stream's queue has been fully drained,
	/// true when work is still pending or executing; any driver status other
	/// than success / not-ready is turned into an exception.
	bool has_work_remaining() const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		auto status = cuStreamQuery(handle_);
		// Could have used the equivalent runtime API call:
		// cudaStreamQuery(handle_);
		switch(status) {
		case CUDA_SUCCESS:
			return false;
		case CUDA_ERROR_NOT_READY:
			return true;
		default:
			throw cuda::runtime_error(static_cast<cuda::status::named_t>(status),
				"unexpected stream status for " + stream::detail_::identify(handle_, device_id_));
		}
	}
343 
350  bool is_clear() const { return !has_work_remaining(); }
351 
356  bool query() const { return is_clear(); }
357 
358 public: // mutators
359 
367  class enqueue_t {
368  protected:
369  const stream_t& associated_stream;
370 
371  public:
373  enqueue_t(const stream_t& stream) : associated_stream(stream) {}
375 
387  template<typename KernelFunction, typename... KernelParameters>
389  const KernelFunction& kernel_function,
390  launch_configuration_t launch_configuration,
391  KernelParameters &&... parameters) const
392  {
393  return cuda::enqueue_launch(
394  kernel_function,
395  associated_stream,
396  launch_configuration,
397  ::std::forward<KernelParameters>(parameters)...);
398  }
399 
414  const kernel_t& kernel,
415  launch_configuration_t launch_configuration,
416  span<const void*> marshalled_arguments) const
417  {
418  cuda::launch_type_erased(kernel, associated_stream, launch_configuration, marshalled_arguments);
419  }
420 
421 #if CUDA_VERSION >= 10000
422 
429  void graph_launch(const graph::instance_t& graph_instance) const;
430 #endif // CUDA_VERSION >= 10000
431 
	/// Schedule a copy of @p num_bytes bytes from @p source to @p destination on
	/// this stream; either pointer may refer to host or device memory.
	void copy(void *destination, const void *source, size_t num_bytes) const
	{
		// CUDA doesn't seem to need us to be in the stream's context to enqueue the copy;
		// however, unfortunately, it does require us to be in _some_ context.
		context::current::detail_::scoped_ensurer_t ensure_we_have_a_current_scope{associated_stream.context_handle_};
		memory::detail_::copy(destination, source, num_bytes, associated_stream.handle_);
	}
448 
	/// Schedule a copy of @p num_bytes from region @p source to @p destination;
	/// in debug builds, verifies the source region is large enough
	void copy(void* destination, memory::const_region_t source, size_t num_bytes) const
	{
#ifndef NDEBUG
		if (source.size() < num_bytes) {
			throw ::std::logic_error("Attempt to copy more than the source region's size");
		}
#endif
		copy(destination, source.start(), num_bytes);
	}
459 
465  void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
466  {
467  copy(destination.start(), source, num_bytes);
468  }
469 
	/// Schedule a copy of the entire @p source region into @p destination
	void copy(memory::region_t destination, memory::const_region_t source) const
	{
		copy(destination, source, source.size());
	}

	/// Schedule a copy of the entire @p source region to the memory starting
	/// at @p destination
	void copy(void* destination, memory::const_region_t source) const
	{
		copy(destination, source, source.size());
	}
481 
483 
	/// Schedule setting each of the @p num_bytes bytes starting at @p start to
	/// @p byte_value, on this stream (via the device-memory set facility)
	void memset(void *start, int byte_value, size_t num_bytes) const
	{
		// Is it necessary to set the device? I wonder.
		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
		memory::device::detail_::set(start, byte_value, num_bytes, associated_stream.handle_);
	}

	/// Schedule setting every byte of @p region to @p byte_value, on this stream
	void memset(memory::region_t region, int byte_value) const
	{
		memset(region.data(), byte_value, region.size());
	}

	/// Schedule zeroing-out the @p num_bytes bytes starting at @p start,
	/// on this stream
	void memzero(void *start, size_t num_bytes) const
	{
		CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
		memory::device::detail_::zero(start, num_bytes, associated_stream.handle_);
	}

	/// Schedule zeroing-out all bytes of @p region, on this stream
	void memzero(memory::region_t region) const
	{
		memzero(region.data(), region.size());
	}
529 
542  event_t& event(event_t& existing_event) const;
543 
556  event_t event(
557  bool uses_blocking_sync = event::sync_by_busy_waiting,
558  bool records_timing = event::do_record_timings,
560 
561 # if CUDA_VERSION >= 10000
562 
	/// Schedule the invocation, on the host, of @p function with @p argument,
	/// once all previously-scheduled work on this stream has concluded
	/// (requires CUDA 10.0+)
	template <typename Argument>
	void host_function_call(void (*function)(Argument*), Argument* argument) const
	{
		// I hope you like function declaration punning :-)
		// (the typed function pointer is reinterpreted as the driver's
		// void(void*) callback signature)
		stream::detail_::enqueue_function_call(
			associated_stream, reinterpret_cast<stream::callback_t>(function), argument);
	}
#endif

	private:
	// Static trampoline with the driver's host-function signature: recovers the
	// Invokable from the type-erased pointer and invokes it
	template <typename Invokable>
	static void CUDA_CB stream_launched_invoker(void* type_erased_invokable) {
		auto invokable = reinterpret_cast<Invokable*>(type_erased_invokable);
		(*invokable)();
	}

	public:
	/// Schedule the host-side invocation of @p invokable once all
	/// previously-scheduled work on this stream has concluded.
	/// @note the invokable is passed by address, so it must remain alive until
	/// the scheduled invocation actually runs
	template <typename Invokable>
	void host_invokable(Invokable& invokable) const
	{
		auto type_erased_invoker = reinterpret_cast<stream::callback_t>(stream_launched_invoker<Invokable>);
		stream::detail_::enqueue_function_call(associated_stream, type_erased_invoker, &invokable);
	}
593 
594 #if CUDA_VERSION >= 11020
595 
	/// Schedule an allocation of device-side memory on this stream
	/// (requires CUDA 11.2+, stream-ordered memory allocation)
	///
	/// @returns the region which will have been allocated once the scheduled
	/// operation executes
	memory::region_t allocate(size_t num_bytes) const
	{
		return memory::device::allocate(num_bytes, associated_stream);
	}

	/// Schedule an allocation of @p num_bytes from the given memory pool,
	/// on this stream
	memory::region_t allocate(const memory::pool_t& pool, size_t num_bytes) const;

	/// Schedule freeing the memory starting at @p region_start, after all
	/// previously-scheduled work on this stream
	void free(void* region_start) const
	{
		memory::device::free(region_start, associated_stream);
	}

	/// Schedule freeing the memory of @p region; see free(void*)
	void free(memory::region_t region) const
	{
		memory::device::free(region, associated_stream);
	}
629 #endif // CUDA_VERSION >= 11020
630 
652  const void* managed_region_start,
653  memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
654  {
655  CAW_SET_SCOPE_CONTEXT(associated_stream.context_handle_);
656  // This fixed value is required by the CUDA Runtime API,
657  // to indicate that the entire memory region, rather than a part of it, will be
658  // attached to this stream
659  constexpr const size_t length = 0;
660  auto flags = static_cast<unsigned>(attachment);
661  auto status = cuStreamAttachMemAsync(
662  associated_stream.handle_, memory::device::address(managed_region_start), length, flags);
663  // Could have used the equivalent Driver API call cuStreamAttachMemAsync
664  throw_if_error_lazy(status, "Failed scheduling an attachment of a managed memory region on "
665  + stream::detail_::identify(associated_stream.handle_, associated_stream.context_handle_,
666  associated_stream.device_id_));
667  }
668 
689  memory::region_t region,
690  memory::managed::attachment_t attachment = memory::managed::attachment_t::single_stream) const
691  {
692  attach_managed_region(region.start(), attachment);
693  }
694 
707  void wait(const event_t& event_) const;
708 
720  template <typename T>
721  void set_single_value(T* __restrict__ ptr, T value, bool with_memory_barrier = true) const
722  {
723  static_assert(
724  ::std::is_same<T,uint32_t>::value or ::std::is_same<T,uint64_t>::value,
725  "Unsupported type for stream value wait."
726  );
727  unsigned flags = with_memory_barrier ?
728  CU_STREAM_WRITE_VALUE_DEFAULT :
729  CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER;
730  auto result = static_cast<status_t>(
731  stream::detail_::write_value(associated_stream.handle_, memory::device::address(ptr), value, flags));
732  throw_if_error_lazy(result, "Failed scheduling a write to global memory on "
733  + stream::detail_::identify(associated_stream.handle_,associated_stream.context_handle_,
734  + associated_stream.device_id_));
735  }
736 
751  template <typename T>
752  void wait(const T* address, stream::wait_condition_t condition, T value, bool with_memory_barrier = false) const
753  {
754  static_assert(
755  ::std::is_same<T,int32_t>::value or ::std::is_same<T,int64_t>::value,
756  "Unsupported type for stream value wait."
757  );
758  unsigned flags = static_cast<unsigned>(condition) |
759  (with_memory_barrier ? CU_STREAM_WAIT_VALUE_FLUSH : 0);
760  auto result = static_cast<status_t>(
761  stream::detail_::wait_on_value(associated_stream.handle_, address, value, flags));
762  throw_if_error_lazy(result,
763  "Failed scheduling a wait on global memory address on "
764  + stream::detail_::identify(
765  associated_stream.handle_,
766  associated_stream.context_handle_,
767  associated_stream.device_id_) );
768  }
769 
	/// Enqueue a flush of outstanding remote writes, as a single-operation batch
	/// (CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES); see the CUDA driver documentation
	/// of cuStreamBatchMemOp for the exact semantics
	void flush_remote_writes() const
	{
		CUstreamBatchMemOpParams op_params;
		op_params.flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
		op_params.flushRemoteWrites.flags = 0;
		static const unsigned count = 1;
		static const unsigned flags = 0;
		// Let's cross our fingers and assume nothing else needs to be set here...
		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
		throw_if_error_lazy(status, "scheduling a flush-remote-writes memory operation as a 1-op batch");
	}
785 
#if CUDA_VERSION >= 11070
	/// Enqueue a memory barrier (CU_STREAM_MEM_OP_BARRIER) with the given scope,
	/// as a single-operation batch; requires CUDA 11.7+
	void memory_barrier(memory::barrier_scope_t scope) const
	{
		CUstreamBatchMemOpParams op_params;
		op_params.memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
		op_params.memoryBarrier.flags = static_cast<unsigned>(scope);
		static const unsigned count = 1;
		static const unsigned flags = 0;
		// Let's cross our fingers and assume nothing else needs to be set here...
		auto status = cuStreamBatchMemOp(associated_stream.handle_, count, &op_params, flags);
		throw_if_error_lazy(status, "scheduling a memory barrier operation as a 1-op batch");
	}
#endif
799 
813  template <typename Iterator>
814  void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
815  {
816  static_assert(
817  ::std::is_same<typename ::std::iterator_traits<Iterator>::value_type, CUstreamBatchMemOpParams>::value,
818  "Only accepting iterator pairs for the CUDA-driver-API memory operation descriptor,"
819  " CUstreamBatchMemOpParams, as the value type");
820  auto num_ops = ::std::distance(ops_begin, ops_end);
821  if (::std::is_same<typename ::std::remove_const<decltype(ops_begin)>::type, CUstreamBatchMemOpParams* >::value,
822  "Only accepting containers of the CUDA-driver-API memory operation descriptor, CUstreamBatchMemOpParams")
823  {
824  auto ops_ptr = reinterpret_cast<const CUstreamBatchMemOpParams*>(ops_begin);
825  cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_ptr);
826  }
827  else {
828  auto ops_uptr = ::std::unique_ptr<CUstreamBatchMemOpParams[]>(new CUstreamBatchMemOpParams[num_ops]);
829  ::std::copy(ops_begin, ops_end, ops_uptr.get());
830  cuStreamBatchMemOp(associated_stream.handle_, num_ops, ops_uptr.get());
831  }
832  }
833 
	/// Enqueue a container-full of single-value memory operations in one batch;
	/// see the iterator-pair overload for details
	template <typename Container>
	void single_value_operations_batch(const Container& single_value_ops) const
	{
		return single_value_operations_batch(single_value_ops.begin(), single_value_ops.end());
	}
842 
843  }; // class enqueue_t
844 
	/// Block or busy-wait until all previously-scheduled work on this stream
	/// has been completed (delegates to the free-standing cuda::synchronize())
	void synchronize() const
	{
		cuda::synchronize(*this);
	}
853 
854 #if CUDA_VERSION >= 11000
	/// Obtain the synchronization policy attribute of this stream, i.e. how
	/// host threads are to wait for its work to conclude; requires CUDA 11.0+
	stream::synchronization_policy_t synchronization_policy() const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		CUstreamAttrValue wrapped_result{};
		auto status = cuStreamGetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_result);
		throw_if_error_lazy(status, ::std::string("Obtaining the synchronization policy of ") + stream::detail_::identify(*this));
		return static_cast<stream::synchronization_policy_t>(wrapped_result.syncPolicy);
	}

	/// Set the synchronization policy attribute of this stream; requires CUDA 11.0+
	void set_synchronization_policy(stream::synchronization_policy_t policy) const
	{
		CAW_SET_SCOPE_CONTEXT(context_handle_);
		CUstreamAttrValue wrapped_value{};
		wrapped_value.syncPolicy = static_cast<CUsynchronizationPolicy>(policy);
		auto status = cuStreamSetAttribute(handle_, CU_STREAM_ATTRIBUTE_SYNCHRONIZATION_POLICY, &wrapped_value);
		throw_if_error_lazy(status, ::std::string("Setting the synchronization policy of ") + stream::detail_::identify(*this));
	}
872 #endif
873 
874  // TODO: Create a dummy capture object, then we could have capture.start(), capture.stop(), capture.status(),
875  // and perhaps a capture_() which takes a lambda. Also offer a
876  // cuda::stream::capture(const stream_t& stream, F f) template!
877 
878 
879 #if CUDA_VERSION >= 10000
880 
	/// Begin capturing work scheduled on this stream into a CUDA graph, rather
	/// than executing it; requires CUDA 10.0+
	void begin_capture(stream::capture::mode_t mode = cuda::stream::capture::mode_t::global) const
	{
		stream::capture::begin(*this, mode);
	}

	/// Determine whether this stream is currently capturing scheduled work
	bool is_capturing() const { return stream::is_capturing(*this); }

	/// Conclude an ongoing capture on this stream
	///
	/// @returns the graph template constructed from the captured work
	graph::template_t end_capture() const
	{
		return stream::capture::end(*this);
	}
905 #endif // CUDA_VERSION >= 10000
906 
907 protected: // constructor
908 
	// Primary constructor; protected - wrapper instances are obtained via
	// stream::wrap() or the stream::create() functions. When take_ownership is
	// true, the destructor will destroy the raw stream; when
	// hold_primary_context_refcount_unit is true, the destructor will release
	// one primary-context refcount unit for the device.
	stream_t(
		device::id_t device_id,
		context::handle_t context_handle,
		stream::handle_t stream_handle,
		bool take_ownership = false,
		bool hold_primary_context_refcount_unit = false) noexcept
	:
		device_id_(device_id),
		context_handle_(context_handle),
		handle_(stream_handle),
		owning_(take_ownership),
		holds_pc_refcount_unit_(hold_primary_context_refcount_unit)
	{ }
922 
923 public: // constructors and destructor
924 
925  // Streams cannot be copied, despite our allowing non-owning class instances.
926  // The reason is that we might inadvertently copy of an owning stream, creating
927  // a non-owning stream and letting the original owning stream go out of scope -
928  // thus destructing the object, and destroying the underlying CUDA object.
929  // Essentially, that is like passing a reference to a local variable - which we
930  // may not do.
931  stream_t(const stream_t& other) = delete;
932 
	/// Move constructor: takes over ownership (and any primary-context refcount
	/// responsibility) from @p other, leaving it non-owning
	stream_t(stream_t&& other) noexcept :
		stream_t(other.device_id_, other.context_handle_, other.handle_, other.owning_, other.holds_pc_refcount_unit_)
	{
		other.owning_ = false;
		other.holds_pc_refcount_unit_ = false;
	}
939 
	/// Destructor: destroys the wrapped raw stream if this instance owns it, and
	/// releases a primary-context refcount unit if this instance holds one
	~stream_t() DESTRUCTOR_EXCEPTION_SPEC
	{
		if (owning_) {
#if THROW_IN_DESTRUCTORS
			stream::detail_::destroy(handle_, context_handle_, device_id_);
#else
			// destruction failures are swallowed: throwing is disallowed here
			stream::detail_::destroy_nothrow(handle_, context_handle_);
#endif
		}
		if (holds_pc_refcount_unit_) {
			device::primary_context::detail_::decrease_refcount_in_dtor(device_id_);
		}
	}
953 
954 public: // operators
955 
956  stream_t& operator=(const stream_t& other) = delete;
957  stream_t& operator=(stream_t&& other) noexcept
958  {
959  ::std::swap(device_id_, other.device_id_);
960  ::std::swap(context_handle_, other.context_handle_);
961  ::std::swap(handle_, other.handle_);
962  ::std::swap(owning_, other.owning_);
963  ::std::swap(holds_pc_refcount_unit_, holds_pc_refcount_unit_);
964  return *this;
965  }
966 
967 public: // friendship
968 
969  friend stream_t stream::wrap(
970  device::id_t device_id,
971  context::handle_t context_handle,
972  stream::handle_t stream_handle,
973  bool take_ownership,
974  bool hold_pc_refcount_unit) noexcept;
975 
	/// True when both proxies refer to the same raw stream handle within the
	/// same context; the device ids are additionally compared in debug builds
	/// only (they are determined by the context, so this is a sanity check)
	friend inline bool operator==(const stream_t& lhs, const stream_t& rhs) noexcept
	{
		return
			lhs.context_handle_ == rhs.context_handle_
#ifndef NDEBUG
			and lhs.device_id_ == rhs.device_id_
#endif
			and lhs.handle_ == rhs.handle_;
	}
990 
991 protected: // data members
992  device::id_t device_id_;
993  context::handle_t context_handle_;
994  stream::handle_t handle_;
995  bool owning_;
996  bool holds_pc_refcount_unit_;
997  // When context_handle_ is the handle of a primary context, this event may
998  // be "keeping that context alive" through the refcount - in which case
999  // it must release its refcount unit on destruction
1000 
1001 public: // data members - which only exist in lieu of namespaces
1002 
1005  const enqueue_t enqueue { *this };
1006  // The use of *this here is safe, since enqueue_t doesn't do anything with it
1007  // on its own. Any use of enqueue only happens through, well, *this - and
1008  // after construction.
1009 };
1010 
1012 inline bool operator!=(const stream_t& lhs, const stream_t& rhs) noexcept
1013 {
1014  return not (lhs == rhs);
1015 }
1017 
1018 namespace stream {
1019 
1021  device::id_t device_id,
1022  context::handle_t context_handle,
1023  stream::handle_t stream_handle,
1024  bool take_ownership,
1025  bool hold_pc_refcount_unit) noexcept
1026 {
1027  return { device_id, context_handle, stream_handle, take_ownership, hold_pc_refcount_unit };
1028 }
1029 
1030 namespace detail_ {
1031 
1032 inline stream_t create(
1033  device::id_t device_id,
1034  context::handle_t context_handle,
1035  bool synchronizes_with_default_stream,
1037  bool hold_pc_refcount_unit = false)
1038 {
1039  CAW_SET_SCOPE_CONTEXT(context_handle);
1040  auto new_stream_handle = cuda::stream::detail_::create_raw_in_current_context(
1041  synchronizes_with_default_stream, priority);
1042  return wrap(device_id, context_handle, new_stream_handle, do_take_ownership, hold_pc_refcount_unit);
1043 }
1044 
// Specializations of the uniform-signature declarations above, routing to the
// size-appropriate CUDA driver calls

template<>
inline CUresult wait_on_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
	return cuStreamWaitValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult wait_on_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
	return cuStreamWaitValue64(stream_handle, address, value, flags);
}


template<>
inline CUresult write_value<uint32_t>(CUstream stream_handle, CUdeviceptr address, uint32_t value, unsigned int flags)
{
	return cuStreamWriteValue32(stream_handle, address, value, flags);
}

template<>
inline CUresult write_value<uint64_t>(CUstream stream_handle, CUdeviceptr address, uint64_t value, unsigned int flags)
{
	return cuStreamWriteValue64(stream_handle, address, value, flags);
}
1069 
// Schedule the invocation of a host-side function on the given stream, after
// all previously-scheduled work on it has concluded; `function` must have the
// driver's host-callback signature, and is passed `argument` when invoked
template <typename Function>
void enqueue_function_call(const stream_t& stream, Function function, void* argument)
{
	CAW_SET_SCOPE_CONTEXT(stream.context_handle());

	// Pre-CUDA-10 we must use the older stream-callback mechanism rather than
	// the host-function launch mechanism.

#if CUDA_VERSION >= 10000
	auto status = cuLaunchHostFunc(stream.handle(), function, argument);
	// Could have used the equivalent Runtime API call: cudaLaunchHostFunc()
#else
	// The nVIDIA runtime API (at least up to v10.2) requires passing 0 as the flags
	// variable, see:
	// http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html
	static constexpr const unsigned fixed_flags { 0u };
	auto status = cuStreamAddCallback(stream.handle(), function, argument, fixed_flags);
#endif
	throw_if_error_lazy(status, "Failed enqueuing a host function/invokable to be launched on " + stream::detail_::identify(stream));
}
1090 
1091 } // namespace detail_
1092 
1106 stream_t create(
1107  const device_t& device,
1108  bool synchronizes_with_default_stream,
1110 
1126 stream_t create(
1127  const context_t& context,
1128  bool synchronizes_with_default_stream,
1130  bool hold_pc_refcount_unit = false);
1132 
1133 #if CUDA_VERSION >= 10000
1134 namespace capture {
1135 
1136 inline state_t state(const stream_t& stream)
1137 {
1138  context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
1139  CUstreamCaptureStatus capture_status;
1140  auto op_status = cuStreamIsCapturing(stream.handle(), &capture_status);
1141  throw_if_error_lazy(op_status, "Failed beginning to capture on " + stream::detail_::identify(stream));
1142  return static_cast<state_t>(capture_status);
1143 }
1144 
/// Start capturing work scheduled on @p stream into a graph, using the
/// specified capture mode (wraps cuStreamBeginCapture)
inline void begin(const cuda::stream_t& stream, stream::capture::mode_t mode)
{
	context::current::detail_::scoped_override_t set_context_for_this_scope(stream.context_handle());
	auto status = cuStreamBeginCapture(stream.handle(), static_cast<CUstreamCaptureMode>(mode));
	throw_if_error_lazy(status, "Failed beginning to capture on " + stream::detail_::identify(stream));
}
1151 
1152 } // namespace capture
1153 #endif // CUDA_VERSION >= 10000
1154 
1155 } // namespace stream
1156 
/// Block the calling host thread until all previously-scheduled work on
/// @p stream has concluded; throws on failure
inline void synchronize(const stream_t& stream)
{
	// Note: Unfortunately, even though CUDA should be aware of which context a stream belongs to,
	// and not have trouble acting on a stream in another context - it balks at doing so under
	// certain conditions, so we must place ourselves in the stream's context.
	CAW_SET_SCOPE_CONTEXT(stream.context_handle());
	auto status = cuStreamSynchronize(stream.handle());
	throw_if_error_lazy(status, "Failed synchronizing " + stream::detail_::identify(stream));
}
1175 
1176 #if CUDA_VERSION >= 11000
1177 
1188 void copy_attributes(const stream_t& dest, const stream_t& src);
1189 #endif // CUDA_VERSION >= 11000
1190 
1191 } // namespace cuda
1192 
1193 #endif // CUDA_API_WRAPPERS_STREAM_HPP_
void memset(void *start, int byte_value, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:492
bool has_work_remaining() const
Determines whether all work on this stream has been completed.
Definition: stream.hpp:327
void copy(void *destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:450
bool query() const
An alias for is_clear() - to conform to how the CUDA runtime API names this functionality.
Definition: stream.hpp:356
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:272
Proxy class for a CUDA stream.
Definition: stream.hpp:258
void memzero(memory::region_t region) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:525
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:269
Wrapper class for a CUDA context.
Definition: context.hpp:249
void synchronize() const
Block or busy-wait until all previously-scheduled work on this stream has been completed.
Definition: stream.hpp:849
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:243
device::id_t count()
Get the number of CUDA devices usable on the system (with the current CUDA library and kernel driver)...
Definition: miscellany.hpp:63
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
void single_value_operations_batch(const Container &single_value_ops) const
Definition: stream.hpp:838
friend bool operator==(const stream_t &lhs, const stream_t &rhs) noexcept
Definition: stream.hpp:981
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
void copy(void *destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:477
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
Wrapper class for a CUDA event.
Definition: event.hpp:147
A gadget through which commands are enqueued on the stream.
Definition: stream.hpp:367
region_t allocate(const context_t &context, size_t size_in_bytes)
Allocate device-side memory on a CUDA device context.
Definition: memory.hpp:102
void wait(const T *address, stream::wait_condition_t condition, T value, bool with_memory_barrier=false) const
Wait for a value in device global memory to change so as to meet some condition.
Definition: stream.hpp:752
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
void wait(const event_t &event)
Have the calling thread wait - either busy-waiting or blocking - and return only after this event has...
Definition: event.hpp:457
void free(void *ptr)
Free a region of device-side memory (regardless of how it was allocated)
Definition: memory.hpp:126
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:214
void attach_managed_region(const void *managed_region_start, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:651
bool is_clear() const
The opposite of has_work_remaining()
Definition: stream.hpp:350
void enqueue_launch(Kernel &&kernel, const stream_t &stream, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Enqueues a kernel on a stream (=queue) on the current CUDA device.
Definition: kernel_launch.hpp:25
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
void copy(span< T > destination, c_array< const T, N > const &source, optional_ref< const stream_t > stream={})
Copy the contents of a C-style array into a span of same-type elements.
Definition: memory.hpp:627
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
void kernel_launch(const KernelFunction &kernel_function, launch_configuration_t launch_configuration, KernelParameters &&... parameters) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:388
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:980
wait_condition_t
Kinds of conditions to apply to a value in GPU global memory when waiting on that value...
Definition: stream.hpp:62
CUstreamCallback callback_t
The CUDA driver's raw handle for a host-side callback function.
Definition: types.hpp:254
void single_value_operations_batch(Iterator ops_begin, Iterator ops_end) const
Enqueue multiple single-value write, wait and flush operations to the device (avoiding the overhead o...
Definition: stream.hpp:814
bool synchronizes_with_default_stream() const
When true, work running in the created stream may run concurrently with work in stream 0 (the NULL st...
Definition: stream.hpp:293
stream::priority_t priority() const
Definition: stream.hpp:306
stream_t wrap(device::id_t device_id, context::handle_t context_handle, handle_t stream_handle, bool take_ownership=false, bool hold_pc_refcount_unit=false) noexcept
Wrap an existing stream in a stream_t instance.
Definition: stream.hpp:1020
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we&#39;ve failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
void copy(memory::region_t destination, memory::const_region_t source) const
Copy operations.
Definition: stream.hpp:471
Variadic, chevron-less wrappers for the CUDA kernel launch mechanism.
Wrappers for getting and setting CUDA's choice of which device is 'current'.
detail_::region_helper< memory::const_region_t > const_region_t
A child class of the generic const_region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1976
void memset(memory::region_t region, int byte_value) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:500
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
address_t address(const void *device_ptr) noexcept
Definition: types.hpp:684
void launch_type_erased(const kernel_t &kernel, const stream_t &stream, launch_configuration_t launch_configuration, SpanOfConstVoidPtrLike marshalled_arguments)
Launch a kernel with the arguments pre-marshalled into the (main) form which the CUDA driver's launch...
Definition: kernel_launch.hpp:413
Miscellaneous functionality which does not fit in another file, and does not depend on the main proxy...
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:236
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
A non-owning wrapper for CUDA kernels - whether they be __global__ functions compiled apriori...
Definition: kernel.hpp:159
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:246
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
void flush_remote_writes() const
Guarantee all remote writes to the specified address are visible to subsequent operations scheduled o...
Definition: stream.hpp:774
void attach_managed_region(memory::region_t region, memory::managed::attachment_t attachment=memory::managed::attachment_t::single_stream) const
Sets the attachment of a region of managed memory (i.e.
Definition: stream.hpp:688
Can only be used by the process which created it.
Definition: constants.hpp:95
void type_erased_kernel_launch(const kernel_t &kernel, launch_configuration_t launch_configuration, span< const void *> marshalled_arguments) const
Schedule a kernel launch on the associated stream.
Definition: stream.hpp:413
void copy(memory::region_t destination, memory::const_region_t source, size_t num_bytes) const
Copy operations.
Definition: stream.hpp:465
void set_single_value(T *__restrict__ ptr, T value, bool with_memory_barrier=true) const
Schedule writing a single value to global device memory after all previous work has concluded...
Definition: stream.hpp:721
Wrapper class for a CUDA device.
Definition: device.hpp:135
void memzero(void *start, size_t num_bytes) const
Set all bytes of a certain region in device memory (or unified memory, but using the CUDA device to d...
Definition: stream.hpp:516
Fundamental CUDA-related type definitions.
void host_invokable(Invokable &invokable) const
Enqueues a host-invokable object, typically a function or closure object call.
Definition: stream.hpp:588
freestanding wrapper functions for working with CUDA&#39;s various kinds of memory spaces, arranged into a relevant namespace hierarchy.
bool is_owning() const noexcept
True if this wrapper is responsible for telling CUDA to destroy the stream upon the wrapper&#39;s own des...
Definition: stream.hpp:284
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
device::id_t device_id() const noexcept
The raw CUDA ID for the device w.r.t. which the stream is defined.
Definition: stream.hpp:275
void zero(void *start, size_t num_bytes, optional_ref< const stream_t > stream={})
Sets all bytes in a region of memory to 0 (zero)
Definition: memory.hpp:418
attachment_t
Kinds of managed memory region attachments.
Definition: memory.hpp:1989
An implementation of a subclass of kernel_t for kernels compiled together with the host-side program...