MuGdxy/muda/launch__base_8h_source.html

 #pragma once
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #include <device_launch_parameters.h>

 #include <string>
 #include <functional>
 #include <memory>
 #include <cooperative_groups.h>

 #include <cuda_profiler_api.h>
 #include <nvtx3/nvToolsExt.h>
 #include <nvtx3/nvToolsExtCuda.h>
 #include <muda/type_traits/type_modifier.h>
 #include <muda/tools/launch_info_cache.h>

 #include <muda/check/check_cuda_errors.h>
 #include <muda/muda_def.h>
 #include <muda/launch/event.h>
 #include <muda/launch/kernel_tag.h>

 namespace muda
 {
 namespace details
 {
     inline void stream_error_callback(cudaStream_t stream, cudaError error, void* userdata)
     {
         auto callback =
             reinterpret_cast<std::function<void(cudaStream_t, cudaError)>*>(userdata);
         (*callback)(stream, error);
         delete callback;
     }
 }  // namespace details

 class ComputeGraphVarBase;

 template <typename T>
 class ComputeGraphVar;

 class LaunchCore
 {
   protected:
     template <typename T>
     using S = std::shared_ptr<T>;
     MUDA_GENERIC ::cudaStream_t stream() const { return m_stream; }

     ::cudaStream_t m_stream;
     MUDA_HOST void pop_kernel_name();

   public:
     static void             kernel_name(std::string_view name);
     static std::string_view kernel_name();

     MUDA_GENERIC LaunchCore(::cudaStream_t stream) MUDA_NOEXCEPT;

     void init_stream(::cudaStream_t s) { m_stream = s; }

     void push_range(const std::string& name);
     void pop_range();

     void record(cudaEvent_t e, int flag = cudaEventRecordDefault);
     void record(ComputeGraphVar<cudaEvent_t>&            e,
                 const std::vector<ComputeGraphVarBase*>& vars);
     template <typename... ViewT>
     void record(ComputeGraphVar<cudaEvent_t>& e, ComputeGraphVar<ViewT>&... vars);
     void when(cudaEvent_t e, int flag = cudaEventWaitDefault);
     // let the host wait for the event
     void wait(cudaEvent_t e, int flag = cudaEventWaitDefault);
     void wait(const ComputeGraphVar<cudaEvent_t>&      e,
               const std::vector<ComputeGraphVarBase*>& vars);
     template <typename... ViewT>
     void wait(const ComputeGraphVar<cudaEvent_t>& e, ComputeGraphVar<ViewT>&... vars);
     void wait();
     void callback(const std::function<void(::cudaStream_t, ::cudaError)>& callback);

     static void wait_event(cudaEvent_t event);
     static void wait_stream(::cudaStream_t stream);
     static void wait_device();

     ~LaunchCore() MUDA_NOEXCEPT;
 };

 template <typename T>
 class LaunchBase : public LaunchCore
 {
     template <typename Others>
     friend class LaunchBase;
     using Base = LaunchCore;

   public:
     using derived_type = T;
     MUDA_GENERIC LaunchBase(::cudaStream_t stream) MUDA_NOEXCEPT;

     // create a named scope for better recognization (if you are using some profile tools)
     // usage:
     //  on(stream)
     //      .push_range("part1")
     //      .next<launch>(1,1).apply(...)
     //      .pop_range()
     //      .wait();
     T& push_range(const std::string& name);
     T& pop_range();


     // create a name for the following kernel launch
     // viewers will record this name for the sake of better recognization when debugging
     T&               kernel_name(std::string_view name);
     std::string_view kernel_name() const { return Base::kernel_name(); }

     // record an event on this point with current stream, you could use .when() to
     // capture this event for synchronization
     // flags:
     //  cudaEventRecordDefault : Default event creation flag.
     //  cudaEventRecordExternal : Event is captured in the graph as an external
     //  event node when performing stream capture.
     T& record(cudaEvent_t e, int flag = cudaEventRecordDefault);

     T& record(ComputeGraphVar<cudaEvent_t>&            e,
               const std::vector<ComputeGraphVarBase*>& vars);

     template <typename... ViewT>
     T& record(ComputeGraphVar<cudaEvent_t>& e, ComputeGraphVar<ViewT>&... vars);

     // let the following kernels wait until the event is triggered
     // (asynchronize with the host)
     // usage:
     //  on(stream)
     //      .when(event)
     //      .next<launch>(1,1).apply(...)
     //      .wait();
     // flags:
     //  cudaEventRecordDefault : Default event creation flag.
     //  cudaEventRecordExternal : Event is captured in the graph as an external
     //  event node when performing stream capture.
     T& when(cudaEvent_t e, int flag = cudaEventWaitDefault);
     // let the host wait for the event
     T& wait(cudaEvent_t e, int flag = cudaEventWaitDefault);
     T& wait(const ComputeGraphVar<cudaEvent_t>&      e,
             const std::vector<ComputeGraphVarBase*>& vars);
     template <typename... ViewT>
     T& wait(const ComputeGraphVar<cudaEvent_t>& e, ComputeGraphVar<ViewT>&... vars);


     // let the host wait for the current stream
     T& wait();

     // register a host callback function, which will be called when all the jobs before
     // this point are done.
     T& callback(const std::function<void(::cudaStream_t, ::cudaError)>& callback);

     template <typename Next>
     Next next(Next n);
     template <typename Next, typename... Args>
     Next next(Args&&... args);

     ~LaunchBase() MUDA_NOEXCEPT;

   protected:
     T& pop_kernel_name();

   private:
     T& derived() MUDA_NOEXCEPT { return *(T*)(this); }
 };

 class Empty : public LaunchBase<Empty>
 {
   public:
     Empty(::cudaStream_t stream = nullptr)
         : LaunchBase(stream)
     {
     }
 };

 Empty on(::cudaStream_t stream);

 Empty on();

 void wait_device();
 void wait_stream(::cudaStream_t stream);
 void wait_event(cudaEvent_t event);
 }  // namespace muda

 #include "details/launch_base.inl"
muda::LaunchCore
Definition: launch_base.h:41

muda::Empty
Definition: launch_base.h:166

muda::ComputeGraphVar
Definition: buffer_launch.h:34

muda
Definition: assert.h:13

muda::LaunchBase
Definition: launch_base.h:85