eyalroz/cuda-api-wrappers/profiling_8hpp_source.html

 #pragma once
 #ifndef CUDA_API_WRAPPERS_PROFILING_HPP_
 #define CUDA_API_WRAPPERS_PROFILING_HPP_

 #include "../api/types.hpp"
 #include "../api/error.hpp"
 #include "../api/current_context.hpp"
 #include "../api/stream.hpp"
 #include "../api/event.hpp"
 #include "../api/device.hpp"
 #include "../api/multi_wrapper_impls/context.hpp"

 #include <cudaProfiler.h>

 #if CUDA_VERSION >= 10000 || defined(_WIN32)
 #include <nvtx3/nvToolsExt.h>
 #include <nvtx3/nvToolsExtCuda.h>
 #else
 #include <nvToolsExt.h>
 #include <nvToolsExtCuda.h>
 #endif

 #ifdef _WIN32
 #include <processthreadsapi.h> // for GetThreadId()
 #endif

 #ifdef CUDA_API_WRAPPERS_USE_PTHREADS
 #include <pthread.h>
 #else
 #ifdef CUDA_API_WRAPPERS_USE_WIN32_THREADS
 #include <processthreadsapi.h>
 #endif
 #endif

 #include <cstdint>
 #include <limits>
 #include <mutex>
 #include <stdexcept>
 #include <string>
 #include <thread>
 #include <type_traits>


 namespace cuda {

 // Note: No implementation for now for nvtxStringHandle_t's
 namespace profiling {

 namespace detail_ {

 /**
  * Make a narrow-character C string the message of an NVTX event attributes structure.
  *
  * @note Only the pointer is stored; the characters themselves are not copied.
  *
  * @param attrs the attributes structure whose message fields are to be set
  * @param c_str the string which is to serve as the message
  */
 inline void set_message(nvtxEventAttributes_t &attrs, const char *c_str) noexcept
 {
     attrs.message.ascii = c_str;
     attrs.messageType   = NVTX_MESSAGE_TYPE_ASCII;
 }

 /**
  * Make a wide-character C string the message of an NVTX event attributes structure.
  *
  * @note Only the pointer is stored; the characters themselves are not copied.
  *
  * @param attrs the attributes structure whose message fields are to be set
  * @param wc_str the wide-character string which is to serve as the message
  */
 inline void set_message(nvtxEventAttributes_t &attrs, const wchar_t *wc_str) noexcept
 {
     attrs.message.unicode = wc_str;
     attrs.messageType     = NVTX_MESSAGE_TYPE_UNICODE;
 }

 /**
  * Make a registered-string handle the message of an NVTX event attributes structure.
  *
  * @param attrs the attributes structure whose message fields are to be set
  * @param rsh the handle which is to serve as the message
  */
 inline void set_message(nvtxEventAttributes_t &attrs, nvtxStringHandle_t rsh) noexcept
 {
     attrs.message.registered = rsh;
     attrs.messageType        = NVTX_MESSAGE_TYPE_REGISTERED;
 }

 } // namespace detail_

 namespace range {

 /// Types of profiled ranges we recognize
 enum class type_t {
     unspecified,
     kernel,
     pci_express_transfer
 };

 /// The raw handle of a CUDA profiling range
 using handle_t = nvtxRangeId_t;

 } // namespace range

 /**
  * A color value made up of an alpha (transparency) channel and three color
  * channels - red, green and blue - of 8 bits each, convertible to and from
  * a single 32-bit number laid out as 0xAARRGGBB.
  */
 struct color_t {
     /// A color corresponds to a 32-bit value
     using underlying_type = ::std::uint32_t;

     /// Each channel is an 8-bit value
     using channel_value = ::std::uint8_t;

     /// The individual channels; alpha occupies the most significant byte
     /// of the numeric representation, blue the least significant one.
     channel_value alpha, red, green, blue;

     /// Construct a color from a numeric value, typically an 8-hex-digit
     /// literal of the form 0xAARRGGBB.
     static constexpr color_t from_hex(underlying_type raw_argb) noexcept {
         return {
             static_cast<channel_value> ((raw_argb >> 24) & 0xFF),
             static_cast<channel_value> ((raw_argb >> 16) & 0xFF),
             static_cast<channel_value> ((raw_argb >>  8) & 0xFF),
             static_cast<channel_value> ((raw_argb >>  0) & 0xFF),
         };
     }

     /// @return the numeric 0xAARRGGBB representation of this color, i.e. the
     /// inverse of @ref from_hex . Usable in constant expressions.
     constexpr underlying_type as_hex() const noexcept
     {
         return
             (static_cast<underlying_type>(alpha)  << 24) |
             (static_cast<underlying_type>(red)    << 16) |
             (static_cast<underlying_type>(green)  <<  8) |
             (static_cast<underlying_type>(blue)   <<  0);
     }

     /// Implicit conversion to the numeric representation; same as @ref as_hex
     constexpr operator underlying_type() const noexcept { return as_hex(); }

     /// Some basic colors, for convenience. Note that all of them have
     /// an alpha channel value of 0.
     static constexpr color_t Black()       noexcept { return from_hex(0x00000000); }
     static constexpr color_t White()       noexcept { return from_hex(0x00FFFFFF); }
     static constexpr color_t FullRed()     noexcept { return from_hex(0x00FF0000); }
     static constexpr color_t FullGreen()   noexcept { return from_hex(0x0000FF00); }
     static constexpr color_t FullBlue()    noexcept { return from_hex(0x000000FF); }
     static constexpr color_t FullYellow()  noexcept { return from_hex(0x00FFFF00); }
     static constexpr color_t LightRed()    noexcept { return from_hex(0x00FFDDDD); }
     static constexpr color_t LightGreen()  noexcept { return from_hex(0x00DDFFDD); }
     static constexpr color_t LightBlue()   noexcept { return from_hex(0x00DDDDFF); }
     static constexpr color_t LightYellow() noexcept { return from_hex(0x00FFFFDD); }
     static constexpr color_t DarkRed()     noexcept { return from_hex(0x00880000); }
     static constexpr color_t DarkGreen()   noexcept { return from_hex(0x00008800); }
     static constexpr color_t DarkBlue()    noexcept { return from_hex(0x00000088); }
     static constexpr color_t DarkYellow()  noexcept { return from_hex(0x00888800); }
 };

 namespace mark {

 namespace detail_ {

 // Used to prevent multiple threads from accessing the profiler simultaneously
 /**
  * Obtain the mutex used to prevent multiple threads from accessing the
  * profiler simultaneously.
  *
  * @return a reference to a single function-local static mutex - the same
  * object on every call
  */
 inline ::std::mutex& get_mutex() noexcept
 {
     static ::std::mutex the_profiler_mutex{};
     return the_profiler_mutex;
 }

 /**
  * Build an NVTX event attributes structure carrying a description and a color.
  *
  * @param description the message to attach to the event; handed to the
  *     matching profiling::detail_::set_message overload
  * @param color the color to give the event, interpreted as ARGB
  * @return an attributes structure which is zero-initialized except for its
  *     version, size, color and message fields
  */
 template <typename CharT>
 nvtxEventAttributes_t create_attributes(const CharT* description, color_t color)
 {
     nvtxEventAttributes_t attributes = {0};

     attributes.version   = NVTX_VERSION;
     attributes.size      = NVTX_EVENT_ATTRIB_STRUCT_SIZE;

     attributes.colorType = NVTX_COLOR_ARGB;
     attributes.color     = color.as_hex();

     profiling::detail_::set_message(attributes, description);
     return attributes;
 }

 } // namespace detail_

 template <typename CharT>
 void point(const CharT* description, color_t color = color_t::Black())
 {
     auto attrs = detail_::create_attributes(description, color);
     ::std::lock_guard<::std::mutex> guard{ detail_::get_mutex() };
     // logging?
     nvtxMarkEx(&attrs);
 }

 /**
  * Mark the beginning of a range on the profiler timeline, giving it also
  * a color and some descriptive text.
  *
  * @param description the text to attach to the range
  * @param type the kind of range being marked; currently ignored
  * @param color the color of the range; light red by default
  * @return the handle of the started range, to be passed to @ref range_end
  */
 template <typename CharT>
 range::handle_t range_start(
     const CharT*   description,
     range::type_t  type = range::type_t::unspecified,
     color_t        color = color_t::LightRed())
 {
     static_assert(::std::is_same<range::handle_t, nvtxRangeId_t>::value,
                   "cuda::profiling::range::handle_t must be the same type as nvtxRangeId_t - but isn't.");

     // Currently not doing anything with the type; maybe in the future
     (void) type;

     // Both building the attributes and starting the range happen under the profiler mutex
     ::std::lock_guard<::std::mutex> profiler_lock{ detail_::get_mutex() };
     auto range_attributes = detail_::create_attributes(description, color);
     return nvtxRangeStartEx(&range_attributes);
 }

 /**
  * Mark the end of a range, using the handle obtained when previously
  * marking its beginning.
  *
  * @param range_handle the handle of the range which is to end
  *
  * NOTE(review): unlike point() and range_start(), this function does not
  * take the profiler mutex before calling into NVTX - confirm this is intended.
  */
 inline void range_end(range::handle_t range_handle)
 {
     static_assert(
         ::std::is_same<range::handle_t, nvtxRangeId_t>::value,
         "cuda::profiling::range::handle_t must be the same type as nvtxRangeId_t - but isn't.");

     nvtxRangeEnd(range_handle);
 }

 } // namespace mark

 /**
  * Start CUDA profiling for the current process.
  *
  * Calls cuProfilerStart() and hands the resulting status to
  * throw_if_error_lazy, which throws if the call has failed.
  */
 inline void start()
 {
     auto profiler_status = cuProfilerStart();
     throw_if_error_lazy(profiler_status, "Starting CUDA profiling");
 }

 /**
  * Stop CUDA profiling for the current process.
  *
  * Calls cuProfilerStop() and hands the resulting status to
  * throw_if_error_lazy, which throws if the call has failed.
  */
 inline void stop()
 {
     auto profiler_status = cuProfilerStop();
     throw_if_error_lazy(profiler_status, "Stopping CUDA profiling");
 }

 } // namespace profiling
 } // namespace cuda

 namespace cuda {

 namespace profiling {

 class scoped_range_marker {
 public:
     template <typename CharT>
     explicit scoped_range_marker(
         const CharT* description,
         profiling::range::type_t type = profiling::range::type_t::unspecified)
     {
         range = profiling::mark::range_start(description, type);
     }

     ~scoped_range_marker()
     {
         // TODO: Can we check the range for validity somehow?
         profiling::mark::range_end(range);
     }
 protected:
     profiling::range::handle_t range;
 };

 class scope {
 public:
     scope() { start(); }
     ~scope() { stop(); }
 protected:
     context::current::detail_::scoped_existence_ensurer_t context_existence_ensurer;
 };

 // Convenience macro: defines a ::cuda::profiling::scope object, named
 // cuda_profiling_scope_, at the point of use - so that profiling is started
 // there and stopped when the enclosing scope is exited. Note that the
 // expansion already ends with a semicolon.
 #define profile_this_scope() ::cuda::profiling::scope cuda_profiling_scope_{};

 namespace detail_ {

 /**
  * Have the profiler associate a name with a host thread, identified
  * by its raw numeric ID.
  *
  * Only declared in the general case; defined through the explicit
  * specializations for `char` and `wchar_t` below.
  */
 template <typename CharT>
 void name_host_thread(uint32_t raw_thread_id, const CharT* name);

 /// Narrow-character names - forwarded to nvtxNameOsThreadA
 template <>
 inline void name_host_thread<char>(
     uint32_t     raw_thread_id,
     const char*  name)
 {
     nvtxNameOsThreadA(raw_thread_id, name);
 }

 /// Wide-character names - forwarded to nvtxNameOsThreadW
 template <>
 inline void name_host_thread<wchar_t>(
     uint32_t        raw_thread_id,
     const wchar_t*  name)
 {
     nvtxNameOsThreadW(raw_thread_id, name);
 }

 /**
  * Have the profiler associate a name with a CUDA stream, identified
  * by its raw handle.
  *
  * Only declared in the general case; defined through the explicit
  * specializations for `char` and `wchar_t` below.
  */
 template <typename CharT>
 void name_stream(stream::handle_t stream_handle, const CharT* name);

 /// Narrow-character names - forwarded to nvtxNameCuStreamA
 template <>
 inline void name_stream<char>(
     stream::handle_t  stream_handle,
     const char*       name)
 {
     nvtxNameCuStreamA(stream_handle, name);
 }

 /// Wide-character names - forwarded to nvtxNameCuStreamW
 template <>
 inline void name_stream<wchar_t>(
     stream::handle_t  stream_handle,
     const wchar_t*    name)
 {
     nvtxNameCuStreamW(stream_handle, name);
 }

 /**
  * Have the profiler associate a name with a CUDA event, identified
  * by its raw handle.
  *
  * Only declared in the general case; defined through the explicit
  * specializations for `char` and `wchar_t` below.
  */
 template <typename CharT>
 inline void name_event(event::handle_t event_handle, const CharT* name);

 /// Narrow-character names - forwarded to nvtxNameCuEventA
 template <>
 inline void name_event<char>(
     event::handle_t  event_handle,
     const char*      name)
 {
     nvtxNameCuEventA(event_handle, name);
 }

 /// Wide-character names - forwarded to nvtxNameCuEventW
 template <>
 inline void name_event<wchar_t>(
     event::handle_t  event_handle,
     const wchar_t*   name)
 {
     nvtxNameCuEventW(event_handle, name);
 }

 /**
  * Have the profiler associate a name with a CUDA device, identified
  * by its ID.
  *
  * Only declared in the general case; defined through the explicit
  * specializations for `char` and `wchar_t` below.
  */
 template <typename CharT>
 void name_device(device::id_t device_id, const CharT* name);

 /// Narrow-character names - forwarded to nvtxNameCuDeviceA
 template <>
 inline void name_device<char>(
     device::id_t  device_id,
     const char*   name)
 {
     nvtxNameCuDeviceA(device_id, name);
 }

 /// Wide-character names - forwarded to nvtxNameCuDeviceW
 template <>
 inline void name_device<wchar_t>(
     device::id_t    device_id,
     const wchar_t*  name)
 {
     nvtxNameCuDeviceW(device_id, name);
 }

 /**
  * Have the profiler refer to a host thread, given by its standard-library
  * thread ID, using a specified string identifier.
  *
  * @tparam CharT the character type of the name - `char` or `wchar_t`, these
  *     being the types for which @ref name_host_thread is defined
  * @param host_thread_id the ID of the thread to name
  * @param name the name to give the thread
  *
  * @throws ::std::runtime_error (non-Windows platforms only) if the native
  *     thread handle value cannot be represented as a 32-bit unsigned integer
  *
  * NOTE(review): the native handle is obtained by reinterpreting the memory
  * of the ::std::thread::id object; this assumes the ID object begins with
  * the native handle value - confirm for each supported standard library.
  */
 template <typename CharT>
 void name(::std::thread::id host_thread_id, const CharT* name)
 {
     auto native_handle = *(reinterpret_cast<const ::std::thread::native_handle_type*>(&host_thread_id));
 #ifdef _WIN32
     uint32_t thread_id = GetThreadId(native_handle);
 #else
     // Only values strictly above the maximum are unrepresentable; the maximum itself is fine
     if (native_handle > ::std::numeric_limits<uint32_t>::max()) {
         throw ::std::runtime_error("Native thread ID " + ::std::to_string(native_handle) +
             " exceeds maximum representable thread ID " + ::std::to_string(::std::numeric_limits<uint32_t>::max()));
     }
     auto thread_id = static_cast<uint32_t>(native_handle);
 #endif
     name_host_thread(thread_id, name);
 }

 } // namespace detail_

 /**
  * Have the profiler refer to a given host thread, using a specified
  * string identifier.
  *
  * @param host_thread the thread to name
  * @param name the name to give the thread
  *
  * NOTE(review): this function template is only declared here; no definition
  * is visible in this file - confirm one is provided elsewhere.
  */
 template <typename CharT>
 void name(const ::std::thread& host_thread, const CharT* name);

 /**
  * Have the profiler refer to the current thread using a specified string
  * identifier.
  *
  * @param name the name to give the calling thread
  */
 template <typename CharT>
 void name_this_thread(const CharT* name)
 {
     auto calling_thread_id = ::std::this_thread::get_id();
     detail_::name(calling_thread_id, name);
 }

 /**
  * Have the profiler assign a name to a certain CUDA stream.
  *
  * @param stream the stream to name
  * @param name the name to give the stream
  */
 template <typename CharT>
 void name(const stream_t& stream, const CharT* name)
 {
     // The naming call is made while the stream's context is set as the
     // current one, for the duration of this function only.
     context::current::detail_::scoped_override_t stream_context_for_this_scope{stream.context_handle()};
     auto raw_stream_handle = stream.handle();
     detail_::name_stream(raw_stream_handle, name);
 }

 /**
  * Have the profiler assign a name to a certain CUDA event.
  *
  * @param event the event to name
  * @param name the name to give the event
  */
 template <typename CharT>
 void name(const event_t& event, const CharT* name)
 {
     // The naming call is made while the event's context is set as the
     // current one, for the duration of this function only.
     context::current::detail_::scoped_override_t context_setter{event.context_handle()};
     // An event handle must go to the event-naming function, not the stream-naming one
     detail_::name_event(event.handle(), name);
 }

 /**
  * Have the profiler assign a name to a certain CUDA device.
  *
  * @param device the device to name
  * @param name the name to give the device
  */
 template <typename CharT>
 void name(const device_t& device, const CharT* name)
 {
     // A device ID must go to the device-naming function, not the stream-naming one
     detail_::name_device(device.id(), name);
 }

 } // namespace profiling
 } // namespace cuda

 #endif // CUDA_API_WRAPPERS_PROFILING_HPP_
cuda::profiling::mark::range_start
range::handle_t range_start(const CharT *description, range::type_t type=range::type_t::unspecified, color_t color=color_t::LightRed())
Mark the beginning of a range on the profiler timeline, giving it also a color and some descriptive t...
Definition: profiling.hpp:203

cuda::profiling::scoped_range_marker
A RAII/CADRe class whose scope of existence is reflected as a range in the profiler.
Definition: profiling.hpp:257

cuda::stream_t::context_handle
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:260

cuda::stream_t
Proxy class for a CUDA stream.
Definition: stream.hpp:246

cuda::event_t::handle
event::handle_t handle() const noexcept
The raw CUDA handle for this event.
Definition: event.hpp:143

cuda::stream_t::handle
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:257

cuda
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22

cuda::profiling::stop
void stop()
Stop CUDA profiling for the current process.
Definition: profiling.hpp:236

cuda::profiling::range::handle_t
nvtxRangeId_t handle_t
The raw handle of a CUDA profiling range.
Definition: profiling.hpp:93

cuda::event_t
Wrapper class for a CUDA event.
Definition: event.hpp:133

cuda::device::id_t
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850

cuda::profiling::mark::range_end
void range_end(range::handle_t range_handle)
Mark the end of a range, using the handle obtained when previously marking its beginning.
Definition: profiling.hpp:219

cuda::profiling::range::type_t
type_t
Types of profiled ranges we recognize.
Definition: profiling.hpp:90

cuda::device_t::id
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594

cuda::event::handle_t
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:217

cuda::profiling::color_t::from_hex
static constexpr color_t from_hex(underlying_type raw_argb) noexcept
Construct a profiler color value from a numeric value (typically, an 8-hex-digit literal) ...
Definition: profiling.hpp:115

cuda::profiling::mark::point
void point(const CharT *description, color_t color=color_t::Black())
Mark a single point on the profiler timeline, giving it also a color and some descriptive text...
Definition: profiling.hpp:185

cuda::profiling::start
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229

cuda::profiling::name_this_thread
void name_this_thread(const CharT *name)
Have the profiler refer to the current thread using a specified string identifier (rather than its nu...
Definition: profiling.hpp:387

cuda::profiling::color_t::alpha
channel_value alpha
A profiler color is made up of three color channels and a transparency or "alpha" channel...
Definition: profiling.hpp:111

cuda::profiling::scope
A class to instantiate in the part of your application which does any work you intend to use the CUDA...
Definition: profiling.hpp:281

throw_if_error_lazy
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316

cuda::profiling::color_t::underlying_type
::std::uint32_t underlying_type
A profiler color corresponds to a 32-bit value.
Definition: profiling.hpp:104

cuda::profiling::color_t::channel_value
::std::uint8_t channel_value
Each color channel is an 8-bit value.
Definition: profiling.hpp:107

cuda::stream::handle_t
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239

cuda::profiling::color_t::Black
static constexpr color_t Black() noexcept
Some basic colors, for convenience.
Definition: profiling.hpp:139

cuda::profiling::name
void name(const ::std::thread &host_thread, const CharT *name)
Have the profiler refer to a given host thread, using a specified string identifier (rather than its ...

cuda::device_t
Wrapper class for a CUDA device.
Definition: device.hpp:135

cuda::profiling::name
void name(const device_t &device, const CharT *name)
Have the profiler assign a name to a certain CUDA device.
Definition: profiling.hpp:410

cuda::profiling::color_t::as_hex
underlying_type as_hex() const noexcept
Definition: profiling.hpp:125

cuda::profiling::color_t
An RGB colorspace color value, with potential transparency, which may be used to color elements in ti...
Definition: profiling.hpp:102