cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
profiling.hpp
Go to the documentation of this file.
1 
9 #pragma once
10 #ifndef CUDA_API_WRAPPERS_PROFILING_HPP_
11 #define CUDA_API_WRAPPERS_PROFILING_HPP_
12 
13 #include "../api/types.hpp"
14 #include "../api/error.hpp"
15 #include "../api/current_context.hpp"
16 #include "../api/stream.hpp"
17 #include "../api/event.hpp"
18 #include "../api/device.hpp"
19 #include "../api/multi_wrapper_impls/context.hpp"
20 
21 #include <cudaProfiler.h>
22 
23 #if CUDA_VERSION >= 10000 || defined(_WIN32)
24 #include <nvtx3/nvToolsExt.h>
25 #include <nvtx3/nvToolsExtCuda.h>
26 #else
27 #include <nvToolsExt.h>
28 #include <nvToolsExtCuda.h>
29 #endif
30 
31 #ifdef _WIN32
32 #include <processthreadsapi.h> // for GetThreadId()
33 #endif
34 
35 #ifdef CUDA_API_WRAPPERS_USE_PTHREADS
36 #include <pthread.h>
37 #else
38 #ifdef CUDA_API_WRAPPERS_USE_WIN32_THREADS
39 #include <processthreadsapi.h>
40 #endif
41 #endif
42 
#include <cstdint>
#include <limits>
#include <mutex>
#include <stdexcept>
#include <string>
#include <thread>
#include <type_traits>
48 
49 
50 namespace cuda {
51 
52 // Note: No implementation for now for nvtxStringHandle_t's
57 namespace profiling {
58 
59 namespace detail_ {
60 
61 inline void set_message(nvtxEventAttributes_t &attrs, const char *c_str) noexcept
62 {
63  attrs.messageType = NVTX_MESSAGE_TYPE_ASCII;
64  attrs.message.ascii = c_str;
65 }
66 
67 inline void set_message(nvtxEventAttributes_t &attrs, const wchar_t *wc_str) noexcept
68 {
69  attrs.messageType = NVTX_MESSAGE_TYPE_UNICODE;
70  attrs.message.unicode = wc_str;
71 }
72 
73 inline void set_message(nvtxEventAttributes_t &attrs, nvtxStringHandle_t rsh) noexcept
74 {
75  attrs.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
76  attrs.message.registered = rsh;
77 }
78 
79 } // namespace detail_
80 
82 namespace range {
83 
90 enum class type_t { unspecified, kernel, pci_express_transfer };
91 
93 using handle_t = nvtxRangeId_t;
94 
95 } // namespace range
96 
/**
 * An RGB colorspace color value, with potential transparency, which may be
 * used to color elements marked on the profiler timeline.
 *
 * @note All conversions to and from the raw 32-bit value are `constexpr`,
 * so colors can be both composed and decomposed at compile time.
 */
struct color_t {
    /// A profiler color corresponds to a 32-bit value
    using underlying_type = ::std::uint32_t;

    /// Each color channel is an 8-bit value
    using channel_value = ::std::uint8_t;

    /// The transparency ("alpha") channel followed by the three color channels,
    /// in the order they occupy the raw value - from the most significant byte
    /// to the least significant one
    channel_value alpha, red, green, blue;

    /**
     * Construct a profiler color value from a numeric value (typically,
     * an 8-hex-digit literal)
     *
     * @param raw_argb the color, with alpha in bits 24-31, red in bits 16-23,
     * green in bits 8-15 and blue in bits 0-7
     */
    static constexpr color_t from_hex(underlying_type raw_argb) noexcept
    {
        return {
            static_cast<channel_value>((raw_argb >> 24) & 0xFF),
            static_cast<channel_value>((raw_argb >> 16) & 0xFF),
            static_cast<channel_value>((raw_argb >>  8) & 0xFF),
            static_cast<channel_value>((raw_argb >>  0) & 0xFF),
        };
    }

    /// @return the raw 32-bit value for this color - the inverse of @ref from_hex
    constexpr underlying_type as_hex() const noexcept
    {
        // Each channel is widened before shifting, so that no bits get
        // lost to the narrow channel type
        return
            (static_cast<underlying_type>(alpha) << 24) |
            (static_cast<underlying_type>(red)   << 16) |
            (static_cast<underlying_type>(green) <<  8) |
            (static_cast<underlying_type>(blue)  <<  0);
    }

    /// Implicit conversion to the raw 32-bit value; see @ref as_hex
    constexpr operator underlying_type() const noexcept { return as_hex(); }

    /// Some basic colors, for convenience. Note their alpha channel is 0.
    ///@{
    static constexpr color_t Black()       noexcept { return from_hex(0x00000000); }
    static constexpr color_t White()       noexcept { return from_hex(0x00FFFFFF); }
    static constexpr color_t FullRed()     noexcept { return from_hex(0x00FF0000); }
    static constexpr color_t FullGreen()   noexcept { return from_hex(0x0000FF00); }
    static constexpr color_t FullBlue()    noexcept { return from_hex(0x000000FF); }
    static constexpr color_t FullYellow()  noexcept { return from_hex(0x00FFFF00); }
    static constexpr color_t LightRed()    noexcept { return from_hex(0x00FFDDDD); }
    static constexpr color_t LightGreen()  noexcept { return from_hex(0x00DDFFDD); }
    static constexpr color_t LightBlue()   noexcept { return from_hex(0x00DDDDFF); }
    static constexpr color_t LightYellow() noexcept { return from_hex(0x00FFFFDD); }
    static constexpr color_t DarkRed()     noexcept { return from_hex(0x00880000); }
    static constexpr color_t DarkGreen()   noexcept { return from_hex(0x00008800); }
    static constexpr color_t DarkBlue()    noexcept { return from_hex(0x00000088); }
    static constexpr color_t DarkYellow()  noexcept { return from_hex(0x00888800); }
    ///@}
};
155 
157 namespace mark {
158 
159 namespace detail_ {
160 
/**
 * Used to prevent multiple threads from accessing the profiler simultaneously
 *
 * @return a reference to one and the same mutex, on every call
 */
inline ::std::mutex& get_mutex() noexcept
{
    // A function-local static: constructed on first use, shared by all callers
    static ::std::mutex the_profiler_mutex;
    return the_profiler_mutex;
}
167 
168 template <typename CharT>
169 nvtxEventAttributes_t create_attributes(const CharT* description, color_t color)
170 {
171  nvtxEventAttributes_t eventAttrib = {0};
172  eventAttrib.version = NVTX_VERSION;
173  eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
174  eventAttrib.colorType = NVTX_COLOR_ARGB;
175  eventAttrib.color = color;
176  profiling::detail_::set_message(eventAttrib,description);
177  return eventAttrib;
178 }
179 
180 } // namespace detail_
181 
184 template <typename CharT>
185 void point(const CharT* description, color_t color = color_t::Black())
186 {
187  auto attrs = detail_::create_attributes(description, color);
188  ::std::lock_guard<::std::mutex> guard{ detail_::get_mutex() };
189  // logging?
190  nvtxMarkEx(&attrs);
191 }
192 
202 template <typename CharT>
204  const CharT* description,
205  range::type_t type = range::type_t::unspecified,
206  color_t color = color_t::LightRed())
207 {
208  (void) type; // Currently not doing anything with the type; maybe in the future
209  ::std::lock_guard<::std::mutex> guard{ detail_::get_mutex() };
210  auto attrs = detail_::create_attributes(description, color);
211  nvtxRangeId_t range_handle = nvtxRangeStartEx(&attrs);
212  static_assert(::std::is_same<range::handle_t, nvtxRangeId_t>::value,
213  "cuda::profiling::range::handle_t must be the same type as nvtxRangeId_t - but isn't.");
214  return range_handle;
215 }
216 
219 inline void range_end(range::handle_t range_handle)
220 {
221  static_assert(::std::is_same<range::handle_t, nvtxRangeId_t>::value,
222  "cuda::profiling::range::handle_t must be the same type as nvtxRangeId_t - but isn't.");
223  nvtxRangeEnd(range_handle);
224 }
225 
226 } // namespace mark
227 
229 inline void start()
230 {
231  auto status = cuProfilerStart();
232  throw_if_error_lazy(status, "Starting CUDA profiling");
233 }
234 
236 inline void stop()
237 {
238  auto status = cuProfilerStop();
239  throw_if_error_lazy(status, "Stopping CUDA profiling");
240 }
241 
242 } // namespace profiling
243 } // namespace cuda
244 
245 namespace cuda {
246 
247 namespace profiling {
248 
258 public:
259  template <typename CharT>
260  explicit scoped_range_marker(
261  const CharT* description,
262  profiling::range::type_t type = profiling::range::type_t::unspecified)
263  {
264  range = profiling::mark::range_start(description, type);
265  }
266 
268  {
269  // TODO: Can we check the range for validity somehow?
271  }
272 protected:
274 };
275 
281 class scope {
282 public:
283  scope() { start(); }
284  ~scope() { stop(); }
285 protected:
286  context::current::detail_::scoped_existence_ensurer_t context_existence_ensurer;
287 };
288 
289 #define profile_this_scope() ::cuda::profiling::scope cuda_profiling_scope_{};
290 
291 namespace detail_ {
292 
293 template <typename CharT>
294 void name_host_thread(uint32_t raw_thread_id, const CharT* name);
295 
296 template <>
297 inline void name_host_thread<char>(uint32_t raw_thread_id, const char* name)
298 {
299  nvtxNameOsThreadA(raw_thread_id, name);
300 }
301 
302 template <>
303 inline void name_host_thread<wchar_t>(uint32_t raw_thread_id, const wchar_t* name)
304 {
305  nvtxNameOsThreadW(raw_thread_id, name);
306 }
307 
308 template <typename CharT>
309 void name_stream(stream::handle_t stream_handle, const CharT* name);
310 
311 template <>
312 inline void name_stream<char>(stream::handle_t stream_handle, const char* name)
313 {
314  nvtxNameCuStreamA(stream_handle, name);
315 }
316 
317 template <>
318 inline void name_stream<wchar_t>(stream::handle_t stream_handle, const wchar_t* name)
319 {
320  nvtxNameCuStreamW(stream_handle, name);
321 }
322 
323 template <typename CharT>
324 inline void name_event(event::handle_t event_handle, const CharT* name);
325 
326 template <>
327 inline void name_event<char>(event::handle_t event_handle, const char* name)
328 {
329  nvtxNameCuEventA(event_handle, name);
330 }
331 
332 template <>
333 inline void name_event<wchar_t>(event::handle_t event_handle, const wchar_t* name)
334 {
335  nvtxNameCuEventW(event_handle, name);
336 }
337 
338 template <typename CharT>
339 void name_device(device::id_t device_id, const CharT* name);
340 
341 template <>
342 inline void name_device<char>(device::id_t device_id, const char* name)
343 {
344  nvtxNameCuDeviceA(device_id, name);
345 }
346 
347 template <>
348 inline void name_device<wchar_t>(device::id_t device_id, const wchar_t* name)
349 {
350  nvtxNameCuDeviceW(device_id, name);
351 }
352 
353 inline void name(::std::thread::id host_thread_id, const char* name)
354 {
355  auto native_handle = *(reinterpret_cast<const ::std::thread::native_handle_type*>(&host_thread_id));
356 #ifdef _WIN32
357  uint32_t thread_id = GetThreadId(native_handle);
358 #else
359  if (native_handle >= ::std::numeric_limits<uint32_t>::max()) {
360  throw ::std::runtime_error("Native thread ID " + ::std::to_string(native_handle) +
361  " exceeds maximum representable thread ID " + ::std::to_string(::std::numeric_limits<uint32_t>::max()));
362  }
363  auto thread_id = static_cast<uint32_t>(native_handle);
364 #endif
365  name_host_thread(thread_id, name);}
366 
367 } // namespace detail_
368 
/**
 * Have the profiler refer to a given host thread, using a specified string
 * identifier.
 *
 * @note Only declared here; no definition appears in this part of the file.
 *
 * @param host_thread the thread to name
 * @param name the string identifier to use for the thread
 */
template <typename CharT>
void name(
    const ::std::thread&  host_thread,
    const CharT*          name);
378 
386 template <typename CharT>
387 void name_this_thread(const CharT* name)
388 {
389  detail_::name(::std::this_thread::get_id(), name);
390 }
391 
393 template <typename CharT>
394 void name(const stream_t& stream, const CharT* name)
395 {
396  context::current::detail_::scoped_override_t context_setter{stream.context_handle()};
397  detail_::name_stream(stream.handle(), name);
398 }
399 
401 template <typename CharT>
402 void name(const event_t& event, const CharT* name)
403 {
404  context::current::detail_::scoped_override_t context_setter{event.context_handle()};
405  detail_::name_stream(event.handle(), name);
406 }
407 
409 template <typename CharT>
410 void name(const device_t& device, const CharT* name)
411 {
412  detail_::name_stream(device.id(), name);
413 }
414 
415 } // namespace profiling
416 } // namespace cuda
417 
418 #endif // CUDA_API_WRAPPERS_PROFILING_HPP_
range::handle_t range_start(const CharT *description, range::type_t type=range::type_t::unspecified, color_t color=color_t::LightRed())
Mark the beginning of a range on the profiler timeline, giving it also a color and some descriptive t...
Definition: profiling.hpp:203
A RAII/CADRe class whose scope of existence is reflected as a range in the profiler.
Definition: profiling.hpp:257
context::handle_t context_handle() const noexcept
The raw CUDA handle for the context in which the represented stream is defined.
Definition: stream.hpp:260
Proxy class for a CUDA stream.
Definition: stream.hpp:246
event::handle_t handle() const noexcept
The raw CUDA handle for this event.
Definition: event.hpp:143
stream::handle_t handle() const noexcept
The raw CUDA handle for a stream which this class wraps.
Definition: stream.hpp:257
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
void stop()
Stop CUDA profiling for the current process.
Definition: profiling.hpp:236
nvtxRangeId_t handle_t
The raw handle of a CUDA profiling range.
Definition: profiling.hpp:93
Wrapper class for a CUDA event.
Definition: event.hpp:133
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
void range_end(range::handle_t range_handle)
Mark the end of a range, using the handle obtained when previously marking its beginning.
Definition: profiling.hpp:219
type_t
Types of profiled ranges we recognize.
Definition: profiling.hpp:90
device::id_t id() const noexcept
Return the proxied device's ID.
Definition: device.hpp:594
CUevent handle_t
The CUDA driver's raw handle for events.
Definition: types.hpp:217
static constexpr color_t from_hex(underlying_type raw_argb) noexcept
Construct a profiler color value from a numeric value (typically, an 8-hex-digit literal) ...
Definition: profiling.hpp:115
void point(const CharT *description, color_t color=color_t::Black())
Mark a single point on the profiler timeline, giving it also a color and some descriptive text...
Definition: profiling.hpp:185
void start()
Start CUDA profiling for the current process.
Definition: profiling.hpp:229
void name_this_thread(const CharT *name)
Have the profiler refer to the current thread using a specified string identifier (rather than its nu...
Definition: profiling.hpp:387
channel_value alpha
A profiler color is made up of three color channels and a transparency or "alpha" channel...
Definition: profiling.hpp:111
A class to instantiate in the part of your application which does any work you intend to use the CUDA...
Definition: profiling.hpp:281
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
::std::uint32_t underlying_type
A profiler color corresponds to a 32-bit value.
Definition: profiling.hpp:104
::std::uint8_t channel_value
Each color channel is an 8-bit value.
Definition: profiling.hpp:107
CUstream handle_t
The CUDA driver's raw handle for streams.
Definition: types.hpp:239
static constexpr color_t Black() noexcept
Some basic colors, for convenience.
Definition: profiling.hpp:139
void name(const ::std::thread &host_thread, const CharT *name)
Have the profiler refer to a given host thread, using a specified string identifier (rather than its ...
Wrapper class for a CUDA device.
Definition: device.hpp:135
void name(const device_t &device, const CharT *name)
Have the profiler assign a name to a certain CUDA device.
Definition: profiling.hpp:410
underlying_type as_hex() const noexcept
Definition: profiling.hpp:125
An RGB colorspace color value, with potential transparency, which may be used to color elements in ti...
Definition: profiling.hpp:102