cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
types.hpp
Go to the documentation of this file.
1 
17 #pragma once
18 #ifndef CUDA_API_WRAPPERS_COMMON_TYPES_HPP_
19 #define CUDA_API_WRAPPERS_COMMON_TYPES_HPP_
20 
21 #if (__cplusplus < 201103L && (!defined(_MSVC_LANG) || _MSVC_LANG < 201103L))
22 #error "The CUDA Runtime API headers can only be compiled with C++11 or a later version of the C++ language standard"
23 #endif
24 
25 #ifndef __CUDACC__
26 #include <builtin_types.h>
27 #endif
28 
29 #include <type_traits>
30 #include <cassert>
31 #include <cstddef> // for ::std::size_t
32 #include <cstdint>
33 
34 #ifndef __CUDACC__
35 #ifndef __device__
36 #define __device__
37 #define __host__
38 #endif
39 #endif
40 
41 #ifndef CPP14_CONSTEXPR
42 #if __cplusplus >= 201402L
43 #define CPP14_CONSTEXPR constexpr
44 #else
45 #define CPP14_CONSTEXPR
46 #endif
47 #endif
48 
49 
50 #ifdef _MSC_VER
51 /*
52  * Microsoft Visual C++ (upto v2017) does not support the C++
53  * keywords `and`, `or` and `not`. Apparently, the following
54  * include is a work-around.
55  */
56 #include <ciso646>
57 #endif
58 
62 namespace cuda {
63 
/*
 * The different id and handle types - for devices, streams, events, etc. - are
 * just numeric values (mostly useful for breaking dependencies and for
 * interaction with code using the original CUDA APIs); we also have wrapper
 * classes for the entities they identify, constructible from the appropriate
 * handles/ids. These allow convenient access to their related functionality -
 * such as @ref cuda::device_t, @ref cuda::stream_t and @ref cuda::event_t.
 */
72 
73 
79 using status_t = cudaError_t;
80 
81 using size_t = ::std::size_t;
82 
87 using dimensionality_t = unsigned;
88 
89 namespace array {
90 
91 using dimension_t = size_t;
100 template<dimensionality_t NumDimensions>
102 
106 template<>
107 struct dimensions_t<3> // this almost-inherits cudaExtent
108 {
109  dimension_t width, height, depth;
110 
111  constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_, dimension_t depth_)
112  : width(width_), height(height_), depth(depth_) { }
113  constexpr __host__ __device__ dimensions_t(cudaExtent e)
114  : dimensions_t(e.width, e.height, e.depth) { }
115  constexpr __host__ __device__ dimensions_t(const dimensions_t& other)
116  : dimensions_t(other.width, other.height, other.depth) { }
117  constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
118  : dimensions_t(other.width, other.height, other.depth) { }
119 
120  CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(const dimensions_t& other)
121  {
122  width = other.width; height = other.height; depth = other.depth;
123  return *this;
124 
125  }
126  CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(dimensions_t&& other)
127  {
128  width = other.width; height = other.height; depth = other.depth;
129  return *this;
130  }
131 
132  constexpr __host__ __device__ operator cudaExtent(void) const
133  {
134  return { width, height, depth };
135  // Note: We're not using make_cudaExtent here because:
136  // 1. It's not constexpr and
137  // 2. It doesn't do anything except construct the plain struct - as of CUDA 10 at least
138  }
139 
140  constexpr __host__ __device__ size_t volume() const { return width * height * depth; }
141  constexpr __host__ __device__ size_t size() const { return volume(); }
142  constexpr __host__ __device__ dimensionality_t dimensionality() const
143  {
144  return ((width > 1) + (height> 1) + (depth > 1));
145  }
146 
147  // Named constructor idioms
148 
149  static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; }
150 };
151 
155 template<>
156 struct dimensions_t<2>
157 {
158  dimension_t width, height;
159 
160  constexpr __host__ __device__ dimensions_t(dimension_t width_, dimension_t height_)
161  : width(width_), height(height_) { }
162  constexpr __host__ __device__ dimensions_t(const dimensions_t& other)
163  : dimensions_t(other.width, other.height) { }
164  constexpr __host__ __device__ dimensions_t(dimensions_t&& other)
165  : dimensions_t(other.width, other.height) { }
166 
167  CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(const dimensions_t& other)
168  {
169  width = other.width; height = other.height;
170  return *this;
171 
172  }
173  CPP14_CONSTEXPR __host__ __device__ dimensions_t& operator=(dimensions_t&& other)
174  {
175  width = other.width; height = other.height;
176  return *this;
177  }
178 
179  constexpr __host__ __device__ size_t area() const { return width * height; }
180  constexpr __host__ __device__ size_t size() const { return area(); }
181  constexpr __host__ __device__ dimensionality_t dimensionality() const
182  {
183  return ((width > 1) + (height> 1));
184  }
185 
186  // Named constructor idioms
187 
188  static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x }; }
189 };
190 
191 } // namespace array
192 
197 namespace event {
198 
202 using handle_t = cudaEvent_t;
203 
204 } // namespace event
205 
211 namespace stream {
212 
216 using handle_t = cudaStream_t;
217 
223 using priority_t = int;
224 enum : priority_t {
230 };
231 
232 } // namespace stream
233 
234 namespace grid {
235 
243 using dimension_t = decltype(dim3::x);
244 
257 
258 
271 struct dimensions_t // this almost-inherits dim3
272 {
273  dimension_t x, y, z;
274  constexpr __host__ __device__ dimensions_t(dimension_t x_ = 1, dimension_t y_ = 1, dimension_t z_ = 1)
275  : x(x_), y(y_), z(z_) { }
276 
277  constexpr __host__ __device__ dimensions_t(const uint3& v) : dimensions_t(v.x, v.y, v.z) { }
278  constexpr __host__ __device__ dimensions_t(const dim3& dims) : dimensions_t(dims.x, dims.y, dims.z) { }
279  constexpr __host__ __device__ dimensions_t(dim3&& dims) : dimensions_t(dims.x, dims.y, dims.z) { }
280 
281  constexpr __host__ __device__ operator uint3(void) const { return { x, y, z }; }
282 
283  // This _should_ have been constexpr, but nVIDIA have not marked the dim3 constructors
284  // as constexpr, so it isn't
285  __host__ __device__ operator dim3(void) const { return { x, y, z }; }
286 
287  constexpr __host__ __device__ size_t volume() const { return (size_t) x * y * z; }
288  constexpr __host__ __device__ dimensionality_t dimensionality() const
289  {
290  return ((z > 1) + (y > 1) + (x > 1));
291  }
292 
293  // Named constructor idioms
294 
295  static constexpr __host__ __device__ dimensions_t cube(dimension_t x) { return dimensions_t{ x, x, x }; }
296  static constexpr __host__ __device__ dimensions_t square(dimension_t x) { return dimensions_t{ x, x, 1 }; }
297  static constexpr __host__ __device__ dimensions_t line(dimension_t x) { return dimensions_t{ x, 1, 1 }; }
298  static constexpr __host__ __device__ dimensions_t point() { return dimensions_t{ 1, 1, 1 }; }
299 };
300 
302 constexpr inline bool operator==(const dim3& lhs, const dim3& rhs) noexcept
303 {
304  return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
305 }
306 constexpr inline bool operator==(const dimensions_t& lhs, const dimensions_t& rhs) noexcept
307 {
308  return lhs.x == rhs.x and lhs.y == rhs.y and lhs.z == rhs.z;
309 }
311 
312 
318 
324  grid::dimensions_t grid;
326 };
327 
328 constexpr inline bool operator==(const complete_dimensions_t lhs, const complete_dimensions_t& rhs) noexcept
329 {
330  return (lhs.grid == rhs.grid) and (lhs.block == rhs.block);
331 }
332 
333 constexpr inline bool operator!=(const complete_dimensions_t lhs, const complete_dimensions_t& rhs) noexcept
334 {
335  return not (lhs == rhs);
336 }
337 
338 } // namespace grid
339 
346 namespace memory {
347 namespace shared {
348 
361 using size_t = unsigned;
362 
363 using bank_size_configuration_t = cudaSharedMemConfig;
364 
365 } // namespace shared
366 } // namespace memory
367 
373  grid::complete_dimensions_t dimensions { 0 , 0 };
374  memory::shared::size_t dynamic_shared_memory_size { 0u };
377  bool block_cooperation { false };
383 
384  // In C++11, an inline initializer for a struct's field costs us a lot
385  // of its defaulted constructors; but - we must initialize the shared
386  // memory size to 0, as otherwise, people might be tempted to initialize
387  // a launch configuration with { num_blocks, num_threads } - and get an
388  // uninitialized shared memory size which they did not expect. So,
389  // we do have the inline initializers above regardless of the language
390  // standard version, and we just have to "pay the price" of spelling things out:
391  launch_configuration_t() = delete;
392  constexpr launch_configuration_t(const launch_configuration_t&) = default;
393  constexpr launch_configuration_t(launch_configuration_t&&) = default;
394 
395  constexpr launch_configuration_t(
396  grid::complete_dimensions_t grid_and_block_dimensions,
397  memory::shared::size_t dynamic_shared_mem = 0u,
398  bool thread_block_cooperation = false
399  ) :
400  dimensions{grid_and_block_dimensions},
401  dynamic_shared_memory_size{dynamic_shared_mem},
402  block_cooperation{thread_block_cooperation}
403  { }
404 
405  constexpr launch_configuration_t(
406  grid::dimensions_t grid_dims,
407  grid::dimensions_t block_dims,
408  memory::shared::size_t dynamic_shared_mem = 0u,
409  bool thread_block_cooperation = false
410  ) : launch_configuration_t(
411  {grid_dims, block_dims},
412  dynamic_shared_mem,
413  thread_block_cooperation)
414  { }
415 
416  // A "convenience" delegating ctor to avoid narrowing-conversion warnings
417  constexpr launch_configuration_t(
418  int grid_dims,
419  int block_dims,
420  memory::shared::size_t dynamic_shared_mem = 0u,
421  bool thread_block_cooperation = false
422  ) : launch_configuration_t(
423  grid::dimensions_t(grid_dims),
424  grid::block_dimensions_t(block_dims),
425  dynamic_shared_mem,
426  thread_block_cooperation)
427  { }
428 
433  return {
434  dimensions.block.x * dimensions.grid.x,
435  dimensions.block.y * dimensions.grid.y,
436  dimensions.block.z * dimensions.grid.z
437  };
438  }
439 };
440 
445  grid::complete_dimensions_t grid_and_block_dimensions,
446  memory::shared::size_t dynamic_shared_memory_size = 0u,
447  bool block_cooperation = false) noexcept
448 {
449  return { grid_and_block_dimensions, dynamic_shared_memory_size, block_cooperation };
450 }
451 
453  grid::dimensions_t grid_dimensions,
454  grid::block_dimensions_t block_dimensions,
455  memory::shared::size_t dynamic_shared_memory_size = 0u,
456  bool block_cooperation = false) noexcept
457 {
458  return { { grid_dimensions, block_dimensions }, dynamic_shared_memory_size, block_cooperation };
459 }
460 
461 constexpr inline bool operator==(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
462 {
463  return
464  lhs.dimensions == rhs.dimensions and
465  lhs.dynamic_shared_memory_size == rhs.dynamic_shared_memory_size and
467 }
468 
469 constexpr inline bool operator!=(const launch_configuration_t lhs, const launch_configuration_t& rhs) noexcept
470 {
471  return not (lhs == rhs);
472 }
473 
483  no_preference = cudaFuncCachePreferNone,
485  equal_l1_and_shared_memory = cudaFuncCachePreferEqual,
487  prefer_shared_memory_over_l1 = cudaFuncCachePreferShared,
489  prefer_l1_over_shared_memory = cudaFuncCachePreferL1,
490  // aliases
491  none = no_preference,
493  prefer_shared = prefer_shared_memory_over_l1,
494  prefer_l1 = prefer_l1_over_shared_memory,
495 };
496 
508  : ::std::underlying_type<cudaSharedMemConfig>::type
509 {
510  device_default = cudaSharedMemBankSizeDefault,
511  four_bytes_per_bank = cudaSharedMemBankSizeFourByte,
512  eight_bytes_per_bank = cudaSharedMemBankSizeEightByte
513 };
514 
519 namespace device {
520 
524 using id_t = int;
525 
530 using attribute_t = cudaDeviceAttr;
534 using attribute_value_t = int;
535 
541 using pair_attribute_t = cudaDeviceP2PAttr;
542 
543 } // namespace device
544 
namespace detail_ {

/**
 * Computes the type actually passed when a kernel parameter is declared as P:
 * references are stripped, arrays decay to element pointers, and function
 * types decay to function pointers. Unlike ::std::decay, cv-qualifiers on
 * non-array, non-function types are preserved.
 */
template<typename P>
struct kernel_parameter_decay {
private:
	typedef typename ::std::remove_reference<P>::type U;
public:
	typedef typename ::std::conditional<
		::std::is_array<U>::value,
		typename ::std::remove_extent<U>::type*,
		typename ::std::conditional<
			::std::is_function<U>::value,
			typename ::std::add_pointer<U>::type,
			U
		>::type
	>::type type;
};

/// Convenience alias for kernel_parameter_decay<P>::type
template<typename P>
using kernel_parameter_decay_t = typename kernel_parameter_decay<P>::type;

} // namespace detail_
575 
582 
593  heuristic = cudaDeviceScheduleAuto,
594 
599 
609  spin = cudaDeviceScheduleSpin,
610 
617  block = cudaDeviceScheduleBlockingSync,
618 
628  yield = cudaDeviceScheduleYield,
629 
632 };
633 
634 using native_word_t = unsigned;
635 
636 namespace kernel {
637 
638 using attribute_t = cudaFuncAttribute;
639 using attribute_value_t = int;
640 
641 } // namespace kernel
642 
643 } // namespace cuda
644 
645 #ifndef __CUDACC__
646 #ifndef __device__
647 #define __device__
648 #define __host__
649 #endif
650 #endif
651 
652 #endif // CUDA_API_WRAPPERS_COMMON_TYPES_HPP_
int attribute_value_t
All CUDA device attributes (cuda::device::attribute_t) have a value of this type. ...
Definition: types.hpp:534
Yield control while waiting for results.
Definition: types.hpp:628
No preference for more L1 cache or for more shared memory; the API can do as it please.
Divide the cache resources to maximize available shared memory at the expense of L1 cache...
Alias for the default behavior; see heuristic .
Definition: types.hpp:598
All definitions and functionality wrapping the CUDA Runtime API.
Definition: array.hpp:20
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:223
Combined dimensions for a grid - in terms of blocks, then also down into the block dimensions complet...
Definition: types.hpp:323
Holds the parameters necessary to "launch" a CUDA kernel (i.e.
Definition: types.hpp:372
bool block_cooperation
The number of bytes each grid block may use, in addition to the statically-allocated shared memory da...
Definition: types.hpp:377
Divide the cache resources to maximize available L1 cache at the expense of shared memory...
unsigned size_t
Each physical core ("Symmetric Multiprocessor") on an nVIDIA GPU has a space of shared memory (see th...
Definition: types.hpp:361
cudaDeviceAttr attribute_t
CUDA devices have both "attributes" and "properties".
Definition: types.hpp:530
A richer (kind-of-a-)wrapper for CUDA's dim3 class, used to specify dimensions for blocks (in terms o...
Definition: types.hpp:271
cudaStream_t handle_t
The CUDA Runtime API's handle for streams.
Definition: types.hpp:216
unsigned dimensionality_t
The index or number of dimensions of an entity (as opposed to the extent in any dimension) - typicall...
Definition: types.hpp:87
cudaDeviceP2PAttr pair_attribute_t
While Individual CUDA devices have individual "attributes" (attribute_t), there are also attributes c...
Definition: types.hpp:541
cudaEvent_t handle_t
The CUDA Runtime APIs' handle for events.
Definition: types.hpp:202
constexpr launch_configuration_t make_launch_config(grid::complete_dimensions_t grid_and_block_dimensions, memory::shared::size_t dynamic_shared_memory_size=0u, bool block_cooperation=false) noexcept
a named constructor idiom for a launch_config_t
Definition: types.hpp:444
Default behavior; yield or spin based on a heuristic.
Definition: types.hpp:593
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:229
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:481
Block the thread until results are available.
Definition: types.hpp:617
dimension_t block_dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:256
constexpr grid::dimensions_t combined_grid_dimensions() const
The overall dimensions, in thread, of the launch grid.
Definition: types.hpp:432
decltype(dim3::x) dimension_t
CUDA kernels are launched in grids of blocks of threads, in 3 dimensions.
Definition: types.hpp:243
Representation, allocation and manipulation of CUDA-related memory, with its various namespaces and k...
multiprocessor_shared_memory_bank_size_option_t
A physical core (SM)'s shared memory has multiple "banks"; at most one datum per bank may be accessed...
Definition: types.hpp:507
host_thread_synch_scheduling_policy_t
Scheduling policies the Runtime API may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:581
Divide the cache resources equally between actual L1 cache and shared memory.
int id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:524
cudaError_t status_t
Indicates either the result (success or error index) of a CUDA Runtime API call, or the overall statu...
Definition: types.hpp:79
see heuristic
Definition: types.hpp:631
CUDA's array memory-objects are multi-dimensional; but their dimensions, or extents, are not the same as cuda::grid::dimensions_t ; they may be much larger in each axis.
Definition: types.hpp:101
Keep control and spin-check for result availability.
Definition: types.hpp:609