cuda-api-wrappers
Thin C++-flavored wrappers for the CUDA Runtime API
multi_wrapper_impls.hpp
#pragma once
#ifndef MULTI_WRAPPER_IMPLS_HPP_
#define MULTI_WRAPPER_IMPLS_HPP_

#include <cuda/api/array.hpp>
#include <cuda/api/device.hpp>
#include <cuda/api/event.hpp>
#include <cuda/api/kernel_launch.hpp>
#include <cuda/api/pointer.hpp>
#include <cuda/api/stream.hpp>
#include <cuda/api/unique_ptr.hpp>
#include <cuda_runtime.h>

#include <type_traits>
#include <vector>
#include <algorithm>

namespace cuda {

namespace array {

namespace detail_ {

template<typename T>
inline cudaArray* allocate(device_t& device, array::dimensions_t<3> dimensions)
{
    device::current::detail_::scoped_override_t set_device_for_this_scope(device.id());
    return allocate_on_current_device<T>(dimensions);
}

template<typename T>
inline cudaArray* allocate(device_t& device, array::dimensions_t<2> dimensions)
{
    device::current::detail_::scoped_override_t set_device_for_this_scope(device.id());
    return allocate_on_current_device<T>(dimensions);
}

} // namespace detail_

} // namespace array

namespace event {

inline event_t create(
    device_t&  device,
    bool       uses_blocking_sync,
    bool       records_timing,
    bool       interprocess)
{
    auto device_id = device.id();
    // Yes, we need the ID explicitly even on the current device,
    // because event_t's don't have an implicit device ID.
    return event::detail_::create(device_id, uses_blocking_sync, records_timing, interprocess);
}

namespace ipc {

inline handle_t export_(event_t& event)
{
    return detail_::export_(event.id());
}

inline event_t import(device_t& device, const handle_t& handle)
{
    return event::detail_::wrap(device.id(), detail_::import(handle), do_not_take_ownership);
}

} // namespace ipc

} // namespace event

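// A minimal usage sketch for the event-creation and IPC wrappers above
// (illustrative only; the device index is an assumption, and actually passing
// the handle to another process - e.g. over a pipe - is left out):
//
//   auto device = cuda::device::get(0);
//   auto event  = cuda::event::create(
//       device,
//       cuda::event::sync_by_busy_waiting,  // uses_blocking_sync
//       false,                              // records_timing (interprocess events must not record timings)
//       true);                              // interprocess
//   auto handle = cuda::event::ipc::export_(event);
//   // ... hand `handle` to another process, which can then obtain a proxy via:
//   //     auto imported_event = cuda::event::ipc::import(device, handle);
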

// device_t methods

inline stream_t device_t::default_stream() const noexcept
{
    return stream::detail_::wrap(id(), stream::default_stream_id);
}

inline stream_t
device_t::create_stream(
    bool                will_synchronize_with_default_stream,
    stream::priority_t  priority)
{
    device::current::detail_::scoped_override_t set_device_for_this_scope(id_);
    return stream::detail_::wrap(id(), stream::detail_::create_on_current_device(
        will_synchronize_with_default_stream, priority), do_take_ownership);
}

namespace device {
namespace current {

inline scoped_override_t::scoped_override_t(device_t& device) : parent(device.id()) { }
inline scoped_override_t::scoped_override_t(device_t&& device) : parent(device.id()) { }

} // namespace current
} // namespace device


namespace detail_ {

} // namespace detail_

template <typename KernelFunction, typename ... KernelParameters>
void device_t::launch(
    bool thread_block_cooperativity,
    KernelFunction kernel_function, launch_configuration_t launch_configuration,
    KernelParameters ... parameters)
{
    return default_stream().enqueue.kernel_launch(
        thread_block_cooperativity, kernel_function, launch_configuration, parameters...);
}

inline event_t device_t::create_event(
    bool uses_blocking_sync,
    bool records_timing,
    bool interprocess)
{
    // The current implementation of event::create is not super-smart,
    // but it's probably not worth trying to improve just this function.
    return event::create(*this, uses_blocking_sync, records_timing, interprocess);
}

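// A minimal usage sketch for the device_t members defined above (illustrative
// only; the device index and the priority value are assumptions, not anything
// this header mandates):
//
//   auto device = cuda::device::get(0);
//   auto stream = device.create_stream(
//       true,  // will_synchronize_with_default_stream
//       0);    // priority - lower values mean higher scheduling priority
//   auto event  = device.create_event();  // all three flags take their defaults
//   device.default_stream().synchronize();
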
// event_t methods

inline device_t event_t::device() const noexcept
{
    return cuda::device::get(device_id_);
}

inline void event_t::record(const stream_t& stream)
{
    // Note:
    // TODO: Perhaps check the device ID here, rather than
    // have the Runtime API call fail?
    event::detail_::enqueue(stream.id(), id_);
}

inline void event_t::fire(const stream_t& stream)
{
    record(stream);
    stream.synchronize();
}


// stream_t methods

inline device_t stream_t::device() const noexcept
{
    return cuda::device::get(device_id_);
}

inline void stream_t::enqueue_t::wait(const event_t& event_)
{
    auto device_id = associated_stream.device_id_;
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id);

    // Required by the CUDA runtime API; the flags value is currently unused
    constexpr const unsigned int flags = 0;

    auto status = cudaStreamWaitEvent(associated_stream.id_, event_.id(), flags);
    throw_if_error(status,
        ::std::string("Failed scheduling a wait for event ") + cuda::detail_::ptr_as_hex(event_.id())
        + " on stream " + cuda::detail_::ptr_as_hex(associated_stream.id_)
        + " on CUDA device " + ::std::to_string(device_id));
}

inline event_t& stream_t::enqueue_t::event(event_t& existing_event)
{
    auto device_id = associated_stream.device_id_;
    if (existing_event.device_id() != device_id) {
        throw ::std::invalid_argument("Attempt to enqueue a CUDA event associated with device "
            + ::std::to_string(existing_event.device_id()) + " to be triggered by a stream on CUDA device "
            + ::std::to_string(device_id));
    }
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id);
    stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, existing_event.id());
    return existing_event;
}

inline event_t stream_t::enqueue_t::event(
    bool uses_blocking_sync,
    bool records_timing,
    bool interprocess)
{
    auto device_id = associated_stream.device_id_;
    device::current::detail_::scoped_override_t set_device_for_this_scope(device_id);

    event_t ev { event::detail_::create_on_current_device(device_id, uses_blocking_sync, records_timing, interprocess) };
    // Note that, at this point, the event is not yet associated with this enqueue object's stream.
    stream::detail_::record_event_on_current_device(device_id, associated_stream.id_, ev.id());
    return ev;
}

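// A minimal sketch of how the stream-and-event facilities above interact
// (illustrative only; the two streams and the device index are assumptions):
//
//   auto device   = cuda::device::get(0);
//   auto stream_1 = device.create_stream(true, 0);
//   auto stream_2 = device.create_stream(true, 0);
//   auto event    = device.create_event();
//   stream_1.enqueue.event(event);  // record `event` after stream_1's current work
//   stream_2.enqueue.wait(event);   // work enqueued on stream_2 after this call
//                                   // will not start until `event` has occurred
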
namespace memory {

template <typename T>
inline device_t pointer_t<T>::device() const noexcept
{
    return cuda::device::get(attributes().device);
}

namespace async {

inline void copy(void *destination, const void *source, size_t num_bytes, const stream_t& stream)
{
    detail_::copy(destination, source, num_bytes, stream.id());
}

template <typename T, dimensionality_t NumDimensions>
inline void copy(array_t<T, NumDimensions>& destination, const T* source, const stream_t& stream)
{
    detail_::copy(destination, source, stream.id());
}

template <typename T, dimensionality_t NumDimensions>
inline void copy(T* destination, const array_t<T, NumDimensions>& source, const stream_t& stream)
{
    detail_::copy(destination, source, stream.id());
}

template <typename T>
inline void copy_single(T& destination, const T& source, const stream_t& stream)
{
    detail_::copy_single(&destination, &source, sizeof(T), stream.id());
}

} // namespace async

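// A minimal usage sketch for the asynchronous copy wrappers above (illustrative
// only; the element count, device index, and the region's start() accessor from
// memory.hpp are assumptions):
//
//   constexpr std::size_t n = 1024;
//   std::vector<float> host_data(n, 1.0f);
//   auto device        = cuda::device::get(0);
//   auto stream        = device.create_stream(true, 0);
//   auto device_region = cuda::memory::device::allocate(device, n * sizeof(float));
//   cuda::memory::async::copy(
//       device_region.start(), host_data.data(), n * sizeof(float), stream);
//   stream.synchronize();  // the copy is only guaranteed to have completed after this
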
namespace device {

inline region_t allocate(cuda::device_t device, size_t size_in_bytes)
{
    return detail_::allocate(device.id(), size_in_bytes);
}

namespace async {

inline region_t allocate(const stream_t& stream, size_t size_in_bytes)
{
    return detail_::allocate(stream.device().id(), stream.id(), size_in_bytes);
}

inline void set(void* start, int byte_value, size_t num_bytes, const stream_t& stream)
{
    detail_::set(start, byte_value, num_bytes, stream.id());
}

inline void zero(void* start, size_t num_bytes, const stream_t& stream)
{
    detail_::zero(start, num_bytes, stream.id());
}

} // namespace async

template<typename T>
inline unique_ptr<T> make_unique(device_t device, size_t num_elements)
{
    static_assert(::std::is_array<T>::value,
        "make_unique<T>(device, num_elements) can only be invoked for T being an array type, T = U[]");
    cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id());
    return cuda::memory::detail_::make_unique<T, detail_::allocator, detail_::deleter>(num_elements);
}

template <typename T>
inline unique_ptr<T> make_unique(device_t device)
{
    cuda::device::current::detail_::scoped_override_t set_device_for_this_scope(device.id());
    return cuda::memory::detail_::make_unique<T, detail_::allocator, detail_::deleter>();
}

} // namespace device

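// A minimal usage sketch for the device-memory allocation helpers above
// (illustrative only; the element count and the unique_ptr-style get() accessor
// are assumptions):
//
//   auto device = cuda::device::get(0);
//   auto stream = device.create_stream(true, 0);
//   // An owning pointer to 1024 floats in device-global memory:
//   auto device_array = cuda::memory::device::make_unique<float[]>(device, 1024);
//   // An owning pointer to a single value in device-global memory:
//   auto single_value = cuda::memory::device::make_unique<float>(device);
//   cuda::memory::device::async::zero(device_array.get(), 1024 * sizeof(float), stream);
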
namespace managed {

namespace detail_ {

template <typename T>
inline device_t base_region_t<T>::preferred_location() const
{
    auto device_id = detail_::get_scalar_range_attribute<bool>(*this, cudaMemRangeAttributePreferredLocation);
    return cuda::device::get(device_id);
}

template <typename T>
inline void base_region_t<T>::set_preferred_location(device_t& device) const
{
    detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseSetPreferredLocation, device.id());
}

template <typename T>
inline void base_region_t<T>::clear_preferred_location() const
{
    detail_::set_scalar_range_attribute(*this, (cudaMemoryAdvise) cudaMemAdviseUnsetPreferredLocation);
}

} // namespace detail_


inline void advise_expected_access_by(const_region_t region, device_t& device)
{
    detail_::set_scalar_range_attribute(region, cudaMemAdviseSetAccessedBy, device.id());
}

inline void advise_no_access_expected_by(const_region_t region, device_t& device)
{
    detail_::set_scalar_range_attribute(region, cudaMemAdviseUnsetAccessedBy, device.id());
}

template <typename Allocator>
::std::vector<device_t, Allocator> accessors(const_region_t region, const Allocator& allocator)
{
    static_assert(sizeof(cuda::device::id_t) == sizeof(device_t),
        "Unexpected size difference between device IDs and their wrapper class, device_t");

    auto num_devices = cuda::device::count();
    ::std::vector<device_t, Allocator> devices(num_devices, allocator);
    auto device_ids = reinterpret_cast<cuda::device::id_t *>(devices.data());

    auto status = cudaMemRangeGetAttribute(
        device_ids, sizeof(device_t) * devices.size(),
        cudaMemRangeAttributeAccessedBy, region.start(), region.size());
    throw_if_error(status, "Obtaining the IDs of devices with access to the managed memory range at "
        + cuda::detail_::ptr_as_hex(region.start()));
    auto first_invalid_element = ::std::lower_bound(device_ids, device_ids + num_devices, cudaInvalidDeviceId);
    // We may have gotten fewer results than the number of devices on the system,
    // so let's whittle the vector down accordingly.
    if (first_invalid_element - device_ids != num_devices) {
        devices.resize(first_invalid_element - device_ids);
    }

    return devices;
}

namespace async {

inline void prefetch(
    const_region_t   region,
    cuda::device_t   destination,
    const stream_t&  stream)
{
    detail_::prefetch(region, destination.id(), stream.id());
}

} // namespace async


inline region_t allocate(
    cuda::device_t        device,
    size_t                num_bytes,
    initial_visibility_t  initial_visibility)
{
    return detail_::allocate(device.id(), num_bytes, initial_visibility);
}

} // namespace managed

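// A minimal usage sketch for the managed ("unified") memory wrappers above
// (illustrative only; the size, the device index, and the initial_visibility_t
// enumerator name are assumptions rather than something this header defines):
//
//   auto device = cuda::device::get(0);
//   auto stream = device.create_stream(true, 0);
//   auto region = cuda::memory::managed::allocate(
//       device, 4096, cuda::memory::managed::initial_visibility_t::to_all_devices);
//   cuda::memory::managed::advise_expected_access_by(region, device);
//   cuda::memory::managed::async::prefetch(region, device, stream);
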
namespace mapped {

inline region_pair allocate(
    cuda::device_t&     device,
    size_t              size_in_bytes,
    allocation_options  options)
{
    return cuda::memory::mapped::detail_::allocate(device.id(), size_in_bytes, options);
}

} // namespace mapped

} // namespace memory

// kernel_t methods

inline void kernel_t::set_attribute(cudaFuncAttribute attribute, int value)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    auto result = cudaFuncSetAttribute(ptr_, attribute, value);
    throw_if_error(result, "Setting CUDA device function attribute " + ::std::to_string(attribute)
        + " to value " + ::std::to_string(value));
}

inline void kernel_t::opt_in_to_extra_dynamic_memory(cuda::memory::shared::size_t amount_required_by_kernel)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
#if CUDART_VERSION >= 9000
    auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributeMaxDynamicSharedMemorySize, amount_required_by_kernel);
    throw_if_error(result,
        "Trying to opt-in to " + ::std::to_string(amount_required_by_kernel) + " bytes of dynamic shared memory, "
        "exceeding the maximum available on device " + ::std::to_string(device_id_) + " (consider the amount of static shared memory "
        "in use by the function).");
#else
    throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#endif
}

#if defined(__CUDACC__)
// Unfortunately, the CUDA runtime API does not allow for computation of the grid parameters for maximum occupancy
// from code compiled with a host-side-only compiler! See cuda_runtime.h for details

inline ::std::pair<grid::dimension_t, grid::block_dimension_t>
kernel_t::min_grid_params_for_max_occupancy(
    memory::shared::size_t   dynamic_shared_memory_size,
    grid::block_dimension_t  block_size_limit,
    bool                     disable_caching_override)
{
#if CUDART_VERSION <= 10000
    throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#else
    int min_grid_size_in_blocks, block_size;
    auto result = cudaOccupancyMaxPotentialBlockSizeWithFlags(
        &min_grid_size_in_blocks, &block_size,
        ptr_,
        static_cast<::std::size_t>(dynamic_shared_memory_size),
        static_cast<int>(block_size_limit),
        disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
    );
    throw_if_error(result,
        "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) +
        " on device " + ::std::to_string(device_id_) + ".");
    return { min_grid_size_in_blocks, block_size };
#endif
}

template <typename UnaryFunction>
::std::pair<grid::dimension_t, grid::block_dimension_t>
kernel_t::min_grid_params_for_max_occupancy(
    UnaryFunction            block_size_to_dynamic_shared_mem_size,
    grid::block_dimension_t  block_size_limit,
    bool                     disable_caching_override)
{
#if CUDART_VERSION <= 10000
    throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#else
    int min_grid_size_in_blocks, block_size;
    auto result = cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
        &min_grid_size_in_blocks, &block_size,
        ptr_,
        block_size_to_dynamic_shared_mem_size,
        static_cast<int>(block_size_limit),
        disable_caching_override ? cudaOccupancyDisableCachingOverride : cudaOccupancyDefault
    );
    throw_if_error(result,
        "Failed obtaining parameters for a minimum-size grid for kernel " + detail_::ptr_as_hex(ptr_) +
        " on device " + ::std::to_string(device_id_) + ".");
    return { min_grid_size_in_blocks, block_size };
#endif
}
#endif // defined(__CUDACC__)

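// A minimal usage sketch for the kernel_t occupancy helpers above (illustrative
// only; they require compilation with nvcc, per the #if above, and a new-enough
// CUDA runtime; `my_kernel` is a hypothetical kernel):
//
//   __global__ void my_kernel(float* data);
//
//   auto device = cuda::device::get(0);
//   cuda::kernel_t kernel { device, my_kernel, false /* no thread-block cooperation */ };
//   auto grid_params = kernel.min_grid_params_for_max_occupancy();
//       // grid_params.first  - minimum grid size (in blocks) achieving maximum occupancy
//       // grid_params.second - the block size to use with it
//   auto max_blocks_per_sm = kernel.maximum_active_blocks_per_multiprocessor(
//       grid_params.second, 0 /* dynamic shared memory per block */);
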
inline void kernel_t::set_preferred_shared_mem_fraction(unsigned shared_mem_percentage)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    if (shared_mem_percentage > 100) {
        throw ::std::invalid_argument("Percentage value can't exceed 100");
    }
#if CUDART_VERSION >= 9000
    auto result = cudaFuncSetAttribute(ptr_, cudaFuncAttributePreferredSharedMemoryCarveout, shared_mem_percentage);
    throw_if_error(result, "Trying to set the carve-out of shared memory/L1 cache memory");
#else
    throw(cuda::runtime_error {cuda::status::not_yet_implemented});
#endif
}

inline kernel::attributes_t kernel_t::attributes() const
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    kernel::attributes_t function_attributes;
    auto status = cudaFuncGetAttributes(&function_attributes, ptr_);
    throw_if_error(status, "Failed obtaining attributes for a CUDA device function");
    return function_attributes;
}

inline void kernel_t::set_cache_preference(multiprocessor_cache_preference_t preference)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    auto result = cudaFuncSetCacheConfig(ptr_, (cudaFuncCache) preference);
    throw_if_error(result,
        "Setting the multiprocessor L1/Shared Memory cache distribution preference for a "
        "CUDA device function");
}


inline void kernel_t::set_shared_memory_bank_size(
    multiprocessor_shared_memory_bank_size_option_t config)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    auto result = cudaFuncSetSharedMemConfig(ptr_, (cudaSharedMemConfig) config);
    throw_if_error(result);
}

inline grid::dimension_t kernel_t::maximum_active_blocks_per_multiprocessor(
    grid::block_dimension_t  num_threads_per_block,
    memory::shared::size_t   dynamic_shared_memory_per_block,
    bool                     disable_caching_override)
{
    device::current::detail_::scoped_override_t set_device_for_this_context(device_id_);
    int result;
    unsigned int flags = disable_caching_override ?
        cudaOccupancyDisableCachingOverride : cudaOccupancyDefault;
    auto status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
        &result, ptr_, num_threads_per_block,
        dynamic_shared_memory_per_block, flags);
    throw_if_error(status, "Failed calculating the maximum occupancy "
        "of device function blocks per multiprocessor");
    return result;
}


template <typename DeviceFunction>
kernel_t::kernel_t(const device_t& device, DeviceFunction f, bool thread_block_cooperation)
    : kernel_t(device.id(), reinterpret_cast<const void*>(f), thread_block_cooperation) { }

namespace stream {

inline stream_t create(
    device_t    device,
    bool        synchronizes_with_default_stream,
    priority_t  priority)
{
    return detail_::create(device.id(), synchronizes_with_default_stream, priority);
}

namespace detail_ {

inline void record_event_on_current_device(device::id_t device_id, stream::id_t stream_id, event::id_t event_id)
{
    auto status = cudaEventRecord(event_id, stream_id);
    throw_if_error(status,
        "Failed scheduling event " + cuda::detail_::ptr_as_hex(event_id) + " to occur"
        + " on stream " + cuda::detail_::ptr_as_hex(stream_id)
        + " on CUDA device " + ::std::to_string(device_id));
}

} // namespace detail_

} // namespace stream

template<typename Kernel, typename... KernelParameters>
inline void enqueue_launch(
    bool                    thread_block_cooperation,
    Kernel                  kernel_function,
    const stream_t&         stream,
    launch_configuration_t  launch_configuration,
    KernelParameters&&...   parameters)
{
    auto unwrapped_kernel_function =
        kernel::unwrap<
            Kernel,
            detail_::kernel_parameter_decay_t<KernelParameters>...
        >(kernel_function);
    // Note: This unwrapping helper is necessary since we may have gotten a kernel_t
    // as Kernel, which is type-erased - in which case we need both to obtain the
    // raw function pointer, and to determine its type, i.e. to un-type-erase it.
    // Luckily, we have the KernelParameters pack which - if we can trust the user -
    // contains more-or-less the function's parameter types; and kernels return
    // `void`, which settles the whole signature.
    //
    // I say "more or less" because the KernelParameters pack may contain some
    // references, arrays and so on - which CUDA kernels cannot accept; so
    // we massage those a bit.

#ifdef DEBUG
    assert(thread_block_cooperation == detail_::intrinsic_block_cooperation_value
        && "mismatched indications of whether thread blocks should be able to cooperate for a kernel");
#endif
    detail_::enqueue_launch(
        thread_block_cooperation,
        unwrapped_kernel_function,
        stream.id(),
        launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

template<typename Kernel, typename... KernelParameters>
inline void launch(
    Kernel                  kernel_function,
    launch_configuration_t  launch_configuration,
    KernelParameters&&...   parameters)
{
    stream_t stream = device::current::get().default_stream();
    enqueue_launch(
        kernel_function,
        stream,
        launch_configuration,
        ::std::forward<KernelParameters>(parameters)...);
}

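// A minimal usage sketch for the launch wrappers above (illustrative only;
// `my_kernel`, the grid/block dimensions and the cuda::make_launch_config helper
// are assumptions made for the example rather than something this header defines):
//
//   __global__ void my_kernel(float* data, std::size_t n);
//
//   constexpr std::size_t n = 1 << 20;
//   auto device = cuda::device::get(0);
//   auto stream = device.create_stream(true, 0);
//   auto buffer = cuda::memory::device::make_unique<float[]>(device, n);
//   auto launch_config = cuda::make_launch_config(
//       2048,   // blocks in the grid
//       512);   // threads per block
//   // On the default stream of the current device:
//   cuda::launch(my_kernel, launch_config, buffer.get(), n);
//   // Or on a specific stream, without thread-block cooperation:
//   cuda::enqueue_launch(false, my_kernel, stream, launch_config, buffer.get(), n);
//   stream.synchronize();
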
} // namespace cuda

#endif // MULTI_WRAPPER_IMPLS_HPP_