7 #ifndef CUDA_API_WRAPPERS_CONTEXT_HPP_ 8 #define CUDA_API_WRAPPERS_CONTEXT_HPP_ 86 bool take_ownership =
false) noexcept;
90 ::std::string identify(
const context_t& context);
95 auto status = cuCtxGetLimit(&limit_value, limit_id);
102 auto status = cuCtxSetLimit(limit_id, new_value);
106 constexpr flags_t
inline make_flags(
108 bool keep_larger_local_mem_after_resize)
111 sync_scheduling_policy
112 | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
118 auto needed_push = current::detail_::push_if_not_on_top(context_handle);
119 auto device_id = current::detail_::get_device_id();
121 current::detail_::pop();
129 bool take_ownership =
false);
131 inline size_t total_memory(
handle_t handle)
133 size_t total_mem_in_bytes;
134 auto status = cuMemGetInfo(
nullptr, &total_mem_in_bytes);
135 throw_if_error_lazy(status,
"Failed determining amount of total memory for " + identify(handle));
136 return total_mem_in_bytes;
140 inline size_t free_memory(
handle_t handle)
142 size_t free_mem_in_bytes;
143 auto status = cuMemGetInfo(&free_mem_in_bytes,
nullptr);
144 throw_if_error_lazy(status,
"Failed determining amount of free memory for " + identify(handle));
145 return free_mem_in_bytes;
150 auto status = cuCtxSetCacheConfig(static_cast<CUfunc_cache>(preference));
152 "Setting the multiprocessor L1/Shared Memory cache distribution preference to " +
153 ::std::to_string(static_cast<unsigned>(preference)) +
" for " + identify(handle));
158 CUfunc_cache preference;
159 auto status = cuCtxGetCacheConfig(&preference);
161 "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + identify(handle));
165 #if CUDA_VERSION < 12030 168 CUsharedconfig bank_size;
169 auto status = cuCtxGetSharedMemConfig(&bank_size);
170 throw_if_error_lazy(status,
"Obtaining the multiprocessor shared memory bank size for " + identify(handle));
173 #endif // CUDA_VERSION < 12030 175 #if CUDA_VERSION < 12030 178 auto status = cuCtxSetSharedMemConfig(static_cast<CUsharedconfig>(bank_size));
179 throw_if_error_lazy(status,
"Setting the multiprocessor shared memory bank size for " + identify(handle));
181 #endif // CUDA_VERSION < 12030 186 CAW_SET_SCOPE_CONTEXT(handle);
187 context::current::detail_::synchronize(handle);
192 CAW_SET_SCOPE_CONTEXT(handle);
193 context::current::detail_::synchronize(device_id, handle);
198 return cuCtxDestroy(handle);
201 inline void destroy(
handle_t handle)
203 auto status = destroy_nothrow(handle);
209 auto status = destroy_nothrow(handle);
213 inline context::flags_t get_flags(
handle_t handle)
215 CAW_SET_SCOPE_CONTEXT(handle);
216 return context::current::detail_::get_flags();
252 using flags_type = context::flags_t;
257 ::std::is_same<::std::underlying_type<CUsharedconfig>::type, ::std::underlying_type<cudaSharedMemConfig>::type>::value,
258 "Unexpected difference between enumerators used for the same purpose by the CUDA runtime and the CUDA driver");
268 #if CUDA_VERSION >= 11040 270 using execution_graph_related_attribute_t = CUgraphMem_attribute;
271 #endif // CUDA_VERSION >= 11040 280 : device_id_(device_id), context_handle_(context_handle)
318 size_t size_in_bytes,
320 cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access)
const;
327 CAW_SET_SCOPE_CONTEXT(context_handle_);
328 return context::detail_::total_memory(context_handle_);
336 CAW_SET_SCOPE_CONTEXT(context_handle_);
337 return context::detail_::free_memory(context_handle_);
340 #if CUDA_VERSION >= 11040 346 void free_unused_execution_graph_memory()
const 349 auto status = cuDeviceGraphMemTrim(device_id_);
351 "Trimming memory used for CUDA execution graphs on " + device::detail_::identify(device_id_));
357 size_t get_execution_graph_related_attribute(execution_graph_related_attribute_t attribute)
const 360 auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result);
361 throw_if_error_lazy(status,
"Failed obtaining an execution-graph-related memory attribute for " 362 + device::detail_::identify(device_id_));
366 void reset_execution_graph_usage_high_watermark()
const 368 cuuint64_t value_{0};
369 auto status = cuDeviceSetGraphMemAttribute(device_id_, CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, &value_);
370 throw_if_error_lazy(status,
"Failed setting an execution-graph-related memory attribute for " 371 + device::detail_::identify(device_id_));
374 #endif // CUDA_VERSION >= 11040 381 device::id_t device_id()
const noexcept {
return device_id_; }
394 CAW_SET_SCOPE_CONTEXT(handle_);
395 return context::detail_::total_memory(handle_);
406 CAW_SET_SCOPE_CONTEXT(handle_);
407 return context::detail_::free_memory(handle_);
414 template <
typename Kernel,
typename ... KernelParameters>
418 KernelParameters... parameters)
const;
426 CAW_SET_SCOPE_CONTEXT(handle_);
427 return context::detail_::cache_preference(handle_);
433 CAW_SET_SCOPE_CONTEXT(handle_);
434 return context::detail_::get_limit(CU_LIMIT_STACK_SIZE);
441 CAW_SET_SCOPE_CONTEXT(handle_);
442 return context::detail_::get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
449 CAW_SET_SCOPE_CONTEXT(handle_);
450 return context::detail_::get_limit(CU_LIMIT_MALLOC_HEAP_SIZE);
462 CAW_SET_SCOPE_CONTEXT(handle_);
463 return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH);
469 return { device_id_, handle_ };
479 CAW_SET_SCOPE_CONTEXT(handle_);
480 return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT);
483 #if CUDA_VERSION >= 10000 493 CAW_SET_SCOPE_CONTEXT(handle_);
494 return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY);
498 #if CUDA_VERSION < 12030 507 CAW_SET_SCOPE_CONTEXT(handle_);
508 return context::detail_::shared_memory_bank_size(handle_);
510 #endif // CUDA_VERSION < 12030 516 return context::current::detail_::is_(handle_);
520 bool is_primary()
const;
525 CAW_SET_SCOPE_CONTEXT(handle_);
527 auto status = cuCtxGetStreamPriorityRange(&result.
least, &result.
greatest);
529 context::detail_::identify(*
this));
537 CAW_SET_SCOPE_CONTEXT(handle_);
538 return context::detail_::get_limit(limit_id);
549 unsigned int raw_version;
550 auto status = cuCtxGetApiVersion(handle_, &raw_version);
551 throw_if_error_lazy(status,
"Failed obtaining the API version for " + context::detail_::identify(*
this));
557 context::flags_t flags()
const 559 return context::detail_::get_flags(handle_);
576 bool keeping_larger_local_mem_after_resize()
const 578 return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
584 bool will_synchronize_with_default_stream,
591 bool records_timing = event::do_record_timings,
597 template <
typename ContiguousContainer,
598 cuda::detail_::enable_if_t<detail_::is_kinda_like_contiguous_container<ContiguousContainer>::value,
bool> =
true>
601 template <
typename ContiguousContainer,
602 cuda::detail_::enable_if_t<detail_::is_kinda_like_contiguous_container<ContiguousContainer>::value,
bool> =
true>
603 module_t create_module(ContiguousContainer module_data)
const;
610 void enable_access_to(
const context_t& peer)
const;
613 void disable_access_to(
const context_t& peer)
const;
618 CAW_SET_SCOPE_CONTEXT(handle_);
619 #if (CUDA_VERSION >= 11000) 620 auto status = cuCtxResetPersistingL2Cache();
624 cuda::status::insufficient_driver,
625 "Resetting/clearing the persisting L2 cache memory is not supported when compiling CUDA versions lower than 11.0");
630 #if CUDA_VERSION < 12030 639 CAW_SET_SCOPE_CONTEXT(handle_);
640 context::detail_::set_shared_memory_bank_size(handle_, bank_size);
642 #endif // CUDA_VERSION < 12030 652 CAW_SET_SCOPE_CONTEXT(handle_);
653 context::detail_::set_cache_preference(handle_, preference);
660 CAW_SET_SCOPE_CONTEXT(handle_);
661 return context::detail_::set_limit(limit_id, new_value);
669 return set_limit(CU_LIMIT_STACK_SIZE, new_value);
674 return set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_value);
679 return set_limit(CU_LIMIT_MALLOC_HEAP_SIZE, new_value);
684 return set_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH, new_value);
689 return set_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT, new_value);
708 bool take_ownership) noexcept
709 : device_id_(device_id), handle_(context_id), owning_(take_ownership)
718 bool take_ownership) noexcept;
724 context_t(other.device_id_, other.handle_,
false)
728 context_t(other.device_id_, other.handle_, other.owning_)
730 other.owning_ =
false;
735 if (not owning_) {
return; }
736 #if THROW_IN_DESTRUCTORS 737 context::detail_::destroy(handle_, device_id_);
739 context::detail_::destroy_nothrow(handle_);
748 ::std::swap(device_id_, other.device_id_);
749 ::std::swap(handle_, other.handle_);
750 ::std::swap(owning_, other.owning_);
772 return lhs.device_id() == rhs.device_id() and lhs.handle() == rhs.handle();
777 return not (lhs == rhs);
791 bool take_ownership) noexcept
793 return { device_id, context_id, take_ownership };
803 return wrap(device_id, context_handle, take_ownership);
808 host_thread_sync_scheduling_policy_t sync_scheduling_policy = automatic,
809 bool keep_larger_local_mem_after_resize =
false)
811 auto flags = context::detail_::make_flags(
812 sync_scheduling_policy,
813 keep_larger_local_mem_after_resize);
815 #if CUDA_VERSION >= 13000 817 CUctxCreateParams creation_params = {};
818 auto status = cuCtxCreate(&handle, &creation_params, flags, device_id);
820 auto status = cuCtxCreate(&handle, flags, device_id);
823 + device::detail_::identify(device_id));
851 host_thread_sync_scheduling_policy_t sync_scheduling_policy = heuristic,
852 bool keep_larger_local_mem_after_resize =
false);
866 host_thread_sync_scheduling_policy_t sync_scheduling_policy = heuristic,
867 bool keep_larger_local_mem_after_resize =
false);
878 auto handle = detail_::get_handle();
879 if (handle == context::detail_::none) {
880 throw ::std::runtime_error(
"Attempt to obtain the current CUDA context when no context is current.");
882 return context::detail_::from_handle(handle);
894 return detail_::set(context.handle());
901 return context::current::detail_::push_if_not_on_top(context.handle());
913 return context::current::detail_::push(context.handle());
924 static constexpr
const bool do_not_take_ownership {
false };
927 auto handle = context::current::detail_::pop();
928 auto device_id = context::detail_::get_device_id(handle);
929 return context::wrap(device_id, handle, do_not_take_ownership);
945 inline context_t get_with_fallback_push()
947 auto handle = push_default_if_missing();
948 return context::detail_::from_handle(handle);
961 inline ::std::string identify(
const context_t& context)
963 return identify(context.handle(), context.device_id());
982 context::detail_::synchronize(context.device_id(), context.handle());
987 #endif // CUDA_API_WRAPPERS_CONTEXT_HPP_ CUlinkState handle_t
A raw CUDA driver handle for a linking-process.
Definition: link.hpp:38
void reset_persisting_l2_cache() const
Clear the L2 cache memory which persists between invocations of kernels.
Definition: context.hpp:616
version_t api_version() const
Returns a version number corresponding to the capabilities of this context, which can be used to...
Definition: context.hpp:547
A convenience class for holding, setting and inspecting options for a CUDA binary code linking proces...
Definition: link_options.hpp:130
size_t stack_size() const
Definition: context.hpp:431
Proxy class for a CUDA stream.
Definition: stream.hpp:258
link_t create(const link::options_t &options=link::options_t{})
Create a new link-process (before adding any compiled images or image-files)
Definition: link.hpp:272
CUsharedconfig shared_memory_bank_size_t
Choice of the number of bytes in each bank of the shared memory.
Definition: context.hpp:44
Wrapper class for a CUDA context.
Definition: context.hpp:249
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:243
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1974
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:880
Wrapper class for a CUDA event.
Definition: event.hpp:147
A class to create a faux member in a context_t, in lieu of an in-class namespace (which C++ does not ...
Definition: context.hpp:267
A range of priorities supported by a CUDA context; ranges from the higher numeric value to the lower...
Definition: context.hpp:50
context::limit_value_t get_limit(context::limit_t limit_id) const
Get one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:535
context::stream_priority_range_t stream_priority_range() const
Get the range of priority values one can set for streams in this context.
Definition: context.hpp:523
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:852
Wrapper class for a CUDA code module.
Definition: module.hpp:126
size_t amount_free() const
Amount of free global memory on the CUDA device's primary context.
Definition: context.hpp:334
stream::priority_t least
Higher numeric value, lower priority.
Definition: context.hpp:52
CUlimit limit_t
Features of contexts which can be configured individually during a context's lifetime.
Definition: context.hpp:37
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:396
void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const
Set one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:658
void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: context.hpp:637
constexpr bool is_trivial() const
When true, stream prioritization is not supported, i.e.
Definition: context.hpp:61
bool push_if_not_on_top(const context_t &context)
Push a (reference to a) context onto the top of the context stack - unless that context is already at...
Definition: context.hpp:899
void synchronize() const
Avoid executing any additional instructions on this thread until all work on all streams in this cont...
Definition: context.hpp:698
Wrappers for Runtime API functions involving versions - of the CUDA runtime and of the CUDA driver...
size_t amount_total() const
Amount of total global memory on the CUDA device's primary context.
Definition: context.hpp:325
host_thread_sync_scheduling_policy_t
Scheduling policies the CUDA driver may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:886
context::limit_value_t memory_allocation_heap_size() const
Definition: context.hpp:447
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:78
global_memory_type memory() const
Get a wrapper object for this context's associated device-global memory.
Definition: context.hpp:467
context_t pop()
Pop the top off of the context stack.
Definition: context.hpp:922
size_t limit_value_t
Type for the actual values for context (see limit_t for the possible kinds of limits whose value can ...
Definition: context.hpp:41
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:282
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:980
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:806
void push(const context_t &context)
Push a (reference to a) context onto the top of the context stack.
Definition: context.hpp:911
link_t wrap(device::id_t device_id, context::handle_t context_handle, link::handle_t handle, const link::options_t &options, bool take_ownership=false) noexcept
Wrap an existing CUDA link-process in a link_t wrapper class instance.
Definition: link.hpp:294
context::host_thread_sync_scheduling_policy_t sync_scheduling_policy() const
Gets the synchronization policy to be used for threads synchronizing with this CUDA context...
Definition: context.hpp:571
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:327
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:768
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing within this...
Definition: context.hpp:424
static version_t from_single_number(combined_version_t combined_version) noexcept
Parse the combined single-number representation, separating it.
Definition: versions.hpp:46
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
A structure representing a CUDA release version.
Definition: versions.hpp:39
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
the scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:246
size_t total_memory() const
The amount of total global device memory available to this context, including memory already allocate...
Definition: context.hpp:392
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
bool is_owning() const noexcept
Definition: context.hpp:386
context::limit_value_t maximum_outstanding_kernel_launches() const
Definition: context.hpp:477
Can only be used by the process which created it.
Definition: constants.hpp:95
size_t free_memory() const
The amount of unallocated global device memory available to this context and not yet allocated...
Definition: context.hpp:404
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing within this conte...
Definition: context.hpp:650
bool is_primary(const context_t &context)
Definition: context.hpp:49
bool is_current() const
Definition: context.hpp:514
context::limit_value_t maximum_depth_of_child_grid_sync_calls() const
Definition: context.hpp:460
void stack_size(context::limit_value_t new_value) const
Set the limit on the size of the stack a kernel thread can use when running.
Definition: context.hpp:667
context::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: context.hpp:505
Wrapper class for a CUDA device.
Definition: device.hpp:135
initial_visibility_t
The choices of which categories CUDA devices must a managed memory region be visible to...
Definition: types.hpp:755
Fundamental CUDA-related type definitions.
CUresult status_t
Indicates either the result (success or error index) of a CUDA Runtime or Driver API call...
Definition: types.hpp:74
stream::priority_t greatest
Lower numeric value, higher priority.
Definition: context.hpp:55
context::limit_value_t printf_buffer_size() const
Definition: context.hpp:439