7 #ifndef CUDA_API_WRAPPERS_CONTEXT_HPP_ 8 #define CUDA_API_WRAPPERS_CONTEXT_HPP_ 86 bool take_ownership =
false) noexcept;
90 ::std::string identify(
const context_t& context);
95 auto status = cuCtxGetLimit(&limit_value, limit_id);
102 auto status = cuCtxSetLimit(limit_id, new_value);
106 constexpr flags_t
inline make_flags(
108 bool keep_larger_local_mem_after_resize)
111 sync_scheduling_policy
112 | (keep_larger_local_mem_after_resize ? CU_CTX_LMEM_RESIZE_TO_MAX : 0);
118 auto needed_push = current::detail_::push_if_not_on_top(context_handle);
119 auto device_id = current::detail_::get_device_id();
121 current::detail_::pop();
129 bool take_ownership =
false);
131 inline size_t total_memory(
handle_t handle)
133 size_t total_mem_in_bytes;
134 auto status = cuMemGetInfo(
nullptr, &total_mem_in_bytes);
135 throw_if_error_lazy(status,
"Failed determining amount of total memory for " + identify(handle));
136 return total_mem_in_bytes;
140 inline size_t free_memory(
handle_t handle)
142 size_t free_mem_in_bytes;
143 auto status = cuMemGetInfo(&free_mem_in_bytes,
nullptr);
144 throw_if_error_lazy(status,
"Failed determining amount of free memory for " + identify(handle));
145 return free_mem_in_bytes;
150 auto status = cuCtxSetCacheConfig(static_cast<CUfunc_cache>(preference));
152 "Setting the multiprocessor L1/Shared Memory cache distribution preference to " +
153 ::std::to_string(static_cast<unsigned>(preference)) +
" for " + identify(handle));
158 CUfunc_cache preference;
159 auto status = cuCtxGetCacheConfig(&preference);
161 "Obtaining the multiprocessor L1/Shared Memory cache distribution preference for " + identify(handle));
165 #if CUDA_VERSION < 12030 168 CUsharedconfig bank_size;
169 auto status = cuCtxGetSharedMemConfig(&bank_size);
170 throw_if_error_lazy(status,
"Obtaining the multiprocessor shared memory bank size for " + identify(handle));
173 #endif // CUDA_VERSION < 12030 175 #if CUDA_VERSION < 12030 178 auto status = cuCtxSetSharedMemConfig(static_cast<CUsharedconfig>(bank_size));
179 throw_if_error_lazy(status,
"Setting the multiprocessor shared memory bank size for " + identify(handle));
181 #endif // CUDA_VERSION < 12030 186 CAW_SET_SCOPE_CONTEXT(handle);
187 context::current::detail_::synchronize(handle);
192 CAW_SET_SCOPE_CONTEXT(handle);
193 context::current::detail_::synchronize(device_id, handle);
196 inline void destroy(
handle_t handle)
198 auto status = cuCtxDestroy(handle);
204 auto status = cuCtxDestroy(handle);
208 inline context::flags_t get_flags(
handle_t handle)
210 CAW_SET_SCOPE_CONTEXT(handle);
211 return context::current::detail_::get_flags();
247 using flags_type = context::flags_t;
252 ::std::is_same<::std::underlying_type<CUsharedconfig>::type, ::std::underlying_type<cudaSharedMemConfig>::type>::value,
253 "Unexpected difference between enumerators used for the same purpose by the CUDA runtime and the CUDA driver");
263 #if CUDA_VERSION >= 11040 265 using execution_graph_related_attribute_t = CUgraphMem_attribute;
266 #endif // CUDA_VERSION >= 11040 275 : device_id_(device_id), context_handle_(context_handle)
313 size_t size_in_bytes,
315 cuda::memory::managed::initial_visibility_t::to_supporters_of_concurrent_managed_access);
322 CAW_SET_SCOPE_CONTEXT(context_handle_);
323 return context::detail_::total_memory(context_handle_);
331 CAW_SET_SCOPE_CONTEXT(context_handle_);
332 return context::detail_::free_memory(context_handle_);
335 #if CUDA_VERSION >= 11040 341 void free_unused_execution_graph_memory()
const 344 auto status = cuDeviceGraphMemTrim(device_id_);
346 "Trimming memory used for CUDA execution graphs on " + device::detail_::identify(device_id_));
352 size_t get_execution_graph_related_attribute(execution_graph_related_attribute_t attribute)
const 355 auto status = cuDeviceGetGraphMemAttribute(device_id_, attribute, &result);
356 throw_if_error_lazy(status,
"Failed obtaining an execution-graph-related memory attribute for " 357 + device::detail_::identify(device_id_));
361 void reset_execution_graph_usage_high_watermark()
const 363 cuuint64_t value_{0};
364 auto status = cuDeviceSetGraphMemAttribute(device_id_, CU_GRAPH_MEM_ATTR_USED_MEM_HIGH, &value_);
365 throw_if_error_lazy(status,
"Failed setting an execution-graph-related memory attribute for " 366 + device::detail_::identify(device_id_));
369 #endif // CUDA_VERSION >= 11040 376 device::id_t device_id()
const noexcept {
return device_id_; }
389 CAW_SET_SCOPE_CONTEXT(handle_);
390 return context::detail_::total_memory(handle_);
401 CAW_SET_SCOPE_CONTEXT(handle_);
402 return context::detail_::free_memory(handle_);
409 template <
typename Kernel,
typename ... KernelParameters>
413 KernelParameters... parameters)
const;
421 CAW_SET_SCOPE_CONTEXT(handle_);
422 return context::detail_::cache_preference(handle_);
428 CAW_SET_SCOPE_CONTEXT(handle_);
429 return context::detail_::get_limit(CU_LIMIT_STACK_SIZE);
436 CAW_SET_SCOPE_CONTEXT(handle_);
437 return context::detail_::get_limit(CU_LIMIT_PRINTF_FIFO_SIZE);
444 CAW_SET_SCOPE_CONTEXT(handle_);
445 return context::detail_::get_limit(CU_LIMIT_MALLOC_HEAP_SIZE);
457 CAW_SET_SCOPE_CONTEXT(handle_);
458 return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH);
464 return { device_id_, handle_ };
474 CAW_SET_SCOPE_CONTEXT(handle_);
475 return context::detail_::get_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT);
478 #if CUDA_VERSION >= 10000 488 CAW_SET_SCOPE_CONTEXT(handle_);
489 return context::detail_::get_limit(CU_LIMIT_MAX_L2_FETCH_GRANULARITY);
493 #if CUDA_VERSION < 12030 502 CAW_SET_SCOPE_CONTEXT(handle_);
503 return context::detail_::shared_memory_bank_size(handle_);
505 #endif // CUDA_VERSION < 12030 511 return context::current::detail_::is_(handle_);
515 bool is_primary()
const;
520 CAW_SET_SCOPE_CONTEXT(handle_);
522 auto status = cuCtxGetStreamPriorityRange(&result.
least, &result.
greatest);
524 context::detail_::identify(*
this));
532 CAW_SET_SCOPE_CONTEXT(handle_);
533 return context::detail_::get_limit(limit_id);
544 unsigned int raw_version;
545 auto status = cuCtxGetApiVersion(handle_, &raw_version);
546 throw_if_error_lazy(status,
"Failed obtaining the API version for " + context::detail_::identify(*
this));
552 context::flags_t flags()
const 554 return context::detail_::get_flags(handle_);
571 bool keeping_larger_local_mem_after_resize()
const 573 return flags() & CU_CTX_LMEM_RESIZE_TO_MAX;
579 bool will_synchronize_with_default_stream,
586 bool records_timing = event::do_record_timings,
592 template <
typename ContiguousContainer,
593 cuda::detail_::enable_if_t<detail_::is_kinda_like_contiguous_container<ContiguousContainer>::value,
bool> =
true>
596 template <
typename ContiguousContainer,
597 cuda::detail_::enable_if_t<detail_::is_kinda_like_contiguous_container<ContiguousContainer>::value,
bool> =
true>
598 module_t create_module(ContiguousContainer module_data)
const;
605 void enable_access_to(
const context_t& peer)
const;
608 void disable_access_to(
const context_t& peer)
const;
613 CAW_SET_SCOPE_CONTEXT(handle_);
614 #if (CUDA_VERSION >= 11000) 615 auto status = cuCtxResetPersistingL2Cache();
619 cuda::status::insufficient_driver,
620 "Resetting/clearing the persisting L2 cache memory is not supported when compiling CUDA versions lower than 11.0");
625 #if CUDA_VERSION < 12030 634 CAW_SET_SCOPE_CONTEXT(handle_);
635 context::detail_::set_shared_memory_bank_size(handle_, bank_size);
637 #endif // CUDA_VERSION < 12030 647 CAW_SET_SCOPE_CONTEXT(handle_);
648 context::detail_::set_cache_preference(handle_, preference);
655 CAW_SET_SCOPE_CONTEXT(handle_);
656 return context::detail_::set_limit(limit_id, new_value);
664 return set_limit(CU_LIMIT_STACK_SIZE, new_value);
669 return set_limit(CU_LIMIT_PRINTF_FIFO_SIZE, new_value);
674 return set_limit(CU_LIMIT_MALLOC_HEAP_SIZE, new_value);
679 return set_limit(CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH, new_value);
684 return set_limit(CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT, new_value);
703 bool take_ownership) noexcept
704 : device_id_(device_id), handle_(context_id), owning_(take_ownership)
713 bool take_ownership) noexcept;
719 context_t(other.device_id_, other.handle_,
false)
723 context_t(other.device_id_, other.handle_, other.owning_)
725 other.owning_ =
false;
731 cuCtxDestroy(handle_);
742 ::std::swap(device_id_, other.device_id_);
743 ::std::swap(handle_, other.handle_);
744 ::std::swap(owning_, other.owning_);
766 return lhs.device_id() == rhs.device_id() and lhs.handle() == rhs.handle();
771 return not (lhs == rhs);
785 bool take_ownership) noexcept
787 return { device_id, context_id, take_ownership };
797 return wrap(device_id, context_handle, take_ownership);
802 host_thread_sync_scheduling_policy_t sync_scheduling_policy = automatic,
803 bool keep_larger_local_mem_after_resize =
false)
805 auto flags = context::detail_::make_flags(
806 sync_scheduling_policy,
807 keep_larger_local_mem_after_resize);
809 auto status = cuCtxCreate(&handle, flags, device_id);
811 + device::detail_::identify(device_id));
839 host_thread_sync_scheduling_policy_t sync_scheduling_policy = heuristic,
840 bool keep_larger_local_mem_after_resize =
false);
854 host_thread_sync_scheduling_policy_t sync_scheduling_policy = heuristic,
855 bool keep_larger_local_mem_after_resize =
false);
866 auto handle = detail_::get_handle();
867 if (handle == context::detail_::none) {
868 throw ::std::runtime_error(
"Attempt to obtain the current CUDA context when no context is current.");
870 return context::detail_::from_handle(handle);
882 return detail_::set(context.handle());
889 return context::current::detail_::push_if_not_on_top(context.handle());
901 return context::current::detail_::push(context.handle());
912 static constexpr
const bool do_not_take_ownership {
false };
915 auto handle = context::current::detail_::pop();
916 auto device_id = context::detail_::get_device_id(handle);
917 return context::wrap(device_id, handle, do_not_take_ownership);
933 inline context_t get_with_fallback_push()
935 auto handle = push_default_if_missing();
936 return context::detail_::from_handle(handle);
949 inline ::std::string identify(
const context_t& context)
951 return identify(context.handle(), context.device_id());
970 context::detail_::synchronize(context.device_id(), context.handle());
975 #endif // CUDA_API_WRAPPERS_CONTEXT_HPP_ CUlinkState handle_t
A raw CUDA driver handle for a linking-process.
Definition: link.hpp:40
void reset_persisting_l2_cache() const
Clear the L2 cache memory which persists between invocations of kernels.
Definition: context.hpp:611
version_t api_version() const
Returns a version number corresponding to the capabilities of this context, which can be used...
Definition: context.hpp:542
A convenience class for holding, setting and inspecting options for a CUDA binary code linking proces...
Definition: link_options.hpp:130
size_t stack_size() const
Definition: context.hpp:426
Proxy class for a CUDA stream.
Definition: stream.hpp:246
CUsharedconfig shared_memory_bank_size_t
Choice of the number of bytes in each bank of the shared memory.
Definition: context.hpp:44
Wrapper class for a CUDA context.
Definition: context.hpp:244
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
int priority_t
CUDA streams have a scheduling priority, with lower values meaning higher priority.
Definition: types.hpp:246
detail_::region_helper< memory::region_t > region_t
A child class of the generic region_t with some managed-memory-specific functionality.
Definition: memory.hpp:1960
The full set of possible configuration parameters for launching a kernel on a GPU.
Definition: launch_configuration.hpp:69
CUcontext handle_t
Raw CUDA driver handle for a context; see {context_t}.
Definition: types.hpp:878
Wrapper class for a CUDA event.
Definition: event.hpp:133
A class to create a faux member in a context_t, in lieu of an in-class namespace (which C++ does not ...
Definition: context.hpp:262
A range of priorities supported by a CUDA context; ranges from the higher numeric value to the lower...
Definition: context.hpp:50
context::limit_value_t get_limit(context::limit_t limit_id) const
Get one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:530
context::stream_priority_range_t stream_priority_range() const
Get the range of priority values one can set for streams in this context.
Definition: context.hpp:518
CUdevice id_t
Numeric ID of a CUDA device used by the CUDA Runtime API.
Definition: types.hpp:850
Wrapper class for a CUDA code module.
Definition: module.hpp:123
size_t amount_free() const
Amount of free global memory on the CUDA device's primary context.
Definition: context.hpp:329
stream::priority_t least
Higher numeric value, lower priority.
Definition: context.hpp:52
CUlimit limit_t
Features of contexts which can be configured individually during a context's lifetime.
Definition: context.hpp:37
void launch(Kernel &&kernel, launch_configuration_t launch_configuration, KernelParameters &&... parameters)
Variant of enqueue_launch for use with the default stream in the current context. ...
Definition: kernel_launch.hpp:394
void set_limit(context::limit_t limit_id, context::limit_value_t new_value) const
Set one of the configurable limits for this context (and events, streams, kernels, etc.
Definition: context.hpp:653
void set_shared_memory_bank_size(context::shared_memory_bank_size_t bank_size) const
Sets the shared memory bank size, described in this Parallel-for-all blog entry
Definition: context.hpp:632
constexpr bool is_trivial() const
When true, stream prioritization is not supported, i.e.
Definition: context.hpp:61
bool push_if_not_on_top(const context_t &context)
Push a (reference to a) context onto the top of the context stack - unless that context is already at...
Definition: context.hpp:887
void synchronize() const
Avoid executing any additional instructions on this thread until all work on all streams in this cont...
Definition: context.hpp:693
Wrappers for Runtime API functions involving versions - of the CUDA runtime and of the CUDA driver...
size_t amount_total() const
Amount of total global memory on the CUDA device's primary context.
Definition: context.hpp:320
host_thread_sync_scheduling_policy_t
Scheduling policies the CUDA driver may use when the host-side thread it is running in needs to wait ...
Definition: types.hpp:884
context::limit_value_t memory_allocation_heap_size() const
Definition: context.hpp:442
::std::size_t size_t
A size type for use throughout the wrappers library (except when specific API functions limit the siz...
Definition: types.hpp:81
global_memory_type memory() const
Get a wrapper object for this context's associated device-global memory.
Definition: context.hpp:462
context_t pop()
Pop the top off of the context stack.
Definition: context.hpp:910
size_t limit_value_t
Type for the actual values for context (see limit_t for the possible kinds of limits whose value can ...
Definition: context.hpp:41
A (base?) class for exceptions raised by CUDA code; these errors are thrown by essentially all CUDA R...
Definition: error.hpp:271
void synchronize(const context_t &context)
Waits for all previously-scheduled tasks on all streams (= queues) in a CUDA context to conclude...
Definition: context.hpp:968
multiprocessor_cache_preference_t
L1-vs-shared-memory balance option.
Definition: types.hpp:804
void push(const context_t &context)
Push a (reference to a) context onto the top of the context stack.
Definition: context.hpp:899
link_t wrap(device::id_t device_id, context::handle_t context_handle, link::handle_t handle, const link::options_t &options, bool take_ownership=false) noexcept
Wrap an existing CUDA link-process in a link_t wrapper class instance.
Definition: link.hpp:281
context::host_thread_sync_scheduling_policy_t sync_scheduling_policy() const
Gets the synchronization policy to be used for threads synchronizing with this CUDA context...
Definition: context.hpp:566
#define throw_if_error_lazy(status__,...)
A macro for only throwing an error if we've failed - which also ensures no string is constructed unle...
Definition: error.hpp:316
bool operator==(const context_t &lhs, const context_t &rhs) noexcept
Definition: context.hpp:762
multiprocessor_cache_preference_t cache_preference() const
Determines the balance between L1 space and shared memory space set for kernels executing within this...
Definition: context.hpp:419
static version_t from_single_number(combined_version_t combined_version) noexcept
Parse the combined single-number representation, separating it.
Definition: versions.hpp:46
Facilities for exception-based handling of Runtime and Driver API errors, including a basic exception...
A structure representing a CUDA release version.
Definition: versions.hpp:39
Can be shared between processes. Must not be able to record timings.
Definition: constants.hpp:96
Fundamental CUDA-related constants and enumerations, not dependent on any more complex abstractions...
The scheduling priority of a stream created without specifying any other priority value ...
Definition: types.hpp:249
size_t total_memory() const
The amount of total global device memory available to this context, including memory already allocate...
Definition: context.hpp:387
The thread calling event_.synchronize() will enter a busy-wait loop; this (might) minimize delay betw...
Definition: constants.hpp:70
bool is_owning() const noexcept
Definition: context.hpp:381
context::limit_value_t maximum_outstanding_kernel_launches() const
Definition: context.hpp:472
Can only be used by the process which created it.
Definition: constants.hpp:95
size_t free_memory() const
The amount of unallocated global device memory available to this context and not yet allocated...
Definition: context.hpp:399
void set_cache_preference(multiprocessor_cache_preference_t preference) const
Controls the balance between L1 space and shared memory space for kernels executing within this conte...
Definition: context.hpp:645
bool is_primary(const context_t &context)
Definition: context.hpp:51
bool is_current() const
Definition: context.hpp:509
context::limit_value_t maximum_depth_of_child_grid_sync_calls() const
Definition: context.hpp:455
void stack_size(context::limit_value_t new_value) const
Set the limit on the size of the stack a kernel thread can use when running.
Definition: context.hpp:662
context::shared_memory_bank_size_t shared_memory_bank_size() const
Returns the shared memory bank size, as described in this Parallel-for-all blog entry ...
Definition: context.hpp:500
Wrapper class for a CUDA device.
Definition: device.hpp:135
initial_visibility_t
The choices of which categories of CUDA devices a managed memory region must be visible to...
Definition: types.hpp:753
Fundamental CUDA-related type definitions.
stream::priority_t greatest
Lower numeric value, higher priority.
Definition: context.hpp:55
context::limit_value_t printf_buffer_size() const
Definition: context.hpp:434