8 #ifndef CUDA_API_WRAPPERS_RTC_COMPILATION_OPTIONS_HPP_ 9 #define CUDA_API_WRAPPERS_RTC_COMPILATION_OPTIONS_HPP_ 11 #include "cuda/api/detail/option_marshalling.hpp" 13 #include "../api/device_properties.hpp" 14 #include "../api/device.hpp" 15 #include "../api/common_ptx_compilation_options.hpp" 17 #include <unordered_map> 18 #include <unordered_set> 30 enum class cpp_dialect_t {
40 static constexpr
const size_t language_dialect_name_length { 5 };
41 constexpr
const char* cpp_dialect_names[] = {
48 inline cpp_dialect_t cpp_dialect_from_name(
const char* dialect_name) noexcept(
false)
50 for(
auto known_dialect = static_cast<int>(cpp_dialect_t::cpp03);
51 known_dialect <= static_cast<int>(cpp_dialect_t::last);
54 if (strcmp(detail_::cpp_dialect_names[known_dialect], dialect_name) == 0) {
55 return static_cast<cpp_dialect_t
>(known_dialect);
58 throw ::std::invalid_argument(::std::string(
"No C++ dialect named \"") + dialect_name +
'"');
76 static constexpr
const char* parts[] = {
"error",
"suppress",
"warn" };
85 template <source_kind_t Kind>
88 using optional = cuda::optional<T>;
105 ::std::unordered_set<cuda::device::compute_capability_t>
targets_;
120 targets_.insert(compute_capability);
134 add_target(compute_capability);
150 no_optimization = O0,
154 maximum_optimization = O3
163 template <source_kind_t Kind>
174 using parent::parent;
178 bool parse_without_code_generation {
false };
182 bool allow_expensive_optimizations_below_O2 {
false };
193 bool compile_as_tools_patch {
false };
199 bool compile_extensible_whole_program {
false };
203 bool use_fused_multiply_add {
true };
206 bool verbose {
false };
215 bool dont_merge_basicblocks {
false };
218 bool disable_warnings {
false };
221 bool disable_optimizer_constants {
false };
225 bool return_at_end_of_kernel {
false };
229 bool preserve_variable_relocations {
false };
234 bool double_precision_ops {
false };
235 bool local_memory_use {
false };
236 bool registers_spill_to_local_memory {
false };
237 bool indeterminable_stack_size {
true };
240 bool double_demotion {
false };
241 } situation_warnings;
246 optional<rtc::ptx_register_count_t> kernel {};
247 optional<rtc::ptx_register_count_t> device_function {};
248 } maximum_register_counts;
251 struct caching_mode_spec_t {
252 optional<caching_mode_t<memory_operation_t::load>> load {};
253 optional<caching_mode_t<memory_operation_t::store>> store {};
257 caching_mode_spec_t default_ {};
260 caching_mode_spec_t forced {};
267 return caching_modes.default_.load;
274 return caching_modes.default_.load;
285 ::std::vector<::std::string>& entries();
286 ::std::vector<::std::string>& kernels();
287 ::std::vector<::std::string>& kernel_names();
298 using parent::parent;
305 bool compile_extensible_whole_program {
false };
313 bool optimize_device_code_in_debug_mode {
false };
318 bool support_128bit_integers {
false };
323 bool indicate_function_inlining {
false };
325 #if CUDA_VERSION >= 11200 && CUDA_VERSION <= 12200 330 bool compiler_self_identification {
false };
331 #endif // #if CUDA_VERSION >= 11200 && CUDA_VERSION <= 12200 339 bool syntax_check_only {
false };
349 bool less_builtins {
false };
360 optional<size_t> maximum_register_count { };
367 bool flush_denormal_floats_to_zero {
false };
374 bool use_precise_square_root {
true };
381 bool use_precise_division {
true };
388 bool use_fused_multiply_add {
true };
392 bool use_fast_math {
false };
399 bool link_time_optimization {
false };
403 bool source_dirs_in_include_path {
true };
406 bool extra_device_vectorization {
false };
409 optional<cpp_dialect_t> language_dialect { };
421 bool disable_warnings {
false };
424 bool assume_restrict {
false };
428 bool default_execution_space_is_device {
false };
431 bool display_error_numbers {
true };
464 bool builtin_move_and_forward {
true };
475 bool increase_stack_limit_to_max {
true };
482 bool builtin_initializer_list {
true };
493 ::std::unordered_map<error::number_t, error::handling_method_t> error_handling_overrides;
501 language_dialect = {};
509 language_dialect = dialect;
516 return (dialect_name ==
nullptr or *dialect_name ==
'\0') ?
517 clear_language_dialect() :
518 set_language_dialect(detail_::cpp_dialect_from_name(dialect_name));
524 return dialect_name.empty() ?
525 clear_language_dialect() :
526 set_language_dialect(dialect_name.c_str());
533 error_handling_overrides[error_number] = error::suppress;
541 error_handling_overrides[error_number] = error::raise_error;
549 error_handling_overrides[error_number] = error::warn;
554 template <
typename CompilationOptions>
555 inline ::std::string render(
const CompilationOptions& opts)
557 return marshalling::render(opts);
562 namespace marshalling {
566 template <
typename MarshalTarget,
typename Delimiter>
570 MarshalTarget &marshalled, Delimiter delimiter,
571 bool need_delimiter_after_last_option)
573 opt_start_t<Delimiter> opt_start { delimiter };
584 if (opts.
verbose) { marshalled << opt_start <<
"--verbose"; }
588 if (osw.double_precision_ops) { marshalled << opt_start <<
"--warn-on-double-precision-use"; }
589 if (osw.local_memory_use) { marshalled << opt_start <<
"--warn-on-local-memory-usage"; }
590 if (osw.registers_spill_to_local_memory) { marshalled << opt_start <<
"--warn-on-spills"; }
591 if (not osw.indeterminable_stack_size) { marshalled << opt_start <<
"--suppress-stack-size-warning"; }
592 if (osw.double_demotion) { marshalled << opt_start <<
"--suppress-double-demote-warning"; }
594 if (opts.
disable_warnings) { marshalled << opt_start <<
"--disable-warnings"; }
608 marshalled << opt_start <<
"--allow-expensive-optimizations";
616 marshalled << opt_start <<
"--device-function-maxrregcount " << opts.
maximum_register_counts.device_function.value();
620 const auto& ocm = opts.caching_modes;
621 if (ocm.default_.load) { marshalled << opt_start <<
"--def-load-cache " << ocm.
default_.load.value(); }
622 if (ocm.default_.store) { marshalled << opt_start <<
"--def-store-cache " << ocm.default_.store.value(); }
623 if (ocm.forced.load) { marshalled << opt_start <<
"--force-load-cache " << ocm.forced.load.value(); }
624 if (ocm.forced.store) { marshalled << opt_start <<
"--force-store-cache " << ocm.forced.store.value(); }
629 for(
const auto& target : opts.
targets_) {
631 marshalled << opt_start <<
"--gpu-name=" << prefix <<
'_' << target.as_combined_number();
635 marshalled << opt_start <<
"--entry";
638 if (first) { first =
false; }
639 else { marshalled <<
','; }
644 if (need_delimiter_after_last_option) {
645 marshalled << opt_start;
650 template <
typename MarshalTarget,
typename Delimiter>
654 bool need_delimiter_after_last_option)
656 opt_start_t<Delimiter> opt_start { delimiter };
663 #if CUDA_VERSION >= 11200 && CUDA_VERSION <= 12200 664 if (opts.compiler_self_identification) { marshalled << opt_start <<
"--version-ident=true"; }
665 #endif // CUDA_VERSION >= 11200 && CUDA_VERSION <= 12200 666 if (opts.
syntax_check_only) { marshalled << opt_start <<
"--fdevice-syntax-only"; }
667 if (opts.
less_builtins) { marshalled << opt_start <<
"--minimal"; }
671 if (opts.
disable_warnings) { marshalled << opt_start <<
"--disable-warnings"; }
678 if (opts.
use_fast_math) { marshalled << opt_start <<
"--use_fast_math"; }
686 marshalled << opt_start <<
"--dopt=on";
688 if (not opts.
ptxas.empty()) {
689 marshalled << opt_start <<
"--ptxas-options=" << opts.
ptxas;
694 marshalled << opt_start <<
"--std=" << rtc::detail_::cpp_dialect_names[
static_cast<unsigned>(opts.
language_dialect.value())];
703 for(
const auto& target : opts.
targets_) {
704 #if CUDA_VERSION < 11000 705 marshalled << opt_start <<
"--gpu-architecture=compute_" << target.as_combined_number();
707 marshalled << opt_start <<
"--gpu-architecture=sm_" << target.as_combined_number();
712 marshalled << opt_start <<
"-U" << def;
718 marshalled << opt_start <<
"-D" << def;
723 marshalled << opt_start <<
"-D" << def.first <<
'=' << def.second;
727 marshalled << opt_start <<
"--include-path=" << path;
731 marshalled << opt_start <<
"--pre-include=" << preinclude_file;
734 for(
const auto&
override : opts.error_handling_overrides) {
736 << opt_start <<
"--diag-" << rtc::error::detail_::option_name_part(
override.second)
737 <<
'=' <<
override.first ;
741 marshalled << opt_start << extra_opt;
744 if (need_delimiter_after_last_option) {
745 marshalled << opt_start;
756 #endif // CUDA_API_WRAPPERS_RTC_COMPILATION_OPTIONS_HPP_ ::std::vector<::std::string > mangled_entry_function_names
Specifies the GPU kernels, or __global__ functions in CUDA-C++ terms, or .entry functions in PTX term...
Definition: compilation_options.hpp:283
bool default_execution_space_is_device
Assume functions without an explicit specification of their execution space are __device__ rather tha...
Definition: compilation_options.hpp:428
::std::string ptxas
Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler").
Definition: compilation_options.hpp:434
compilation_options_t & clear_language_dialect()
Let the compiler interpret the program source code using its default assumption for the C++ language ...
Definition: compilation_options.hpp:499
bool allow_expensive_optimizations_below_O2
Allow the JIT compiler to perform expensive optimizations using maximum available resources (memory a...
Definition: compilation_options.hpp:182
bool use_fused_multiply_add
Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-poin...
Definition: compilation_options.hpp:388
The CUDA variant of C++, accepted by the NVRTC library.
Definition: types.hpp:41
::std::unordered_map<::std::string,::std::string > valued_defines
Preprocessor macros to have the compiler define to specific values.
Definition: compilation_options.hpp:418
bool compile_as_tools_patch
Compile as patch code for CUDA tools.
Definition: compilation_options.hpp:193
Definitions and functionality wrapping CUDA APIs.
Definition: array.hpp:22
handling_method_t
Possible ways of handling a potentially problematic finding by the compiler in the program source cod...
Definition: compilation_options.hpp:67
bool generate_relocatable_device_code
Generate relocatable code that can be linked with other relocatable device code.
Definition: common_ptx_compilation_options.hpp:195
compilation_options_t & suppress_error(error::number_t error_number)
Ignore compiler findings of the specified number (rather than warning about them or raising an error...
Definition: compilation_options.hpp:531
bool optimize_device_code_in_debug_mode
If debug mode is enabled, perform limited optimizations of device code rather than none at all...
Definition: compilation_options.hpp:313
bool extra_device_vectorization
Enables more aggressive device code vectorization in the LTO IR optimizer.
Definition: compilation_options.hpp:406
unsigned number_t
Errors, or problematic findings, by the compiler are identified by a number of this type...
Definition: compilation_options.hpp:70
bool compile_extensible_whole_program
Expecting only whole-programs to be directly usable, allow some calls to not be resolved until device...
Definition: compilation_options.hpp:199
bool use_fast_math
Make use of fast math operations.
Definition: compilation_options.hpp:392
A numeric designator of the computational capabilities of a CUDA device.
Definition: device_properties.hpp:75
optional< optimization_level_t > optimization_level
Compilation optimization level (as in -O1, -O2 etc.)
Definition: common_ptx_compilation_options.hpp:158
bool support_128bit_integers
Allow the use of the 128-bit __int128 type in the code.
Definition: compilation_options.hpp:318
compilation_options_t & set_language_dialect(const char *dialect_name)
Set which dialect of the C++ language the compiler will try to interpret the program source code as...
Definition: compilation_options.hpp:514
optional< caching_mode_t< memory_operation_t::load > > default_load_caching_mode() const override
Get the caching mode the compiler will be told to use as the default, for load instructions which don...
Definition: compilation_options.hpp:272
Options to be passed to one of the NVIDIA JIT compilers along with a program's source code...
Definition: compilation_options.hpp:164
bool link_time_optimization
Do not compile fully into PTX/Cubin.
Definition: compilation_options.hpp:399
Options for JIT-compilation of CUDA C++ code.
Definition: compilation_options.hpp:292
compilation_options_t & set_language_dialect(cpp_dialect_t dialect)
Set which dialect of the C++ language the compiler will try to interpret the program source code as...
Definition: compilation_options.hpp:507
bool compile_extensible_whole_program
Do extensible whole program compilation of device code.
Definition: compilation_options.hpp:305
optional< caching_mode_t< memory_operation_t::load > > & default_load_caching_mode() override
Get a reference to the caching mode the compiler will be told to use as the default, for load instructions which don't explicitly specify a particular caching mode.
Definition: compilation_options.hpp:265
bool disable_warnings
Have the compiler treat all warnings as though they were suppressed, and print nothing.
Definition: compilation_options.hpp:421
A subset of the options for compiling PTX code into SASS, usable both with the CUDA driver and with N...
Definition: common_ptx_compilation_options.hpp:149
optional< cpp_dialect_t > language_dialect
The dialect of C++ as which the compiler will be forced to interpret the program source code...
Definition: compilation_options.hpp:409
::std::vector<::std::string > additional_include_paths
A sequence of directories to be searched for headers.
Definition: compilation_options.hpp:445
compilation_options_t & set_language_dialect(const ::std::string &dialect_name)
Set which dialect of the C++ language the compiler will try to interpret the program source code as...
Definition: compilation_options.hpp:522
struct cuda::rtc::compilation_options_t< ptx >::@21 maximum_register_counts
Limits on the number of registers which generated object code (of different kinds) is allowed to use...
::std::unordered_set<::std::string > undefines
Preprocessor macros to tell the compiler to specifically _un_define.
Definition: compilation_options.hpp:415
struct cuda::rtc::compilation_options_t< ptx >::@20 situation_warnings
Warnings about situations likely to result in poor performance or other problems. ...
bool syntax_check_only
Stop compilation after the front-end has verified the program's syntax.
Definition: compilation_options.hpp:339
bool return_at_end_of_kernel
Prevents the optimizing away of the return instruction at the end of a program (a kernel...
Definition: compilation_options.hpp:225
bool disable_optimizer_constants
Disable use of the "optimizer constant bank" feature.
Definition: compilation_options.hpp:221
bool generate_debug_info
Generate debugging information associating SASS instructions to locations in the source, embedding it within the compilation output (-g)
Definition: common_ptx_compilation_options.hpp:169
bool use_precise_division
For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a...
Definition: compilation_options.hpp:381
bool assume_restrict
Treat all kernel pointer parameters as if they had the restrict (or __restrict) qualifier.
Definition: compilation_options.hpp:424
compilation_options_base_t & add_target(device::compute_capability_t compute_capability)
Have the compilation also target a specific compute capability.
Definition: compilation_options.hpp:117
bool builtin_move_and_forward
Provide builtin definitions of ::std::move and ::std::forward.
Definition: compilation_options.hpp:464
bool generate_source_line_info
Generate indications of which PTX/SASS instructions correspond to which lines of the source code...
Definition: common_ptx_compilation_options.hpp:165
::std::unordered_set<::std::string > no_value_defines
Preprocessor macros to have the compiler define, without specifying a particular value.
Definition: compilation_options.hpp:412
bool preserve_variable_relocations
Generate relocatable references for variables and preserve relocations generated for them in the link...
Definition: compilation_options.hpp:229
bool dont_merge_basicblocks
Prevent the compiler from merging consecutive basic blocks (https://en.wikipedia.org/wiki/Basic_block...
Definition: compilation_options.hpp:215
bool builtin_initializer_list
Provide builtin definitions of ::std::initializer_list class and member functions.
Definition: compilation_options.hpp:482
bool parse_without_code_generation
Makes the PTX compiler run without producing any CUBIN output (for PTX verification only) ...
Definition: compilation_options.hpp:178
device::compute_capability_t compute_capability() const
Obtains the device's compute capability; see cuda::device::compute_capability_t.
Definition: device.hpp:415
NVIDIA's architecture-independent intermediate program representation language, known as PTX or Parall...
Definition: types.hpp:44
bool verbose
Print code generation statistics along with the compilation log.
Definition: compilation_options.hpp:206
bool use_precise_square_root
For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster appro...
Definition: compilation_options.hpp:374
::std::unordered_set< cuda::device::compute_capability_t > targets_
Target devices in terms of CUDA compute capability.
Definition: compilation_options.hpp:105
compilation_options_t & warn_about(error::number_t error_number)
Treat compiler findings of the specified number as warnings (rather than raising an error or ignoring...
Definition: compilation_options.hpp:547
::std::vector<::std::string > preinclude_files
Header files to preinclude during preprocessing of the source.
Definition: compilation_options.hpp:457
caching_mode_spec_t default_
The caching mode to be used for instructions which don't specify a caching mode.
Definition: compilation_options.hpp:257
compilation_options_t & treat_as_error(error::number_t error_number)
Treat compiler findings of the specified number as an error (rather than suppressing them or just war...
Definition: compilation_options.hpp:539
bool flush_denormal_floats_to_zero
When performing single-precision floating-point operations, flush denormal values to zero...
Definition: compilation_options.hpp:367
bool disable_warnings
The equivalent of suppressing all findings which currently trigger a warning.
Definition: compilation_options.hpp:218
bool display_error_numbers
Display (error) numbers for warning (and error?) messages, in addition to the message itself...
Definition: compilation_options.hpp:431
Compilation options common to all kinds of JIT-compilable programs.
Definition: compilation_options.hpp:86
bool increase_stack_limit_to_max
Use setrlimit() to increase the stack size to the maximum the OS allows.
Definition: compilation_options.hpp:475
Wrapper class for a CUDA device.
Definition: device.hpp:135
bool use_fused_multiply_add
Enable the contraction of multiplications-followed-by-additions (or subtractions) into single fused in...
Definition: compilation_options.hpp:203
Options for JIT-compilation of CUDA PTX code.
Definition: compilation_options.hpp:168
bool indicate_function_inlining
Emit a remark when a function is inlined.
Definition: compilation_options.hpp:323
bool source_dirs_in_include_path
Implicitly add the directories of source files (TODO: Which source files?) as include file search pat...
Definition: compilation_options.hpp:403
::std::vector<::std::string > extra_options
Support for additional, arbitrary options which may not be covered by other fields in this class (e...
Definition: compilation_options.hpp:491
bool less_builtins
Have the compiler not provide support for various builtins:
Definition: compilation_options.hpp:349
int optimization_level_t
The type used to specify the intensity, and extent of allowed implication, of optimization efforts...
Definition: common_ptx_compilation_options.hpp:26
optional< size_t > maximum_register_count
Specify the maximum amount of registers that GPU functions can use.
Definition: compilation_options.hpp:360