Options for JIT-compilation of CUDA C++ code. More...

#include <compilation_options.hpp>

Inheritance diagram for cuda::rtc::compilation_options_t< cuda_cpp >:

Collaboration diagram for cuda::rtc::compilation_options_t< cuda_cpp >:

Public Types
using	parent = compilation_options_base_t< cuda_cpp >

Public Types inherited from cuda::rtc::compilation_options_base_t< cuda_cpp >
using	optional = cuda::optional< T >

Public Member Functions
compilation_options_t &	clear_language_dialect ()
	Let the compiler interpret the program source code using its default-assumption for the C++ language dialect.

compilation_options_t &	set_language_dialect (cpp_dialect_t dialect)
	Set which dialect of the C++ language the compiler will try to interpret the program source code as. More...

compilation_options_t &	set_language_dialect (const char *dialect_name)
	Set which dialect of the C++ language the compiler will try to interpret the program source code as. More...

compilation_options_t &	set_language_dialect (const ::std::string &dialect_name)
	Set which dialect of the C++ language the compiler will try to interpret the program source code as. More...

compilation_options_t &	suppress_error (error::number_t error_number)
	Ignore compiler findings of the specified number (rather than warning about them or raising an error)

compilation_options_t &	treat_as_error (error::number_t error_number)
	Treat compiler findings of the specified number as an error (rather than suppressing them or just warning about them)

compilation_options_t &	warn_about (error::number_t error_number)
	Treat compiler findings of the specified number as warnings (rather than raising an error or ignoring them)

Public Member Functions inherited from cuda::rtc::compilation_options_base_t< cuda_cpp >
compilation_options_base_t &	add_target (device::compute_capability_t compute_capability)
	Have the compilation also target a specific compute capability. More...

compilation_options_base_t &	set_target (device::compute_capability_t compute_capability)
	Have the compilation target one specific compute capability. More...

compilation_options_base_t &	set_target (device_t device)

Public Member Functions inherited from cuda::rtc::common_ptx_compilation_options_t
virtual optional< caching_mode_t< memory_operation_t::load > > &	default_load_caching_mode ()
	see default_load_caching_mode_

virtual optional< caching_mode_t< memory_operation_t::load > >	default_load_caching_mode () const

Public Attributes
bool	compile_extensible_whole_program { false }
	Do extensible whole program compilation of device code. More...

bool	optimize_device_code_in_debug_mode { false }
	If debug mode is enabled, perform limited optimizations of device code rather than none at all. More...

bool	support_128bit_integers { false }
	Allow the use of the 128-bit `__int128` type in the code.

bool	indicate_function_inlining { false }
	emit a remark when a function is inlined

bool	syntax_check_only { false }
	Stop compilation after the front-end has verified the program's syntax. More...

bool	less_builtins { false }
	Have the compiler not provide support for various builtins: More...

optional< size_t >	maximum_register_count { }
	Specify the maximum amount of registers that GPU functions can use. More...

bool	flush_denormal_floats_to_zero { false }
	When performing single-precision floating-point operations, flush denormal values to zero. More...

bool	use_precise_square_root { true }
	For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation. More...

bool	use_precise_division { true }
	For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation. More...

bool	use_fused_multiply_add { true }
	Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations (FMAD, FFMA, or DFMA). More...

bool	use_fast_math { false }
	Make use of fast math operations. More...

bool	link_time_optimization { false }
	Do not compile fully into PTX/Cubin. More...

bool	source_dirs_in_include_path { true }
	Implicitly add the directories of source files (TODO: Which source files?) as include file search paths. More...

bool	extra_device_vectorization { false }
	Enables more aggressive device code vectorization in the LTO IR optimizer.

optional< cpp_dialect_t >	language_dialect { }
	The dialect of C++ which the compiler will be forced to interpret the program source code as.

::std::unordered_set<::std::string >	no_value_defines
	Preprocessor macros to have the compiler define, without specifying a particular value.

::std::unordered_set<::std::string >	undefines
	Preprocessor macros to tell the compiler to specifically _un_define.

::std::unordered_map<::std::string,::std::string >	valued_defines
	Preprocessor macros to have the compiler define to specific values.

bool	disable_warnings { false }
	Have the compiler treat all warnings as though they were suppressed, and print nothing.

bool	assume_restrict { false }
	Treat all kernel pointer parameters as if they had the `restrict` (or `__restrict`) qualifier.

bool	default_execution_space_is_device { false }
	Assume functions without an explicit specification of their execution space are `__device__` rather than `__host__` functions. More...

bool	display_error_numbers { true }
	Display (error) numbers for warning (and error?) messages, in addition to the message itself.

::std::string	ptxas
	Extra options for the PTX compiler (a.k.a. "PTX optimizing assembler").

::std::vector<::std::string >	additional_include_paths
	A sequence of directories to be searched for headers. More...

::std::vector<::std::string >	preinclude_files
	Header files to preinclude during preprocessing of the source. More...

bool	builtin_move_and_forward { true }
	Provide builtin definitions of ::std::move and ::std::forward. More...

bool	increase_stack_limit_to_max { true }
	Use `setrlimit()` to increase the stack size to the maximum the OS allows. More...

bool	builtin_initializer_list { true }
	Provide builtin definitions of ::std::initializer_list class and member functions. More...

::std::vector<::std::string >	extra_options
	Support for additional, arbitrary options which may not be covered by other fields in this class (e.g. More...

::std::unordered_map< error::number_t, error::handling_method_t >	error_handling_overrides

Public Attributes inherited from cuda::rtc::compilation_options_base_t< cuda_cpp >
::std::unordered_set< cuda::device::compute_capability_t >	targets_
	Target devices in terms of CUDA compute capability. More...

Public Attributes inherited from cuda::rtc::common_ptx_compilation_options_t
optional< ptx_register_count_t >	max_num_registers_per_thread {}
	Limit the number of registers which a kernel thread may use.

optional< grid::block_dimension_t >	min_num_threads_per_block {}
	The minimum number of threads per block which the compiler should target.

optional< optimization_level_t >	optimization_level {}
	Compilation optimization level (as in -O1, -O2 etc.)

optional< device::compute_capability_t >	specific_target
	Which NVIDIA physical architecture to generate SASS code for.

bool	generate_source_line_info {false}
	Generate indications of which PTX/SASS instructions correspond to which lines of the source code, within the compiled output.

bool	generate_debug_info {false}
	Generate debugging information associating SASS instructions to locations in the source, embedding it within the compilation output (-g)

optional< caching_mode_t< memory_operation_t::load > >	default_load_caching_mode_
	Which of the memory-load-instruction caching modes (see {caching_mode_t}) to use by default, when no caching mode is specified in a PTX instruction. More...

bool	generate_relocatable_device_code { false }
	Generate relocatable code that can be linked with other relocatable device code. More...

Detailed Description

template<>
class cuda::rtc::compilation_options_t< cuda_cpp >

Options for JIT-compilation of CUDA C++ code.

Member Function Documentation

◆ set_language_dialect() [1/3]

compilation_options_t& cuda::rtc::compilation_options_t< cuda_cpp >::set_language_dialect ( cpp_dialect_t dialect )

inline

Set which dialect of the C++ language the compiler will try to interpret the program source code as.

◆ set_language_dialect() [2/3]

compilation_options_t& cuda::rtc::compilation_options_t< cuda_cpp >::set_language_dialect ( const char * dialect_name )

inline

Set which dialect of the C++ language the compiler will try to interpret the program source code as.

◆ set_language_dialect() [3/3]

compilation_options_t& cuda::rtc::compilation_options_t< cuda_cpp >::set_language_dialect ( const ::std::string & dialect_name )

inline

Set which dialect of the C++ language the compiler will try to interpret the program source code as.

Member Data Documentation

◆ additional_include_paths

::std::vector<::std::string> cuda::rtc::compilation_options_t< cuda_cpp >::additional_include_paths

A sequence of directories to be searched for headers.

These paths are searched after the list of headers given to nvrtcCreateProgram.

Note: The members here are ::std::string's rather than const char* or ::std::string_view's, since this class is a value-type, and cannot rely on someone else keeping these strings alive.

Todo: In C++17, consider making the elements ::std::filesystem::path's.

◆ builtin_initializer_list

bool cuda::rtc::compilation_options_t< cuda_cpp >::builtin_initializer_list { true }

Provide builtin definitions of ::std::initializer_list class and member functions.

Note: Only relevant when the dialect is C++11 or later.

◆ builtin_move_and_forward

bool cuda::rtc::compilation_options_t< cuda_cpp >::builtin_move_and_forward { true }

Provide builtin definitions of ::std::move and ::std::forward.

Note: Only relevant when the dialect is C++11 or later.

◆ compile_extensible_whole_program

bool cuda::rtc::compilation_options_t< cuda_cpp >::compile_extensible_whole_program { false }

Do extensible whole program compilation of device code.

Todo: Explain what that is.

◆ default_execution_space_is_device

bool cuda::rtc::compilation_options_t< cuda_cpp >::default_execution_space_is_device { false }

Assume functions without an explicit specification of their execution space are __device__ rather than __host__ functions.

◆ extra_options

::std::vector<::std::string> cuda::rtc::compilation_options_t< cuda_cpp >::extra_options

Support for additional, arbitrary options which may not be covered by other fields in this class (e.g. due to newer CUDA versions providing them).

Note: These are appended to the command-line verbatim (so, no prefixing with - signs, no combining pairs of consecutive elements as opt=value etc.)

◆ flush_denormal_floats_to_zero

bool cuda::rtc::compilation_options_t< cuda_cpp >::flush_denormal_floats_to_zero { false }

When performing single-precision floating-point operations, flush denormal values to zero.

Note: Setting use_fast_math implies setting this to true.

◆ increase_stack_limit_to_max

bool cuda::rtc::compilation_options_t< cuda_cpp >::increase_stack_limit_to_max { true }

Use setrlimit() to increase the stack size to the maximum the OS allows.

The limit is reverted to its previous value after compilation.

Note:

- Only works on Linux.
- Affects the entire process, not just the thread invoking the compilation command.

◆ less_builtins

bool cuda::rtc::compilation_options_t< cuda_cpp >::less_builtins { false }

Have the compiler not provide support for various builtins:

Texture and surface functions & associated types
"CUDA runtime" device-side functions, e.g. cudaMalloc.
Kernel launches from within device code.
Other CUDA types & macros defined in driver_types.h, e.g. cudaError_t.

◆ link_time_optimization

bool cuda::rtc::compilation_options_t< cuda_cpp >::link_time_optimization { false }

Do not compile fully into PTX/Cubin.

Instead, only generate NVIDIA's "LTO IR", which is combined with other LTO IR pieces from object files compiled with LTO support, at device link time.

◆ maximum_register_count

optional<size_t> cuda::rtc::compilation_options_t< cuda_cpp >::maximum_register_count { }

Specify the maximum amount of registers that GPU functions can use.

Up to a function-specific limit, a higher value will generally increase the performance of individual GPU threads that execute this function. However, because thread registers are allocated from a global register pool on each GPU, a higher value of this option will also reduce the maximum thread block size, thereby reducing the amount of thread parallelism. Hence, a good maxrregcount value is the result of a trade-off. If this option is not specified, then no maximum is assumed. A value less than the minimum number of registers required by the ABI will be bumped up by the compiler to the ABI minimum limit.

◆ optimize_device_code_in_debug_mode

bool cuda::rtc::compilation_options_t< cuda_cpp >::optimize_device_code_in_debug_mode { false }

If debug mode is enabled, perform limited optimizations of device code rather than none at all.

Note: It is not possible to force device code optimizations off in NVRTC in non-debug mode with '--dopt=off' - that's rejected by NVRTC as an invalid option.

◆ preinclude_files

::std::vector<::std::string> cuda::rtc::compilation_options_t< cuda_cpp >::preinclude_files

Header files to preinclude during preprocessing of the source.

Note: The members here are ::std::string's rather than const char* or ::std::string_view's, since this class is a value-type, and cannot rely on someone else keeping these strings alive.

Todo: In C++17, consider making the elements ::std::filesystem::path's.

Todo: Check how these strings are interpreted. Do they need quotation marks? Brackets? Full paths?

◆ source_dirs_in_include_path

bool cuda::rtc::compilation_options_t< cuda_cpp >::source_dirs_in_include_path { true }

Implicitly add the directories of source files (TODO: Which source files?) as include file search paths.

◆ syntax_check_only

bool cuda::rtc::compilation_options_t< cuda_cpp >::syntax_check_only { false }

Stop compilation after the front-end has verified the program's syntax.

Note: When this is set to true, the compilation output must not be used.

◆ use_fast_math

bool cuda::rtc::compilation_options_t< cuda_cpp >::use_fast_math { false }

Make use of fast math operations.

Implies use_fused_multiply_add, not use_precise_division and not use_precise_square_root.

◆ use_fused_multiply_add

bool cuda::rtc::compilation_options_t< cuda_cpp >::use_fused_multiply_add { true }

Enables (disables) the contraction of floating-point multiplies and adds/subtracts into floating-point multiply-add operations (FMAD, FFMA, or DFMA).

Setting use_fast_math implies setting this to true.

◆ use_precise_division

bool cuda::rtc::compilation_options_t< cuda_cpp >::use_precise_division { true }

For single-precision floating-point division and reciprocals, use IEEE round-to-nearest mode or use a faster approximation.

Setting use_fast_math implies setting this to false.

◆ use_precise_square_root

bool cuda::rtc::compilation_options_t< cuda_cpp >::use_precise_square_root { true }

For single-precision floating-point square root, use IEEE round-to-nearest mode or use a faster approximation.

Setting use_fast_math implies setting this to false.

The documentation for this class was generated from the following file:

src/cuda/rtc/compilation_options.hpp

Public Types

Public Member Functions

Public Attributes

Detailed Description

template<> class cuda::rtc::compilation_options_t< cuda_cpp >

Member Function Documentation

◆ set_language_dialect() [1/3]

◆ set_language_dialect() [2/3]

◆ set_language_dialect() [3/3]

Member Data Documentation

◆ additional_include_paths

◆ builtin_initializer_list

◆ builtin_move_and_forward

◆ compile_extensible_whole_program

◆ default_execution_space_is_device

◆ extra_options

◆ flush_denormal_floats_to_zero

◆ increase_stack_limit_to_max

◆ less_builtins

◆ link_time_optimization

◆ maximum_register_count

◆ optimize_device_code_in_debug_mode

◆ preinclude_files

◆ source_dirs_in_include_path

◆ syntax_check_only

◆ use_fast_math

◆ use_fused_multiply_add

◆ use_precise_division

◆ use_precise_square_root

template<>
class cuda::rtc::compilation_options_t< cuda_cpp >