Miscellaneous functions provided by cuda-kat which are not a good fit in any other header. More...

#include "common.cuh"
#include <kat/detail/pointers.cuh>
#include <type_traits>
#include <limits>
#include <cassert>

Classes
struct	kat::detail::integer_type_struct< Signed, NumBits >

struct	kat::detail::integer_type_struct< false, 8 >

struct	kat::detail::integer_type_struct< false, 16 >

struct	kat::detail::integer_type_struct< false, 32 >

struct	kat::detail::integer_type_struct< false, 64 >

struct	kat::detail::integer_type_struct< true, 8 >

struct	kat::detail::integer_type_struct< true, 16 >

struct	kat::detail::integer_type_struct< true, 32 >

struct	kat::detail::integer_type_struct< true, 64 >

Typedefs
template<std::size_t NumBits>
using	kat::detail::int_t = typename detail::integer_type_struct< true, NumBits >::type
	A templating by size of the signed integer types.

template<std::size_t NumBits>
using	kat::detail::uint_t = typename detail::integer_type_struct< false, NumBits >::type
	A templating by size of the unsigned integer types.

Functions
KAT_FD void	kat::detail::copy (uint32_t *__restrict__ destination, const uint32_t *__restrict__ source, std::size_t num_elements_to_copy)

KAT_FD void	kat::detail::copy (uint16_t *__restrict__ destination, const uint16_t *__restrict__ source, std::size_t num_elements_to_copy)

KAT_FD void	kat::detail::copy (uint8_t *__restrict__ destination, const uint8_t *__restrict__ source, std::size_t num_elements_to_copy)

template<typename T , bool AssumeSameAlignmentWithinWord = false>
KAT_FD T *	kat::copy (T *__restrict__ destination, const T *__restrict__ source, std::size_t num_elements_to_copy)
	Copies some data from one location to another - using the native register size for individual elements on CUDA GPUs, i.e. More...

template<typename I >
constexpr KAT_FHD I	kat::num_warp_sizes_to_cover (I number_of_threads)
	Return the number of full warps in a linear grid which would, overall, contain at least a given number of threads. More...

Detailed Description

Miscellaneous functions provided by cuda-kat which are not a good fit in any other header.

Function Documentation

Note: Assumes num_elements_to_copy > 0 and the same misalignment of the source and destination w.r.t. native words.

Note: Assumes num_elements_to_copy > 0 and the same misalignment of the source and destination w.r.t. native words.

Note: Assumes num_elements_to_copy > 0 and the same misalignment of the source and destination w.r.t. native words.

template<typename T , bool AssumeSameAlignmentWithinWord = false>

Copies some data from one location to another - using the native register size for individual elements on CUDA GPUs, i.e. sizeof(int) = 4 bytes.

Note: CUDA's own general-purpose memcpy() takes void pointers and uses a u8 (byte) LD-ST loop (see: https://godbolt.org/z/9ChTPM ); this function instead performs its loads and stores using the native register size, 4 bytes, where possible. This function assumes appropriate alignment. Instead of using this function, you're probably better off using a warp-level or block-level primitive for copying data.

Parameters

destination	Destination of the copy. Must have at least sizeof(T) * `num_elements_to_copy` bytes allocated. Data must be self-aligned, i.e. the numeric value of this parameter must be divisible by sizeof(T).
source	The beginning of the memory region from which to copy. There must be sizeof(T) * `num_elements_to_copy` bytes readable starting with this address. Data must be self-aligned, i.e. the numeric value of this parameter must be divisible by sizeof(T).
num_elements_to_copy	the number of elements of data to copy - not their total size in bytes!

template<typename I >

constexpr KAT_FHD I kat::num_warp_sizes_to_cover ( I number_of_threads )

Return the number of full warps in a linear grid which would, overall, contain at least a given number of threads.

Note: This comes in handy more often than you might expect, even in device-side code. The reason this function is defined directly, rather than using the functions in math or constexpr_math, is that bit-counting is either slow at run-time on the GPU when you use the constexpr way of doing it, or not constexpr if you use the GPU-side population count instruction.