cuda-kat
CUDA kernel author's tools
warp.cuh File Reference

GPU device-side versions of std::algorithm-like functions, with warp-level collaboration. More...

#include "common.cuh"
#include <kat/on_device/collaboration/warp.cuh>
#include <type_traits>

Classes

struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >
 
struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >::accumulator
 
struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >::accumulator::atomic
 

Functions

template<typename T , typename AccumulationOp >
KAT_FD T kat::collaborative::warp::reduce (T value, AccumulationOp op)
 Performs a reduction (e.g. a summation or a multiplication) of values from all lanes of a warp. More...
 
template<typename T >
KAT_FD T kat::collaborative::warp::sum (T value)
 
template<typename T , typename AccumulationOp , inclusivity_t Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::scan (T value, AccumulationOp op)
 
template<typename T , inclusivity_t Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::prefix_sum (T value)
 
template<typename T , T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::exclusive_prefix_sum (T value)
 
template<typename RandomAccessIterator , typename Size , typename T >
KAT_FD void kat::collaborative::warp::fill_n (RandomAccessIterator start, Size count, const T &value)
 
template<typename RandomAccessIterator , typename T , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::collaborative::warp::fill (RandomAccessIterator start, RandomAccessIterator end, const T &value)
 
template<typename RandomAccessIterator , typename Size >
KAT_FD void kat::collaborative::warp::memzero_n (RandomAccessIterator start, Size count)
 
template<typename RandomAccessIterator , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::collaborative::warp::memzero (RandomAccessIterator start, RandomAccessIterator end)
 
template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::collaborative::warp::transform_n (const S *__restrict__ source, Size length, T *__restrict__ target, UnaryOperation unary_op)
 Apply a transformation to each element of an array, placing the results in another array. More...
 
template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::transform (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target, UnaryOperation unary_op)
 
template<typename S , typename T , typename Size >
KAT_FD void kat::collaborative::warp::cast_and_copy_n (const S *__restrict__ source, Size length, T *__restrict__ target)
 Have all warp threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types. More...
 
template<typename T , typename U , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::cast_and_copy (const U *__restrict__ source_start, const U *__restrict__ source_end, T *__restrict__ target)
 
template<typename T , typename Size >
KAT_FD void kat::collaborative::warp::detail::naive_copy (const T *__restrict__ source, Size length, T *__restrict__ target)
 A version of kat::copy() which ignores pointer alignment and the memory transaction size, simply making coalesced writes of warp_size elements at a time (except for the last range). More...
 
template<typename T >
constexpr KAT_FHD T kat::collaborative::warp::detail::clear_lower_bits (T x, unsigned k)
 
template<typename T , typename Size , bool MayHaveSlack = true>
KAT_FD void kat::collaborative::warp::copy_n (const T *__restrict__ source, Size length, T *__restrict__ target)
 Has the warp copy data from one place to another. More...
 
template<typename T , bool MayHaveSlack = true, typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::copy (const T *__restrict__ source_start, const T *__restrict__ source_end, T *__restrict__ target_start)
 
template<typename T , typename I , typename Size , typename U = T>
KAT_FD void kat::collaborative::warp::lookup (T *__restrict__ target, const U *__restrict__ lookup_table, const I *__restrict__ indices, Size num_indices)
 Use a lookup table to convert numeric indices to a sequence of values of any type.
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::collaborative::warp::elementwise_accumulate_n (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source, Size length)
 Perform an accumulation operation (e.g. addition) between equal-sized arrays. More...
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::elementwise_accumulate (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source_start, RandomAccessIterator __restrict__ source_end)
 

Detailed Description

GPU device-side versions of std::algorithm-like functions, with warp-level collaboration, i.e. different CUDA warps act independently, but all lanes in each warp collaborate on the same task.

Note
Most functions from std::algorithm are still missing; see the cppreference page for <algorithm> for a full list of them.
Some functions here are not actually in std::algorithm, but might as well have been, e.g. memzero(), which is like std::memset() with 0.
This is the most-divergent version of the std-algorithm-like functions, i.e. don't go looking for thread-level implementations (which would, in fact, be the same as a straightforward CPU-side implementation of std::algorithm); if you find yourself needing those, it is possible, perhaps likely, that you're doing something wrong.
Todo:
Some inclusions in the warp-primitives might only be relevant to the functions here; double-check.
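
Example
A minimal usage sketch of the warp-collaboration model (the kernel name, launch geometry, and the one-warp-per-row assignment are illustrative assumptions, not part of the library; this header is assumed to have been included via the appropriate path):

    #include <cstddef>

    // Each warp zeroes one row of a row-major matrix: warps act independently,
    // while the 32 lanes of each warp collaborate on their warp's row.
    __global__ void zero_rows(float* matrix, std::size_t row_length)
    {
        auto global_thread_id = threadIdx.x + blockIdx.x * blockDim.x;
        auto global_warp_id   = global_thread_id / 32; // 32 == warp size
        kat::collaborative::warp::memzero_n(
            matrix + global_warp_id * row_length, row_length);
    }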

Function Documentation

§ cast_and_copy_n()

template<typename S , typename T , typename Size >
KAT_FD void kat::collaborative::warp::cast_and_copy_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Have all warp threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.

Parameters
target    The destination into which to write the converted elements
source    The origin of the data
length    The number of elements available (for reading) at the source
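
Example
A minimal usage sketch (the kernel name and the single-warp launch are illustrative assumptions): the warp's lanes collaborate in copying float values into a double array, converting along the way.

    __global__ void widen(const float* source, double* target, std::size_t length)
    {
        // All lanes participate; each handles a strided subset of the elements.
        kat::collaborative::warp::cast_and_copy_n(source, length, target);
    }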

§ copy_n()

template<typename T , typename Size , bool MayHaveSlack = true>
KAT_FD void kat::collaborative::warp::copy_n ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Has the warp copy data from one place to another.

Note
if the input is not 32-byte (sometimes 128-byte) aligned, and more importantly, if the output is not 128-byte-aligned, performance will likely degrade, due to the need to execute a pair of memory transactions for every single 32 x 4-byte write.
Template Parameters
T    type of the elements being copied
Size    type of the length parameter
MayHaveSlack    we "like" data whose size is a multiple of 4 bytes, and can copy it faster. When this is false, the overall size of the data to copy is assumed to be a multiple of 4, without taking the time to check. In the future, the semantics of this parameter will change to involve the alignment of the start and end addresses.
Parameters
[out]    target    starting address of the region of memory to copy into
[in]    source    starting address of the region of memory to copy from
[in]    length    number of elements (of type T) to copy
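
Example
A minimal usage sketch (names illustrative; assumes a single-warp launch):

    __global__ void copy_row(const int* source, int* target, std::size_t length)
    {
        // Uses the default MayHaveSlack = true; if the size is known to be
        // suitable (see above), copy_n<int, std::size_t, false> skips the check.
        kat::collaborative::warp::copy_n(source, length, target);
    }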

§ elementwise_accumulate_n()

template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::collaborative::warp::elementwise_accumulate_n ( AccumulatingOperation  op,
D *__restrict__  destination,
RandomAccessIterator __restrict__  source,
Size  length 
)

Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics. Usable with memory locations which the entire warp has the same view of, and accessibility to (mostly shared and global memory, but not only those).

Note
  1. Assumes a linear block.
  2. The operation is supposed to have the signature: WhateverWeDontCare operation(D& accumulator_element, S value); otherwise, it might be a no-op here.
  3. If multiple blocks call this function with the same destination, the accumulation will have to be atomic (as you cannot guarantee these blocks will not execute simultaneously, whether on different multiprocessors or on the same one). Also, if you want to use a global-memory source, you will need to pass this function block-specific offsets; remember, it is not a kernel!
Template Parameters
D    Destination data type
S    Source data type
AccumulatingOperation    Typically, one of the 'accumulator' substructures of the functors in liftedfunctions.hpp, but it may very well be an accumulator::atomic substructure
Size    ... so that you don't have to decide whether to specify the number of elements as an int, uint, long long int, unsigned long long, etc.
Parameters
[in,out]    destination    The array into which we accumulate; it holds existing data and is not simply overwritten.
[in]    source    The array of partial data to integrate via accumulation.
[in]    length    The length, in elements, of destination and source
Todo:
consider taking a GSL-span-like parameter instead of a pointer-plus-length pair
Todo:
Some inclusions in the block-primitives might only be relevant to the functions here; double-check.
Todo:
consider using elementwise_apply for this.
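
Example
A minimal usage sketch (names illustrative; assumes a single warp has exclusive access to destination, so plain, non-atomic semantics suffice). The lambda matches the operation signature from note 2 above:

    __global__ void integrate_partials(
        float* totals, const float* partials, std::size_t length)
    {
        // Accumulates into its first argument, as per the documented signature.
        auto add_into = [](float& accumulator_element, float value) {
            accumulator_element += value;
        };
        kat::collaborative::warp::elementwise_accumulate_n(
            add_into, totals, partials, length);
    }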

§ naive_copy()

template<typename T , typename Size >
KAT_FD void kat::collaborative::warp::detail::naive_copy ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

A version of kat::copy() which ignores pointer alignment and the memory transaction size, simply making coalesced writes of warp_size elements at a time (except for the last range).

Parameters
target    starting address of the region of memory to copy into
source    starting address of the region of memory to copy from
length    number of elements (of type T) to copy
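
Example
Schematically, the access pattern described above amounts to a lane-strided loop (a sketch of the idea only, not the library's actual code):

    // Lane i of the warp writes elements i, i + 32, i + 2*32, ... so that each
    // round of 32 writes is contiguous in memory, and thus coalesced.
    auto lane = threadIdx.x % 32; // 32 == warp size
    for (Size pos = lane; pos < length; pos += 32) {
        target[pos] = source[pos];
    }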

§ reduce()

template<typename T , typename AccumulationOp >
KAT_FD T kat::collaborative::warp::reduce ( T  value,
AccumulationOp  op 
)

Performs a reduction (e.g. a summation or a multiplication) of all values passed into the function by the lanes of a warp, with every lane ending up with the overall reduction result.

Note
What about inclusivity?
Todo:
offer both an inclusive and an exclusive version
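
Example
A minimal usage sketch (names illustrative; assumes the accumulation op mutates its first argument, matching the operation signature documented for elementwise_accumulate_n() above):

    // Every lane passes in its own value; every lane gets back the warp-wide sum.
    auto plus = [](int& accumulator, int value) { accumulator += value; };
    int warp_total = kat::collaborative::warp::reduce(my_lane_value, plus);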

§ transform()

template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::transform ( const S *__restrict__  source_start,
const S *__restrict__  source_end,
T *__restrict__  target,
UnaryOperation  unary_op 
)
Note
Prefer transform_n(); this variant forces the size type to std::ptrdiff_t, which is unnecessarily large.
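
Example
A minimal usage sketch of this iterator-pair variant (names illustrative; assumes a single-warp launch):

    __global__ void negate_all(const float* source, float* target, std::size_t length)
    {
        kat::collaborative::warp::transform(
            source, source + length, target,
            [](float x) { return -x; });
    }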

§ transform_n()

template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::collaborative::warp::transform_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target,
UnaryOperation  unary_op 
)

Apply a transformation to each element of an array, placing the results in another array.

Parameters
source    The (warp-common) origin of the data
target    The (warp-common) destination into which to write the converted elements
length    The (warp-common) number of elements available (for reading) at the source
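
Example
A minimal usage sketch (names and the single-warp launch are illustrative assumptions):

    __global__ void square_all(const float* source, float* target, unsigned length)
    {
        // All lanes of the warp collaborate on the same source, target and length.
        kat::collaborative::warp::transform_n(
            source, length, target,
            [](float x) { return x * x; });
    }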