cuda-kat
CUDA kernel author's tools
block.cuh File Reference

GPU device-side versions of std::algorithm-like functions, with block-level collaboration, i.e. different CUDA blocks act independently, but all threads in each block collaborate on the same task.

#include "common.cuh"
#include <kat/on_device/collaboration/warp.cuh>
#include <kat/on_device/collaboration/block.cuh>
#include <kat/on_device/sequence_ops/warp.cuh>
#include <kat/on_device/shuffle.cuh>

Classes

struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op >
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(Op &)>
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(Op &) const >
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(*)(Op &)>
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< M(C::*)>
 

Typedefs

template<typename Op >
using kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_t = typename accumulator_op_return_type_helper< Op >::type
 

Functions

template<typename RandomAccessIterator , typename Size , typename T >
KAT_FD void kat::linear_grid::collaborative::block::fill_n (RandomAccessIterator start, Size count, const T &value)
 
template<typename RandomAccessIterator , typename T , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::linear_grid::collaborative::block::fill (RandomAccessIterator start, RandomAccessIterator end, const T &value)
 
template<typename RandomAccessIterator , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::memzero_n (RandomAccessIterator start, Size count)
 
template<typename RandomAccessIterator , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::linear_grid::collaborative::block::memzero (RandomAccessIterator start, RandomAccessIterator end)
 
template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::transform_n (const S *__restrict__ source, Size length, T *__restrict__ target, UnaryOperation unary_op)
 Apply a transformation to each element of an array, placing the results in another array.
 
template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::transform (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target, UnaryOperation unary_op)
 
template<typename S , typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy_n (const S *__restrict__ source, Size length, T *__restrict__ target)
 Have all block threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.
 
template<typename S , typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target)
 
template<typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::copy_n (const T *__restrict__ source, Size length, T *__restrict__ target)
 Block-collaboratively copy data between stretches of memory.
 
template<typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::copy (const T *__restrict__ source_start, const T *__restrict__ source_end, T *__restrict__ target)
 Block-collaboratively copy data between stretches of memory.
 
template<typename T , typename I , typename Size , typename U = T>
KAT_FD void kat::linear_grid::collaborative::block::lookup (T *__restrict__ target, const U *__restrict__ lookup_table, const I *__restrict__ indices, Size num_indices)
 Use a lookup table to convert numeric indices to a sequence of values of any type.
 
template<typename T , typename AccumulationOp , bool AllThreadsObtainResult = false, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::reduce (T value, AccumulationOp op)
 Perform a reduction over a block's worth of data with a specific (asymmetric) accumulation operation, while maintaining the input element type.
 
template<typename T , bool AllThreadsObtainResult = false>
KAT_DEV T kat::linear_grid::collaborative::block::sum (T value)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan (T value, AccumulationOp op, T *__restrict__ scratch)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan (T value, AccumulationOp op)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce (T *__restrict__ scratch, T value, AccumulationOp op, T &scan_result, T &reduction_result)
 Perform both a block-level scan and a block-level reduction, with each thread having the results of both.
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce (T value, AccumulationOp op, T &scan_result, T &reduction_result)
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate_n (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source, Size length)
 Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics.
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source_start, RandomAccessIterator __restrict__ source_end)
 
template<typename Operation , typename Size , typename ResultDatum , typename... Args>
KAT_FD void kat::linear_grid::collaborative::block::elementwise_apply (ResultDatum *__restrict__ results, Size length, Operation op, const Args *__restrict__ ... arguments)
 

Detailed Description

GPU device-side versions of std::algorithm-like functions, with block-level collaboration, i.e. different CUDA blocks act independently, but all threads in each block collaborate on the same task.

Note
Most of the functions found in std::algorithm are still missing here; see the algorithm page on cppreference.com for a full list of them.
Some functions here are not actually in std::algorithm but might as well have been, e.g. memzero(), which is like std::memset() with 0.
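
For a concrete picture, here is a minimal usage sketch. The include path is an assumption (this page does not show where block.cuh lives; kat/on_device/sequence_ops/block.cuh matches the sibling warp.cuh header included above), and the kernel name is hypothetical. Every thread of the block calls the primitive with the same (block-common) arguments, and the threads divide the work among themselves:

#include <cstddef>
#include <kat/on_device/sequence_ops/block.cuh> // assumed path for this header

namespace kb = kat::linear_grid::collaborative::block;

// Hypothetical kernel: all block threads collaborate on each call.
__global__ void init_buffers(int* values, float* sums, std::size_t n)
{
    kb::fill_n(values, n, -1); // block-collaborative analogue of std::fill_n
    kb::memzero_n(sums, n);    // like std::memset with 0
}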

Function Documentation

§ cast_and_copy_n()

template<typename S , typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Have all block threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.

Parameters
source    The (block-common) origin of the data
length    The (block-common) number of elements available (for reading) at the source
target    The (block-common) destination into which to write the converted elements
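
A minimal sketch (the kernel is hypothetical; the argument order follows the signature above - source, length, target):

__global__ void widen_to_double(const float* __restrict__ source,
                                double*      __restrict__ target,
                                size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    // S = float and T = double are deduced; each element is cast on copy.
    kb::cast_and_copy_n(source, length, target);
}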

§ copy()

template<typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::copy ( const T *__restrict__  source_start,
const T *__restrict__  source_end,
T *__restrict__  target 
)

block-collaboratively copy data between stretches of memory

Parameters
source_start    (block-common) location of the first data element to copy
source_end    (block-common) location past the last data element to copy
target    (block-common) location into which to copy the first element
Note
Prefer copy_n(); this variant forces the size type to ptrdiff_t, which is unnecessarily large.

§ copy_n()

template<typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::copy_n ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

block-collaboratively copy data between stretches of memory

Parameters
source    (block-common) location from which to copy data
length    number of elements at source to copy
target    (block-common) location into which to copy the first element
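
A typical use is staging a tile of global memory into shared memory. A hypothetical sketch, assuming the kernel is launched with tile_size * sizeof(float) bytes of dynamic shared memory:

__global__ void stage_tile(const float* __restrict__ input, size_t tile_size)
{
    extern __shared__ float tile[];
    namespace kb = kat::linear_grid::collaborative::block;
    kb::copy_n(input + blockIdx.x * tile_size, tile_size, tile);
    __syncthreads(); // make the staged tile visible to every thread in the block
    // ... all block threads can now read tile[] ...
}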

§ elementwise_accumulate_n()

template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate_n ( AccumulatingOperation  op,
D *__restrict__  destination,
RandomAccessIterator __restrict__  source,
Size  length 
)

Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics.

Usable with memory locations which the entire block has the same view of, and access to (mostly shared and global memory, but not only those).

Note
  1. Assumes a linear block.
  2. The operation is expected to have the signature WhateverWeDontCare operation(D& accumulator_element, S value); otherwise it may be a no-op here.
  3. If multiple blocks call this function with the same destination, the operation will have to be atomic (as you cannot guarantee those blocks will not execute simultaneously, whether on different multiprocessors or on the same one). Also, if you want to use a global-memory source, you will need to pass this function block-specific offsets; remember, it is not a kernel!
Template Parameters
D    Destination data type
S    Source data type
AccumulatingOperation    Typically, one of the 'accumulator' substructures of the functors in liftedfunctions.hpp; but it may very well be an accumulator::atomic substructure
Size    ...so that you don't have to decide whether to specify your number of elements as an int, uint, long long int, unsigned long long, etc.
Parameters
[in,out]  destination    The array into which we accumulate; it holds existing data and is not simply overwritten.
[in]  source    The array of partial data to integrate via accumulation.
[in]  length    The length, in elements, of destination and source
Todo:
Consider taking a GSL-span-like parameter instead of a ptr+length.
Todo:
Some inclusions in the block primitives might only be relevant to the functions here; double-check.
Todo:
Consider using elementwise_apply for this.
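
A minimal sketch with regular (non-atomic) semantics; the kernel is hypothetical, and the lambda follows the operation signature from note 2 above:

__global__ void accumulate_partials(float*       __restrict__ destination,
                                    const float* __restrict__ source,
                                    size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    // Non-atomic accumulation: safe only if no other block accumulates
    // into the same destination concurrently (see note 3 above).
    kb::elementwise_accumulate_n(
        [](float& accumulator_element, float value) { accumulator_element += value; },
        destination, source, length);
}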

§ reduce()

template<typename T , typename AccumulationOp , bool AllThreadsObtainResult = false, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::reduce ( T  value,
AccumulationOp  op 
)

Perform a reduction over a block's worth of data with a specific (asymmetric) accumulation operation, while maintaining the input element type.

Parameters
value    Each thread's contribution to the reduction
op    The accumulation operator; it must have the appropriate operator(), i.e. with signature T AccumulationOp::operator()(T&, T). It does not have to have any other members or types defined (so a lambda works fine).
Returns
For threads in the first warp of the block - the reduction result over all value elements of all block threads; for other threads - the result is undefined in case AllThreadsObtainResult is false, or the same as for the first warp in case AllThreadsObtainResult is true.
Template Parameters
AllThreadsObtainResult    When true, all threads in the block will return the reduction result; otherwise, only the threads of the first warp of the block are guaranteed to return the actual reduction result.
Note
This should work without full block participation, but it does need full warp participation, i.e. each warp either participates fully or not at all.
One might wonder: "Why insist on the same type for the result and the input?" Well, that is not strictly necessary. However, separating the types would require additional template or parameter information: two operators (if not more), and a decision on the point at which we switch to the result type - immediately, after at most k operations, or above the warp level. This would also make it nearly impossible to write "simple" calls to reduce - with a value and a single lambda. We may at some point define a structure for setting these parameters, which will put some onus on the user code but allow for this flexibility. Poke the library author/contributors about this.
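
For illustration, a block-wide sum using a lambda that matches the documented T op(T&, T) signature. The kernel and its indexing are assumptions; int is used because NeutralValue is a non-type template parameter (floating-point non-type parameters require C++20):

__global__ void block_sums(const int* __restrict__ data,
                           int*       __restrict__ per_block_totals)
{
    namespace kb = kat::linear_grid::collaborative::block;
    int contribution = data[blockIdx.x * blockDim.x + threadIdx.x];
    // AllThreadsObtainResult defaults to false, so only the first warp is
    // guaranteed the true result; thread 0 belongs to that warp.
    int total = kb::reduce(contribution, [](int& acc, int v) { return acc += v; });
    if (threadIdx.x == 0) { per_block_totals[blockIdx.x] = total; }
}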

§ scan()

template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan ( T  value,
AccumulationOp  op,
T *__restrict__  scratch 
)
Note
Supports only full warps, and you should probably have the entire block participate.
Parameters
scratch    An area of memory the primitive can use for inter-warp communication; as with scan_and_reduce(), it must have at least warp_size elements of T allocated
value    Each thread's input value, as though the values of all threads were in some input array
Returns
The prefix-accumulation (scan) result for the calling thread's position - inclusive or exclusive of its own value, per the Inclusivity template parameter
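
A sketch of the scratch-taking overload, computing a block-wide inclusive prefix sum (hypothetical kernel; the scratch sizing follows the scan_and_reduce() documentation below):

__global__ void prefix_sums(const int* __restrict__ in, int* __restrict__ out)
{
    namespace kb = kat::linear_grid::collaborative::block;
    __shared__ int scratch[32]; // at least warp_size elements of T
    int x = in[blockIdx.x * blockDim.x + threadIdx.x];
    // Inclusivity defaults to inclusive, so each thread obtains the sum of
    // the values at all thread indices up to and including its own.
    int prefix = kb::scan(x, [](int& acc, int v) { return acc += v; }, scratch);
    out[blockIdx.x * blockDim.x + threadIdx.x] = prefix;
}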

§ scan_and_reduce()

template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce ( T *__restrict__  scratch,
T  value,
AccumulationOp  op,
T &  scan_result,
T &  reduction_result 
)

Perform both a block-level scan and a block-level reduction, with each thread having the results of both.

Note
The implementation relies on details of the implementation of the scan primitive, above.
Todo:
Consider returning a pair rather than using non-const references.
Todo:
Lots of code duplication with just-scan.
Todo:
Add a bool template param allowing the code to assume the block is full (this saves a few ops).

Parameters
scratch    An area of memory which this primitive can use for inter-warp communication (as warps cannot communicate directly). It must have at least warp_size elements allocated (i.e. sizeof(T) * warp_size bytes).
value    Each thread provides its input value, and the scan is applied to all of them as though they were in some input array
scan_result    The result of applying a scan to all threads' input values, in order of the thread indices
reduction_result    The result of reducing all threads' input values
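
A sketch combining both results in one call (hypothetical kernel, one input element per thread):

__global__ void prefix_and_total(const int* __restrict__ in,
                                 int*       __restrict__ prefixes,
                                 int*       __restrict__ totals)
{
    namespace kb = kat::linear_grid::collaborative::block;
    __shared__ int scratch[32]; // at least warp_size elements, per the docs above
    int x = in[blockIdx.x * blockDim.x + threadIdx.x];
    int scan_result, reduction_result;
    kb::scan_and_reduce(scratch, x, [](int& acc, int v) { return acc += v; },
                        scan_result, reduction_result);
    prefixes[blockIdx.x * blockDim.x + threadIdx.x] = scan_result;
    if (threadIdx.x == 0) { totals[blockIdx.x] = reduction_result; } // every thread has it; one writes
}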

§ transform()

template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::transform ( const S *__restrict__  source_start,
const S *__restrict__  source_end,
T *__restrict__  target,
UnaryOperation  unary_op 
)
Note
Prefer transform_n(); this variant forces the size type to ptrdiff_t, which is unnecessarily large.

§ transform_n()

template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::transform_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target,
UnaryOperation  unary_op 
)

Apply a transformation to each element of an array, placing the results in another array.

Parameters
source    The (block-common) origin of the data
length    The (block-common) number of elements available (for reading) at the source
target    The (block-common) destination into which to write the transformed elements
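
A minimal sketch, squaring each element (hypothetical kernel; S, T and the unary operation type are deduced):

__global__ void square_all(const float* __restrict__ source,
                           float*       __restrict__ target,
                           size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    kb::transform_n(source, length, target, [](float x) { return x * x; });
}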