cuda-kat
CUDA kernel author's tools
block.cuh File Reference

GPU device-side versions of std::algorithm-like functions, with block-level collaboration, i.e. different CUDA blocks act independently, but all threads in each block collaborate on the same task.

#include "common.cuh"
#include <kat/on_device/collaboration/warp.cuh>
#include <kat/on_device/collaboration/block.cuh>
#include <kat/on_device/sequence_ops/warp.cuh>
#include <kat/on_device/shuffle.cuh>

Classes

struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op >
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(Op &)>
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(Op &) const >
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< Op(*)(Op &)>
 
struct  kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_helper< M(C::*)>
 

Typedefs

template<typename Op >
using kat::linear_grid::collaborative::block::detail::accumulator_op_return_type_t = typename accumulator_op_return_type_helper< Op >::type
 

Functions

template<typename RandomAccessIterator , typename Size , typename T >
KAT_FD void kat::linear_grid::collaborative::block::fill_n (RandomAccessIterator start, Size count, const T &value)
 
template<typename RandomAccessIterator , typename T , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::linear_grid::collaborative::block::fill (RandomAccessIterator start, RandomAccessIterator end, const T &value)
 
template<typename RandomAccessIterator , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::memzero_n (RandomAccessIterator start, Size count)
 
template<typename RandomAccessIterator , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::linear_grid::collaborative::block::memzero (RandomAccessIterator start, RandomAccessIterator end)
 
template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::transform_n (const S *__restrict__ source, Size length, T *__restrict__ target, UnaryOperation unary_op)
 Apply a transformation to each element of an array, placing the results in another array.
 
template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::transform (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target, UnaryOperation unary_op)
 
template<typename S , typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy_n (const S *__restrict__ source, Size length, T *__restrict__ target)
 Have all block threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.
 
template<typename S , typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target)
 
template<typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::copy_n (const T *__restrict__ source, Size length, T *__restrict__ target)
 Block-collaboratively copy data between stretches of memory.
 
template<typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::copy (const T *__restrict__ source_start, const T *__restrict__ source_end, T *__restrict__ target)
 Block-collaboratively copy data between stretches of memory.
 
template<typename T , typename I , typename Size , typename U = T>
KAT_FD void kat::linear_grid::collaborative::block::lookup (T *__restrict__ target, const U *__restrict__ lookup_table, const I *__restrict__ indices, Size num_indices)
 Use a lookup table to convert numeric indices to a sequence of values of any type.
 
template<typename T , typename AccumulationOp , bool AllThreadsObtainResult = false, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::reduce (T value, AccumulationOp op)
 Perform a reduction over a block's worth of data with a specific (asymmetric) accumulation operation, while maintaining the input element type.
 
template<typename T , bool AllThreadsObtainResult = false>
KAT_DEV T kat::linear_grid::collaborative::block::sum (T value)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan (T value, AccumulationOp op, T *__restrict__ scratch)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan (T value, AccumulationOp op)
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce (T *__restrict__ scratch, T value, AccumulationOp op, T &scan_result, T &reduction_result)
 Perform both a block-level scan and a block-level reduction, with each thread having the results of both.
 
template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce (T value, AccumulationOp op, T &scan_result, T &reduction_result)
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate_n (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source, Size length)
 Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics.
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source_start, RandomAccessIterator __restrict__ source_end)
 
template<typename Operation , typename Size , typename ResultDatum , typename... Args>
KAT_FD void kat::linear_grid::collaborative::block::elementwise_apply (ResultDatum *__restrict__ results, Size length, Operation op, const Args *__restrict__ ... arguments)
 

Detailed Description

GPU device-side versions of std::algorithm-like functions, with block-level collaboration, i.e. different CUDA blocks act independently, but all threads in each block collaborate on the same task.

Note
Most of the functions found in std::algorithm are still missing here; see the algorithm page on cppreference.com for a full list of them.
Some functions here are not actually in std::algorithm but might as well have been, e.g. memzero(), which is like std::memset() with 0.
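
For a concrete picture, here is a minimal usage sketch. The include path is an assumption (this page does not show where block.cuh lives; kat/on_device/sequence_ops/block.cuh matches the sibling warp.cuh header included above), and the kernel name is hypothetical. Every thread of the block calls the primitive with the same (block-common) arguments, and the threads divide the work among themselves:

#include <cstddef>
#include <kat/on_device/sequence_ops/block.cuh> // assumed path for this header

namespace kb = kat::linear_grid::collaborative::block;

// Hypothetical kernel: all block threads collaborate on each call.
__global__ void init_buffers(int* values, float* sums, std::size_t n)
{
    kb::fill_n(values, n, -1); // block-collaborative analogue of std::fill_n
    kb::memzero_n(sums, n);    // like std::memset with 0
}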

Function Documentation

§ cast_and_copy_n()

template<typename S , typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::cast_and_copy_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Have all block threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.

Parameters
source    The (block-common) origin of the data
length    The (block-common) number of elements available (for reading) at the source
target    The (block-common) destination into which to write the converted elements
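
A minimal sketch (the kernel is hypothetical; the argument order follows the signature above - source, length, target):

__global__ void widen_to_double(const float* __restrict__ source,
                                double*      __restrict__ target,
                                size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    // S = float and T = double are deduced; each element is cast on copy.
    kb::cast_and_copy_n(source, length, target);
}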

§ copy()

template<typename T , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::copy ( const T *__restrict__  source_start,
const T *__restrict__  source_end,
T *__restrict__  target 
)

block-collaboratively copy data between stretches of memory

Parameters
source_start    (block-common) location of the first data element to copy
source_end    (block-common) location past the last data element to copy
target    (block-common) location into which to copy the first element
Note
Prefer copy_n(); this variant forces the size type to ptrdiff_t, which is unnecessarily large.

§ copy_n()

template<typename T , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::copy_n ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

block-collaboratively copy data between stretches of memory

Parameters
source    (block-common) location from which to copy data
length    number of elements at source to copy
target    (block-common) location into which to copy the first element
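
A typical use is staging a tile of global memory into shared memory. A hypothetical sketch, assuming the kernel is launched with tile_size * sizeof(float) bytes of dynamic shared memory:

__global__ void stage_tile(const float* __restrict__ input, size_t tile_size)
{
    extern __shared__ float tile[];
    namespace kb = kat::linear_grid::collaborative::block;
    kb::copy_n(input + blockIdx.x * tile_size, tile_size, tile);
    __syncthreads(); // make the staged tile visible to every thread in the block
    // ... all block threads can now read tile[] ...
}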

§ elementwise_accumulate_n()

template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::elementwise_accumulate_n ( AccumulatingOperation  op,
D *__restrict__  destination,
RandomAccessIterator __restrict__  source,
Size  length 
)

Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics.

Usable with memory locations which the entire block has the same view of, and access to (mostly shared and global memory, but not only those).

Note
  1. Assumes a linear block.
  2. The operation is expected to have the signature WhateverWeDontCare operation(D& accumulator_element, S value); otherwise it may be a no-op here.
  3. If multiple blocks call this function with the same destination, the operation will have to be atomic (as you cannot guarantee those blocks will not execute simultaneously, whether on different multiprocessors or on the same one). Also, if you want to use a global-memory source, you will need to pass this function block-specific offsets; remember, it is not a kernel!
Template Parameters
D    Destination data type
S    Source data type
AccumulatingOperation    Typically, one of the 'accumulator' substructures of the functors in liftedfunctions.hpp; but it may very well be an accumulator::atomic substructure
Size    ...so that you don't have to decide whether to specify your number of elements as an int, uint, long long int, unsigned long long, etc.
Parameters
[in,out]  destination    The array into which we accumulate; it holds existing data and is not simply overwritten.
[in]  source    The array of partial data to integrate via accumulation.
[in]  length    The length, in elements, of destination and source
Todo:
Consider taking a GSL-span-like parameter instead of a ptr+length.
Todo:
Some inclusions in the block primitives might only be relevant to the functions here; double-check.
Todo:
Consider using elementwise_apply for this.
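
A minimal sketch with regular (non-atomic) semantics; the kernel is hypothetical, and the lambda follows the operation signature from note 2 above:

__global__ void accumulate_partials(float*       __restrict__ destination,
                                    const float* __restrict__ source,
                                    size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    // Non-atomic accumulation: safe only if no other block accumulates
    // into the same destination concurrently (see note 3 above).
    kb::elementwise_accumulate_n(
        [](float& accumulator_element, float value) { accumulator_element += value; },
        destination, source, length);
}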

§ reduce()

template<typename T , typename AccumulationOp , bool AllThreadsObtainResult = false, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::reduce ( T  value,
AccumulationOp  op 
)

Perform a reduction over a block's worth of data with a specific (asymmetric) accumulation operation, while maintaining the input element type.

Parameters
value    Each thread's contribution to the reduction
op    The accumulation operator; it must have the appropriate operator(), i.e. with signature T AccumulationOp::operator()(T&, T). It does not have to have any other members or types defined (so a lambda works fine).
Returns
For threads in the first warp of the block - the reduction result over all value elements of all block threads; for other threads - the result is undefined in case AllThreadsObtainResult is false, or the same as for the first warp in case AllThreadsObtainResult is true.
Template Parameters
AllThreadsObtainResult    When true, all threads in the block will return the reduction result; otherwise, only the threads of the first warp of the block are guaranteed to return the actual reduction result.
Note
This should work without full block participation, but it does need full warp participation, i.e. each warp either participates fully or not at all.
One might wonder: "Why insist on the same type for the result and the input?" Well, that is not strictly necessary. However, separating the types would require additional template or parameter information: two operators (if not more), and a decision on the point at which we switch to the result type - immediately, after at most k operations, or above the warp level. This would also make it nearly impossible to write "simple" calls to reduce - with a value and a single lambda. We may at some point define a structure for setting these parameters, which will put some onus on the user code but allow for this flexibility. Poke the library author/contributors about this.
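
For illustration, a block-wide sum using a lambda that matches the documented T op(T&, T) signature. The kernel and its indexing are assumptions; int is used because NeutralValue is a non-type template parameter (floating-point non-type parameters require C++20):

__global__ void block_sums(const int* __restrict__ data,
                           int*       __restrict__ per_block_totals)
{
    namespace kb = kat::linear_grid::collaborative::block;
    int contribution = data[blockIdx.x * blockDim.x + threadIdx.x];
    // AllThreadsObtainResult defaults to false, so only the first warp is
    // guaranteed the true result; thread 0 belongs to that warp.
    int total = kb::reduce(contribution, [](int& acc, int v) { return acc += v; });
    if (threadIdx.x == 0) { per_block_totals[blockIdx.x] = total; }
}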

§ scan()

template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV T kat::linear_grid::collaborative::block::scan ( T  value,
AccumulationOp  op,
T *__restrict__  scratch 
)
Note
Supports only full warps, and you should probably have the entire block participate.
Parameters
scratch    An area of memory the primitive can use for inter-warp communication; as with scan_and_reduce(), it must have at least warp_size elements of T allocated
value    Each thread's input value, as though the values of all threads were in some input array
Returns
The prefix-accumulation (scan) result for the calling thread's position - inclusive or exclusive of its own value, per the Inclusivity template parameter
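
A sketch of the scratch-taking overload, computing a block-wide inclusive prefix sum (hypothetical kernel; the scratch sizing follows the scan_and_reduce() documentation below):

__global__ void prefix_sums(const int* __restrict__ in, int* __restrict__ out)
{
    namespace kb = kat::linear_grid::collaborative::block;
    __shared__ int scratch[32]; // at least warp_size elements of T
    int x = in[blockIdx.x * blockDim.x + threadIdx.x];
    // Inclusivity defaults to inclusive, so each thread obtains the sum of
    // the values at all thread indices up to and including its own.
    int prefix = kb::scan(x, [](int& acc, int v) { return acc += v; }, scratch);
    out[blockIdx.x * blockDim.x + threadIdx.x] = prefix;
}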

§ scan_and_reduce()

template<typename T , typename AccumulationOp , bool Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_DEV void kat::linear_grid::collaborative::block::scan_and_reduce ( T *__restrict__  scratch,
T  value,
AccumulationOp  op,
T &  scan_result,
T &  reduction_result 
)

Perform both a block-level scan and a block-level reduction, with each thread having the results of both.

Note
The implementation relies on details of the implementation of the scan primitive, above.
Todo:
Consider returning a pair rather than using non-const references.
Todo:
Lots of code duplication with just-scan.
Todo:
Add a bool template param allowing the code to assume the block is full (this saves a few ops).

Parameters
scratch    An area of memory which this primitive can use for inter-warp communication (as warps cannot communicate directly). It must have at least warp_size elements allocated (i.e. sizeof(T) * warp_size bytes).
value    Each thread provides its input value, and the scan is applied to all of them as though they were in some input array
scan_result    The result of applying a scan to all threads' input values, in order of the thread indices
reduction_result    The result of reducing all threads' input values
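
A sketch combining both results in one call (hypothetical kernel, one input element per thread):

__global__ void prefix_and_total(const int* __restrict__ in,
                                 int*       __restrict__ prefixes,
                                 int*       __restrict__ totals)
{
    namespace kb = kat::linear_grid::collaborative::block;
    __shared__ int scratch[32]; // at least warp_size elements, per the docs above
    int x = in[blockIdx.x * blockDim.x + threadIdx.x];
    int scan_result, reduction_result;
    kb::scan_and_reduce(scratch, x, [](int& acc, int v) { return acc += v; },
                        scan_result, reduction_result);
    prefixes[blockIdx.x * blockDim.x + threadIdx.x] = scan_result;
    if (threadIdx.x == 0) { totals[blockIdx.x] = reduction_result; } // every thread has it; one writes
}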

§ transform()

template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::linear_grid::collaborative::block::transform ( const S *__restrict__  source_start,
const S *__restrict__  source_end,
T *__restrict__  target,
UnaryOperation  unary_op 
)
Note
Prefer transform_n(); this variant forces the size type to ptrdiff_t, which is unnecessarily large.

§ transform_n()

template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::linear_grid::collaborative::block::transform_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target,
UnaryOperation  unary_op 
)

Apply a transformation to each element of an array, placing the results in another array.

Parameters
source    The (block-common) origin of the data
length    The (block-common) number of elements available (for reading) at the source
target    The (block-common) destination into which to write the transformed elements
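
A minimal sketch, squaring each element (hypothetical kernel; S, T and the unary operation type are deduced):

__global__ void square_all(const float* __restrict__ source,
                           float*       __restrict__ target,
                           size_t                    length)
{
    namespace kb = kat::linear_grid::collaborative::block;
    kb::transform_n(source, length, target, [](float x) { return x * x; });
}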