cuda-kat
CUDA kernel author's tools
warp.cuh File Reference

GPU device-side versions of std::algorithm-like functions, with warp-level collaboration. More...

#include "common.cuh"
#include <kat/on_device/collaboration/warp.cuh>
#include <type_traits>

Classes

struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >
 
struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >::accumulator
 
struct  kat::collaborative::warp::detail::plus< LHS, RHS, Result >::accumulator::atomic
 

Functions

template<typename T , typename AccumulationOp >
KAT_FD T kat::collaborative::warp::reduce (T value, AccumulationOp op)
 Performs a reduction (e.g. a summation or a multiplication) of values from all lanes of a warp. More...
 
template<typename T >
KAT_FD T kat::collaborative::warp::sum (T value)
 
template<typename T , typename AccumulationOp , inclusivity_t Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::scan (T value, AccumulationOp op)
 
template<typename T , inclusivity_t Inclusivity = inclusivity_t::Inclusive, T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::prefix_sum (T value)
 
template<typename T , T NeutralValue = T{}>
KAT_FD T kat::collaborative::warp::exclusive_prefix_sum (T value)
 
template<typename RandomAccessIterator , typename Size , typename T >
KAT_FD void kat::collaborative::warp::fill_n (RandomAccessIterator start, Size count, const T &value)
 
template<typename RandomAccessIterator , typename T , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::collaborative::warp::fill (RandomAccessIterator start, RandomAccessIterator end, const T &value)
 
template<typename RandomAccessIterator , typename Size >
KAT_FD void kat::collaborative::warp::memzero_n (RandomAccessIterator start, Size count)
 
template<typename RandomAccessIterator , typename Size = decltype(std::declval<RandomAccessIterator>() - std::declval<RandomAccessIterator>())>
KAT_FD void kat::collaborative::warp::memzero (RandomAccessIterator start, RandomAccessIterator end)
 
template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::collaborative::warp::transform_n (const S *__restrict__ source, Size length, T *__restrict__ target, UnaryOperation unary_op)
 Apply a transformation to each element of an array, placing the results in another array. More...
 
template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::transform (const S *__restrict__ source_start, const S *__restrict__ source_end, T *__restrict__ target, UnaryOperation unary_op)
 
template<typename S , typename T , typename Size >
KAT_FD void kat::collaborative::warp::cast_and_copy_n (const S *__restrict__ source, Size length, T *__restrict__ target)
 Have all warp threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types. More...
 
template<typename T , typename U , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::cast_and_copy (const U *__restrict__ source_start, const U *__restrict__ source_end, T *__restrict__ target)
 
template<typename T , typename Size >
KAT_FD void kat::collaborative::warp::detail::naive_copy (const T *__restrict__ source, Size length, T *__restrict__ target)
 A version of kat::copy() which ignores pointer alignment and the memory transaction size, simply making coalesced writes of warp_size elements at a time (except for the last range). More...
 
template<typename T >
constexpr KAT_FHD T kat::collaborative::warp::detail::clear_lower_bits (T x, unsigned k)
 
template<typename T , typename Size , bool MayHaveSlack = true>
KAT_FD void kat::collaborative::warp::copy_n (const T *__restrict__ source, Size length, T *__restrict__ target)
 Has the warp copy data from one place to another. More...
 
template<typename T , bool MayHaveSlack = true, typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::copy (const T *__restrict__ source_start, const T *__restrict__ source_end, T *__restrict__ target_start)
 
template<typename T , typename I , typename Size , typename U = T>
KAT_FD void kat::collaborative::warp::lookup (T *__restrict__ target, const U *__restrict__ lookup_table, const I *__restrict__ indices, Size num_indices)
 Use a lookup table to convert numeric indices to a sequence of values of any type.
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::collaborative::warp::elementwise_accumulate_n (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source, Size length)
 Perform an accumulation operation (e.g. addition) between equal-sized arrays. More...
 
template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::elementwise_accumulate (AccumulatingOperation op, D *__restrict__ destination, RandomAccessIterator __restrict__ source_start, RandomAccessIterator __restrict__ source_end)
 

Detailed Description

GPU device-side versions of std::algorithm-like functions, with warp-level collaboration, i.e. different CUDA warps act independently, but all lanes in each warp collaborate on the same task.

Note
Most functions from std::algorithm are still missing; see the cppreference page for <algorithm> for a full list of them.
Some functions here are not actually in std::algorithm, but might as well have been, e.g. memzero(), which is like std::memset() with 0.
This is the most-divergent version of the std-algorithm-like functions, i.e. don't go looking for thread-level implementations (which would, in fact, be the same as a straightforward CPU-side implementation of std::algorithm); if you find yourself needing those, it is possible, perhaps likely, that you're doing something wrong.
Todo:
Some inclusions in the warp-primitives might only be relevant to the functions here; double-check.
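
Example
A minimal usage sketch of the warp-collaboration model (the kernel name, launch geometry, and the one-warp-per-row assignment are illustrative assumptions, not part of the library; this header is assumed to have been included via the appropriate path):

    #include <cstddef>

    // Each warp zeroes one row of a row-major matrix: warps act independently,
    // while the 32 lanes of each warp collaborate on their warp's row.
    __global__ void zero_rows(float* matrix, std::size_t row_length)
    {
        auto global_thread_id = threadIdx.x + blockIdx.x * blockDim.x;
        auto global_warp_id   = global_thread_id / 32; // 32 == warp size
        kat::collaborative::warp::memzero_n(
            matrix + global_warp_id * row_length, row_length);
    }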

Function Documentation

§ cast_and_copy_n()

template<typename S , typename T , typename Size >
KAT_FD void kat::collaborative::warp::cast_and_copy_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Have all warp threads collaborate in copying data between two memory locations (possibly not in the same memory space), while also converting types.

Parameters
target    The destination into which to write the converted elements
source    The origin of the data
length    The number of elements available (for reading) at the source
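
Example
A minimal usage sketch (the kernel name and the single-warp launch are illustrative assumptions): the warp's lanes collaborate in copying float values into a double array, converting along the way.

    __global__ void widen(const float* source, double* target, std::size_t length)
    {
        // All lanes participate; each handles a strided subset of the elements.
        kat::collaborative::warp::cast_and_copy_n(source, length, target);
    }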

§ copy_n()

template<typename T , typename Size , bool MayHaveSlack = true>
KAT_FD void kat::collaborative::warp::copy_n ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

Has the warp copy data from one place to another.

Note
if the input is not 32-byte (sometimes 128-byte) aligned, and more importantly, if the output is not 128-byte-aligned, performance will likely degrade, due to the need to execute a pair of memory transactions for every single 32 x 4-byte write.
Template Parameters
T    type of the elements being copied
Size    type of the length parameter
MayHaveSlack    we "like" data whose size is a multiple of 4 bytes, and can copy it faster. When this is false, the overall size of the data to copy is assumed to be a multiple of 4, without taking the time to check. In the future, the semantics of this parameter will change to involve the alignment of the start and end addresses.
Parameters
[out]    target    starting address of the region of memory to copy into
[in]    source    starting address of the region of memory to copy from
[in]    length    number of elements (of type T) to copy
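
Example
A minimal usage sketch (names illustrative; assumes a single-warp launch):

    __global__ void copy_row(const int* source, int* target, std::size_t length)
    {
        // Uses the default MayHaveSlack = true; if the size is known to be
        // suitable (see above), copy_n<int, std::size_t, false> skips the check.
        kat::collaborative::warp::copy_n(source, length, target);
    }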

§ elementwise_accumulate_n()

template<typename D , typename RandomAccessIterator , typename AccumulatingOperation , typename Size >
KAT_FD void kat::collaborative::warp::elementwise_accumulate_n ( AccumulatingOperation  op,
D *__restrict__  destination,
RandomAccessIterator __restrict__  source,
Size  length 
)

Perform an accumulation operation (e.g. addition) between equal-sized arrays, with either regular or atomic semantics. Usable with memory locations which the entire warp has the same view of, and accessibility to (mostly shared and global memory, but not only those).

Note
  1. Assumes a linear block.
  2. The operation is supposed to have the signature: WhateverWeDontCare operation(D& accumulator_element, S value); otherwise, it might be a no-op here.
  3. If multiple blocks call this function with the same destination, the accumulation will have to be atomic (as you cannot guarantee these blocks will not execute simultaneously, whether on different multiprocessors or on the same one). Also, if you want to use a global-memory source, you will need to pass this function block-specific offsets; remember, it is not a kernel!
Template Parameters
D    Destination data type
S    Source data type
AccumulatingOperation    Typically, one of the 'accumulator' substructures of the functors in liftedfunctions.hpp, but it may very well be an accumulator::atomic substructure
Size    ... so that you don't have to decide whether to specify the number of elements as an int, uint, long long int, unsigned long long, etc.
Parameters
[in,out]    destination    The array into which we accumulate; it holds existing data and is not simply overwritten.
[in]    source    The array of partial data to integrate via accumulation.
[in]    length    The length, in elements, of destination and source
Todo:
consider taking a GSL-span-like parameter instead of a pointer-plus-length pair
Todo:
Some inclusions in the block-primitives might only be relevant to the functions here; double-check.
Todo:
consider using elementwise_apply for this.
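
Example
A minimal usage sketch (names illustrative; assumes a single warp has exclusive access to destination, so plain, non-atomic semantics suffice). The lambda matches the operation signature from note 2 above:

    __global__ void integrate_partials(
        float* totals, const float* partials, std::size_t length)
    {
        // Accumulates into its first argument, as per the documented signature.
        auto add_into = [](float& accumulator_element, float value) {
            accumulator_element += value;
        };
        kat::collaborative::warp::elementwise_accumulate_n(
            add_into, totals, partials, length);
    }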

§ naive_copy()

template<typename T , typename Size >
KAT_FD void kat::collaborative::warp::detail::naive_copy ( const T *__restrict__  source,
Size  length,
T *__restrict__  target 
)

A version of kat::copy() which ignores pointer alignment and the memory transaction size, simply making coalesced writes of warp_size elements at a time (except for the last range).

Parameters
target    starting address of the region of memory to copy into
source    starting address of the region of memory to copy from
length    number of elements (of type T) to copy
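
Example
Schematically, the access pattern described above amounts to a lane-strided loop (a sketch of the idea only, not the library's actual code):

    // Lane i of the warp writes elements i, i + 32, i + 2*32, ... so that each
    // round of 32 writes is contiguous in memory, and thus coalesced.
    auto lane = threadIdx.x % 32; // 32 == warp size
    for (Size pos = lane; pos < length; pos += 32) {
        target[pos] = source[pos];
    }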

§ reduce()

template<typename T , typename AccumulationOp >
KAT_FD T kat::collaborative::warp::reduce ( T  value,
AccumulationOp  op 
)

Performs a reduction (e.g. a summation or a multiplication) of all values passed into the function by the lanes of a warp, with every lane ending up with the overall reduction result.

Note
What about inclusivity?
Todo:
offer both an inclusive and an exclusive version
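
Example
A minimal usage sketch (names illustrative; assumes the accumulation op mutates its first argument, matching the operation signature documented for elementwise_accumulate_n() above):

    // Every lane passes in its own value; every lane gets back the warp-wide sum.
    auto plus = [](int& accumulator, int value) { accumulator += value; };
    int warp_total = kat::collaborative::warp::reduce(my_lane_value, plus);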

§ transform()

template<typename S , typename T , typename UnaryOperation , typename Size = std::ptrdiff_t>
KAT_FD void kat::collaborative::warp::transform ( const S *__restrict__  source_start,
const S *__restrict__  source_end,
T *__restrict__  target,
UnaryOperation  unary_op 
)
Note
Prefer transform_n(); this variant forces the size type to std::ptrdiff_t, which is unnecessarily large.
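
Example
A minimal usage sketch of this iterator-pair variant (names illustrative; assumes a single-warp launch):

    __global__ void negate_all(const float* source, float* target, std::size_t length)
    {
        kat::collaborative::warp::transform(
            source, source + length, target,
            [](float x) { return -x; });
    }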

§ transform_n()

template<typename T , typename S , typename UnaryOperation , typename Size >
KAT_FD void kat::collaborative::warp::transform_n ( const S *__restrict__  source,
Size  length,
T *__restrict__  target,
UnaryOperation  unary_op 
)

Apply a transformation to each element of an array, placing the results in another array.

Parameters
source    The (warp-common) origin of the data
target    The (warp-common) destination into which to write the converted elements
length    The (warp-common) number of elements available (for reading) at the source
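
Example
A minimal usage sketch (names and the single-warp launch are illustrative assumptions):

    __global__ void square_all(const float* source, float* target, unsigned length)
    {
        // All lanes of the warp collaborate on the same source, target and length.
        kat::collaborative::warp::transform_n(
            source, length, target,
            [](float x) { return x * x; });
    }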