The warp_scan class is a warp level parallel primitive which provides methods for performing inclusive and exclusive scan operations of items partitioned across threads in a hardware warp. More...

#include <warp_scan.hpp>

Inheritance diagram for warp_scan< T, WarpSize >:

[legend]

Collaboration diagram for warp_scan< T, WarpSize >:

[legend]

Public Types
using	storage_type = typename base_type::storage_type
	Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. More...

Public Member Functions
template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	inclusive_scan (T input, T &output, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive scan across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	inclusive_scan (T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive scan across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	inclusive_scan (T input, T &output, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive scan and reduction across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	inclusive_scan (T, T &, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive scan and reduction across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	exclusive_scan (T input, T &output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs exclusive scan across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	exclusive_scan (T, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs exclusive scan across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	exclusive_scan (T input, T &output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs exclusive scan and reduction across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	exclusive_scan (T, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs exclusive scan and reduction across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	scan (T input, T &inclusive_output, T &exclusive_output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive and exclusive scan operations across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	scan (T, T &, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive and exclusive scan operations across threads Invalid Warp Size.

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	scan (T input, T &inclusive_output, T &exclusive_output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp. More...

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	scan (T, T &, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
	Performs inclusive and exclusive scan operations across threads Invalid Warp Size.

template<unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	broadcast (T input, const unsigned int src_lane, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), T >::type
	Broadcasts value from one thread to all threads in logical warp. More...

template<unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	broadcast (T, const unsigned int, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T >::type
	Broadcasts value from one thread to all threads in logical warp. More...

Protected Member Functions
template<unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	to_exclusive (T inclusive_input, T &exclusive_output, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type

template<unsigned int FunctionWarpSize = WarpSize>
ROCPRIM_DEVICE ROCPRIM_INLINE auto	to_exclusive (T, T &, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type

Detailed Description

template<class T, unsigned int WarpSize = device_warp_size()>
class warp_scan< T, WarpSize >

The warp_scan class is a warp level parallel primitive which provides methods for performing inclusive and exclusive scan operations of items partitioned across threads in a hardware warp.

Template Parameters

T	- the input/output type.
WarpSize	- the size of logical warp size, which can be equal to or less than the size of hardware warp (see rocprim::device_warp_size()). Scan operations are performed separately within groups determined by WarpSize.

Overview

WarpSize must be equal to or less than the size of hardware warp (see rocprim::device_warp_size()). If it is less, scan is performed separately within groups determined by WarpSize.
For example, if WarpSize is 4, hardware warp is 64, scan will be performed in logical warps grouped like this: { {0, 1, 2, 3}, {4, 5, 6, 7 }, ..., {60, 61, 62, 63} } (thread is represented here by its id within hardware warp).
Logical warp is a group of WarpSize consecutive threads from the same hardware warp.
Supports non-commutative scan operators. However, a scan operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
Number of threads executing warp_scan's function must be a multiple of WarpSize;
All threads from a logical warp must be in the same hardware warp.

Examples: In the examples scan operation is performed on groups of 16 threads, each provides one int value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 64.

__global__ void example_kernel(...)
{
// specialize warp_scan for int and logical warp of 16 threads
using warp_scan_int = rocprim::warp_scan<int, 16>;
// allocate storage in shared memory
__shared__ warp_scan_int::storage_type temp[4];
int logical_warp_id = threadIdx.x/16;
int value = ...;
// execute inclusive scan
warp_scan_int().inclusive_scan(
value, // input
value, // output
temp[logical_warp_id]
);
...
}

Member Typedef Documentation

◆ storage_type

template<class T , unsigned int WarpSize = device_warp_size()>

using warp_scan< T, WarpSize >::storage_type = typename base_type::storage_type

Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.

Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords __shared__. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.

Member Function Documentation

◆ broadcast() [1/2]

template<class T , unsigned int WarpSize = device_warp_size()>

template<unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::broadcast	(	T	input,
		const unsigned int	src_lane,
		storage_type &	storage
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), T>::type

inline

Broadcasts value from one thread to all threads in logical warp.

Parameters

[in]	input	- value to broadcast.
[in]	src_lane	- id of the thread whose value should be broadcasted
[in]	storage	- reference to a temporary storage object of type storage_type.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

◆ broadcast() [2/2]

template<class T , unsigned int WarpSize = device_warp_size()>

template<unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::broadcast	(	T	,
		const unsigned	int,
		storage_type &
	)		-> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T>::type

inline

Broadcasts value from one thread to all threads in logical warp.

Invalid Warp Size

◆ exclusive_scan() [1/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::exclusive_scan	(	T	input,
		T &	output,
		T	init,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs exclusive scan across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	output	- reference to a thread output value. May be aliased with `input`.
[in]	init	- initial value used to start the exclusive scan. Should be the same for all threads in a logical warp.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present exclusive min scan operations performed on groups of 32 threads, each provides one float value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for float and logical warp of 32 threads
    using warp_scan_f = rocprim::warp_scan<float, 32>;
    // allocate storage in shared memory
    __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8
    int logical_warp_id = threadIdx.x/32;
    float value = ...;
    // execute exclusive min scan
    warp_scan_float().exclusive_scan(
        value, // input
        value, // output
        100.0f, // init
        temp[logical_warp_id],
        rocprim::minimum<float>()
    );
    ...
}

If the initial value is 100 and input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}, then output values in the first logical warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: {100, 33, -34, -34, -36, ..., -62} etc.

◆ exclusive_scan() [2/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::exclusive_scan	(	T	,
		T &	,
		T	,
		storage_type &	,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs exclusive scan across threads in a logical warp.

Invalid Warp Size

◆ exclusive_scan() [3/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::exclusive_scan	(	T	input,
		T &	output,
		T	init,
		T &	reduction,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs exclusive scan and reduction across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	output	- reference to a thread output value. May be aliased with `input`.
[in]	init	- initial value used to start the exclusive scan. Should be the same for all threads in a logical warp.
[out]	reduction	- result of reducing of all `input` values in logical warp. `init` value is not included in the reduction.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present exclusive prefix sum operations performed on groups of 64 threads, each thread provides one int value. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for int and logical warp of 64 threads
    using warp_scan_int = rocprim::warp_scan<int, 64>;
    // allocate storage in shared memory
    __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4
    int logical_warp_id = threadIdx.x/64;
    int input = ...;
    int output, reduction;
    // exclusive prefix sum
    warp_scan_int().exclusive_scan(
        input,
        output,
        10, // init
        reduction,
        temp[logical_warp_id]
    );
    ...
}

If the initial value is 10 and input values across threads in a block/tile are {1, 1, ..., 1, 1}, then output values in every logical warp will be {10, 11, 12, 13, ..., 73}. The reduction will be 64.

◆ exclusive_scan() [4/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::exclusive_scan	(	T	,
		T &	,
		T	,
		T &	,
		storage_type &	,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs exclusive scan and reduction across threads in a logical warp.

Invalid Warp Size

◆ inclusive_scan() [1/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::inclusive_scan	(	T	input,
		T &	output,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive scan across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	output	- reference to a thread output value. May be aliased with `input`.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present inclusive min scan operations performed on groups of 32 threads, each provides one float value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for float and logical warp of 32 threads
    using warp_scan_f = rocprim::warp_scan<float, 32>;
    // allocate storage in shared memory
    __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8
    int logical_warp_id = threadIdx.x/32;
    float value = ...;
    // execute inclusive min scan
    warp_scan_float().inclusive_scan(
        value, // input
        value, // output
        temp[logical_warp_id],
        rocprim::minimum<float>()
    );
    ...
}

If the input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}, then output values in the first logical warp will be {1, -2, -2, -4, ..., -32}, in the second: {33, -34, -34, -36, ..., -64} etc.

◆ inclusive_scan() [2/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::inclusive_scan	(	T	,
		T &	,
		storage_type &	,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive scan across threads in a logical warp.

Invalid Warp Size

◆ inclusive_scan() [3/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::inclusive_scan	(	T	input,
		T &	output,
		T &	reduction,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive scan and reduction across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	output	- reference to a thread output value. May be aliased with `input`.
[out]	reduction	- result of reducing of all `input` values in logical warp.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present inclusive prefix sum operations performed on groups of 64 threads, each thread provides one int value. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for int and logical warp of 64 threads
    using warp_scan_int = rocprim::warp_scan<int, 64>;
    // allocate storage in shared memory
    __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4
    int logical_warp_id = threadIdx.x/64;
    int input = ...;
    int output, reduction;
    // inclusive prefix sum
    warp_scan_int().inclusive_scan(
        input,
        output,
        reduction,
        temp[logical_warp_id]
    );
    ...
}

If the input values across threads in a block/tile are {1, 1, 1, 1, ..., 1, 1}, then output values in the every logical warp will be {1, 2, 3, 4, ..., 64}. The reduction will be equal 64.

◆ inclusive_scan() [4/4]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::inclusive_scan	(	T	,
		T &	,
		T &	,
		storage_type &	,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive scan and reduction across threads in a logical warp.

Invalid Warp Size

◆ scan() [1/2]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::scan	(	T	input,
		T &	inclusive_output,
		T &	exclusive_output,
		T	init,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive and exclusive scan operations across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	inclusive_output	- reference to a thread inclusive-scan output value.
[out]	exclusive_output	- reference to a thread exclusive-scan output value.
[in]	init	- initial value used to start the exclusive scan. Should be the same for all threads in a logical warp.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present min inclusive and exclusive scan operations performed on groups of 32 threads, each provides one float value, result is returned using the same variable as for input. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for float and logical warp of 32 threads
    using warp_scan_f = rocprim::warp_scan<float, 32>;
    // allocate storage in shared memory
    __shared__ warp_scan_float::storage_type temp[8]; // 256/32 = 8
    int logical_warp_id = threadIdx.x/32;
    float input = ...;
    float ex_output, in_output;
    // execute exclusive min scan
    warp_scan_float().scan(
        input,
        in_output,
        ex_output,
        100.0f, // init
        temp[logical_warp_id],
        rocprim::minimum<float>()
    );
    ...
}

If the initial value is 100 and input values across threads in a block/tile are {1, -2, 3, -4, ..., 255, -256}, then in_output values in the first logical warp will be {1, -2, -2, -4, ..., -32}, in the second: {33, -34, -34, -36, ..., -64} and so forth, ex_output values in the first logical warp will be {100, 1, -2, -2, -4, ..., -30}, in the second: {100, 33, -34, -34, -36, ..., -62} etc.

◆ scan() [2/2]

template<class T , unsigned int WarpSize = device_warp_size()>

template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>

ROCPRIM_DEVICE ROCPRIM_INLINE auto warp_scan< T, WarpSize >::scan	(	T	input,
		T &	inclusive_output,
		T &	exclusive_output,
		T	init,
		T &	reduction,
		storage_type &	storage,
		BinaryFunction	scan_op = `BinaryFunction()`
	)		-> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type

inline

Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp.

Template Parameters

BinaryFunction - type of binary function used for scan. Default type is rocprim::plus<T>.

Parameters

[in]	input	- thread input value.
[out]	inclusive_output	- reference to a thread inclusive-scan output value.
[out]	exclusive_output	- reference to a thread exclusive-scan output value.
[in]	init	- initial value used to start the exclusive scan. Should be the same for all threads in a logical warp.
[out]	reduction	- result of reducing of all `input` values in logical warp. `init` value is not included in the reduction.
[in]	storage	- reference to a temporary storage object of type storage_type.
[in]	scan_op	- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: `T f(const T &a, const T &b);`. The signature does not need to have `const &`, but function object must not modify the objects passed to it.

Storage reusage: Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().

Examples

The examples present inclusive and exclusive prefix sum operations performed on groups of 64 threads, each thread provides one int value. Hardware warp size is 64. Block (tile) size is 256.

__global__ void example_kernel(...) // blockDim.x = 256
{
    // specialize warp_scan for int and logical warp of 64 threads
    using warp_scan_int = rocprim::warp_scan<int, 64>;
    // allocate storage in shared memory
    __shared__ warp_scan_int::storage_type temp[4]; // 256/64 = 4
    int logical_warp_id = threadIdx.x/64;
    int input = ...;
    int in_output, ex_output, reduction;
    // inclusive and exclusive prefix sum
    warp_scan_int().scan(
        input,
        in_output,
        ex_output,
        init,
        reduction,
        temp[logical_warp_id]
    );
    ...
}

If the initial value is 10 and input values across threads in a block/tile are {1, 1, ..., 1, 1}, then in_output values in every logical warp will be {1, 2, 3, 4, ..., 63, 64}, and ex_output values in every logical warp will be {10, 11, 12, 13, ..., 73}. The reduction will be 64.

The documentation for this class was generated from the following file:

rocprim/include/rocprim/warp/warp_scan.hpp

Public Types

Public Member Functions

Protected Member Functions

Detailed Description

template<class T, unsigned int WarpSize = device_warp_size()> class warp_scan< T, WarpSize >

Member Typedef Documentation

◆ storage_type

Member Function Documentation

◆ broadcast() [1/2]

◆ broadcast() [2/2]

◆ exclusive_scan() [1/4]

◆ exclusive_scan() [2/4]

◆ exclusive_scan() [3/4]

◆ exclusive_scan() [4/4]

◆ inclusive_scan() [1/4]

◆ inclusive_scan() [2/4]

◆ inclusive_scan() [3/4]

◆ inclusive_scan() [4/4]

◆ scan() [1/2]

◆ scan() [2/2]

template<class T, unsigned int WarpSize = device_warp_size()>
class warp_scan< T, WarpSize >