rocPRIM
|
The block_adjacent_difference
class is a block-level parallel primitive which provides methods for applying binary functions to pairs of consecutive items partitioned across a thread block.
More...
#include <block_adjacent_difference.hpp>
Public Types | |
using | storage_type = detail::raw_storage< storage_type_ > |
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. More... | |
Public Member Functions | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads (Flag(&head_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags head_flags that indicate discontinuities between items partitioned across the thread block, where the first item has no reference and is always flagged. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads (Flag(&head_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags head_flags that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item . More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_tails (Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags tail_flags that indicate discontinuities between items partitioned across the thread block, where the last item has no reference and is always flagged. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_tails (Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_tails (Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags tail_flags that indicate discontinuities between items partitioned across the thread block, where the last item of the last thread is compared against a tile_successor_item . More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_tails (Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags both head_flags and tail_flags that indicate discontinuities between items partitioned across the thread block. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags both head_flags and tail_flags that indicate discontinuities between items partitioned across the thread block, where the last item of the last thread is compared against a tile_successor_item . More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags both head_flags and tail_flags that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item . More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, Flag(&tail_flags)[ItemsPerThread], const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op, storage_type &storage) |
Tags both head_flags and tail_flags that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item and the last item of the last thread is compared against a tile_successor_item . More... | |
template<unsigned int ItemsPerThread, class Flag , class FlagOp > | |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | flag_heads_and_tails (Flag(&head_flags)[ItemsPerThread], T tile_predecessor_item, Flag(&tail_flags)[ItemsPerThread], T tile_successor_item, const T(&input)[ItemsPerThread], FlagOp flag_op) |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_left (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_left (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, const T tile_predecessor, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, with an explicit item before the tile. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_left_partial (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, const unsigned int valid_items, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, in a partial tile. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_left_partial (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, const T tile_predecessor, const unsigned int valid_items, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, in a partial tile with a predecessor. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_right (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_right (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, const T tile_successor, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item, with an explicit item after the tile. More... | |
template<typename Output , unsigned int ItemsPerThread, typename BinaryFunction > | |
ROCPRIM_DEVICE ROCPRIM_INLINE void | subtract_right_partial (const T(&input)[ItemsPerThread], Output(&output)[ItemsPerThread], const BinaryFunction op, const unsigned int valid_items, storage_type &storage) |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item, in a partial tile. More... | |
The block_adjacent_difference
class is a block-level parallel primitive which provides methods for applying binary functions to pairs of consecutive items partitioned across a thread block.
T | - the input type. |
BlockSize | - the number of threads in a block. |
In the examples, a discontinuity operation is performed on a block of 128 threads, using type int
.
using block_adjacent_difference< T, BlockSizeX, BlockSizeY, BlockSizeZ >::storage_type = detail::raw_storage<storage_type_> |
Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.
Depending on the implementation, the operations exposed by the parallel primitive may require temporary storage for thread communication. The storage should be allocated using the keyword __shared__
. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.
|
inline |
Tags head_flags
that indicate discontinuities between items partitioned across the thread block, where the first item has no reference and is always flagged.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not take a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags head_flags
that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item
.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[in] | tile_predecessor_item | - the item preceding the tile, against which the first item of the first thread is compared. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags both head_flags
and tail_flags
that indicate discontinuities between items partitioned across the thread block.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags both head_flags
and tail_flags
that indicate discontinuities between items partitioned across the thread block, where the last item of the last thread is compared against a tile_successor_item
.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | tile_successor_item | - the item following the tile, against which the last item of the last thread is compared. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags both head_flags
and tail_flags
that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item
.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[in] | tile_predecessor_item | - the item preceding the tile, against which the first item of the first thread is compared. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags both head_flags
and tail_flags
that indicate discontinuities between items partitioned across the thread block, where the first item of the first thread is compared against a tile_predecessor_item
and the last item of the last thread is compared against a tile_successor_item
.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | head_flags | - array that contains the head flags. |
[in] | tile_predecessor_item | - the item preceding the tile, against which the first item of the first thread is compared. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | tile_successor_item | - the item following the tile, against which the last item of the last thread is compared. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags tail_flags
that indicate discontinuities between items partitioned across the thread block, where the last item has no reference and is always flagged.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Tags tail_flags
that indicate discontinuities between items partitioned across the thread block, where the last item of the last thread is compared against a tile_successor_item
.
ItemsPerThread | - [inferred] the number of items to be processed by each thread. |
Flag | - [inferred] the flag type. |
FlagOp | - [inferred] type of binary function used for flagging. |
[out] | tail_flags | - array that contains the tail flags. |
[in] | tile_successor_item | - the item following the tile, against which the last item of the last thread is compared. |
[in] | input | - array that data is loaded from. |
[in] | flag_op | - binary operation function object that will be used for flagging. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); or bool (const T& a, const T& b, unsigned int b_index); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
This overload does not accept a reference to temporary storage, instead it is declared as part of the function itself. Note that this does NOT decrease the shared memory requirements of a kernel using this function.
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item.
The first item in the first thread is copied from the input then for the rest the following code applies.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, with an explicit item before the tile.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
[in] | tile_predecessor | - the item before the tile, will be used as the input of the first application of op |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, in a partial tile.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
[in] | valid_items | - number of items in the block which are considered "valid" and will be used. Must be less than or equal to BlockSize * ItemsPerThread |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the left item, in a partial tile with a predecessor.
This combines subtract_left_partial() with a tile predecessor.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
[in] | tile_predecessor | - the item before the tile, will be used as the input of the first application of op |
[in] | valid_items | - number of items in the block which are considered "valid" and will be used. Must be less than or equal to BlockSize * ItemsPerThread |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item.
The last item in the last thread is copied from the input then for the rest the following code applies.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item, with an explicit item after the tile.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
[in] | tile_successor | - the item after the tile, will be used as the input of the last application of op |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
|
inline |
Apply a function to each consecutive pair of elements partitioned across threads in the block and write the output to the position of the right item, in a partial tile.
Output | - [inferred] the type of output, must be assignable from the result of op |
ItemsPerThread | - [inferred] the number of items processed by each thread |
BinaryFunction | - [inferred] the type of the function to apply |
[in] | input | - array that data is loaded from partitioned across the threads in the block |
[out] | output | - array where the result of function application will be written to |
[in] | op | - binary function applied to the items. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b) The signature does not need to have const & but the function object must not modify the objects passed to it. |
[in] | valid_items | - number of items in the block which are considered "valid" and will be used. Must be less than or equal to BlockSize * ItemsPerThread |
storage | - reference to a temporary storage object of type storage_type |
A synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().