|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key &thread_key, BinaryFunction compare_function=BinaryFunction()) |
| Block sort for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], BinaryFunction compare_function=BinaryFunction()) |
| This overload allows an array of ItemsPerThread keys to be passed in so that each thread can process multiple items. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key &thread_key, storage_type &storage, BinaryFunction compare_function=BinaryFunction()) |
| Block sort for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function=BinaryFunction()) |
| This overload allows arrays of ItemsPerThread keys to be passed in so that each thread can process multiple items. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key &thread_key, Value &thread_value, BinaryFunction compare_function=BinaryFunction()) |
| Block sort by key for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], Value(&thread_values)[ItemsPerThread], BinaryFunction compare_function=BinaryFunction()) |
| This overload allows an array of ItemsPerThread keys and values to be passed in so that each thread can process multiple items. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key &thread_key, Value &thread_value, storage_type &storage, BinaryFunction compare_function=BinaryFunction()) |
| Block sort by key for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], Value(&thread_values)[ItemsPerThread], storage_type &storage, BinaryFunction compare_function=BinaryFunction()) |
| This overload allows an array of ItemsPerThread keys and values to be passed in so that each thread can process multiple items. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key &thread_key, storage_type &storage, const unsigned int size, BinaryFunction compare_function=BinaryFunction()) |
| Block sort for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], storage_type &storage, const unsigned int size, BinaryFunction compare_function=BinaryFunction()) |
| Block sort for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key &thread_key, Value &thread_value, storage_type &storage, const unsigned int size, BinaryFunction compare_function=BinaryFunction()) |
| Block sort by key for any data type. More...
|
|
template<class BinaryFunction = ::rocprim::less<Key>> |
ROCPRIM_DEVICE ROCPRIM_INLINE void | sort (Key(&thread_keys)[ItemsPerThread], Value(&thread_values)[ItemsPerThread], storage_type &storage, const unsigned int size, BinaryFunction compare_function=BinaryFunction()) |
| Block sort by key for any data type. More...
|
|
template<class Key, unsigned int BlockSizeX, unsigned int ItemsPerThread = 1, class Value = empty_type, block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
class block_sort< Key, BlockSizeX, ItemsPerThread, Value, Algorithm, BlockSizeY, BlockSizeZ >
The block_sort class is a block level parallel primitive which provides methods sorting items (keys or key-value pairs) partitioned across threads in a block using comparison-based sort algorithm.
- Template Parameters
-
Key | - the key type. |
BlockSize | - the number of threads in a block. |
ItemsPerThread | - number of items processed by each thread. The total range will be BlockSize * ItemsPerThread long |
Value | - the value type. Default type empty_type indicates a keys-only sort. |
Algorithm | - selected sort algorithm, block_sort_algorithm::default_algorithm by default. |
- Overview
- Accepts custom compare_functions for sorting across a block.
- Performance notes:
- It is generally better if
BlockSize
and ItemsPerThread
are powers of two.
- The overloaded functions with
size
are generally slower.
- Examples
In the examples sort is performed on a block of 256 threads, each thread provides one int
value, results are returned using the same variable as for input.
__global__ void example_kernel(...)
{
using block_sort_int = rocprim::block_sort<int, 256>;
__shared__ block_sort_int::storage_type storage;
int input = ...;
block_sort_int().sort(
input,
storage
);
...
}
template<class Key , unsigned int BlockSizeX, unsigned int ItemsPerThread = 1, class Value = empty_type, block_sort_algorithm Algorithm = block_sort_algorithm::default_algorithm, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class BinaryFunction = ::rocprim::less<Key>>
ROCPRIM_DEVICE ROCPRIM_INLINE void block_sort< Key, BlockSizeX, ItemsPerThread, Value, Algorithm, BlockSizeY, BlockSizeZ >::sort |
( |
Key & |
thread_key, |
|
|
Value & |
thread_value, |
|
|
storage_type & |
storage, |
|
|
BinaryFunction |
compare_function = BinaryFunction() |
|
) |
| |
|
inline |
Block sort by key for any data type.
- Template Parameters
-
BinaryFunction | - type of binary function used for sort. Default type is rocprim::less<T>. |
- Parameters
-
[in,out] | thread_key | - reference to a key provided by a thread. |
[in,out] | thread_value | - reference to a value provided by a thread. |
[in] | storage | - reference to a temporary storage object of type storage_type. |
[in] | compare_function | - comparison function object which returns true if the first argument is is ordered before the second. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b); . The signature does not need to have const & , but function object must not modify the objects passed to it. |
- Storage reusage
- Synchronization barrier should be placed before
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.
In the examples sort is performed on a block of 256 threads, each thread provides one int
key and one int
value, results are returned using the same variable as for input.
__global__ void example_kernel(...)
{
using block_sort_int = rocprim::block_sort<int, 256, int>;
__shared__ block_sort_int::storage_type storage;
int key = ...;
int value = ...;
block_sort_int().sort(
key,
value,
storage
);
...
}