rocPRIM
Functions
Collaboration diagram for Device-wide:

Functions

template<class Config = default_config, class HaystackIterator , class NeedlesIterator , class OutputIterator , class CompareFunction = ::rocprim::less<>>
hipError_t lower_bound (void *temporary_storage, size_t &storage_size, HaystackIterator haystack, NeedlesIterator needles, OutputIterator output, size_t haystack_size, size_t needles_size, CompareFunction compare_op=CompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 
template<class Config = default_config, class HaystackIterator , class NeedlesIterator , class OutputIterator , class CompareFunction = ::rocprim::less<>>
hipError_t upper_bound (void *temporary_storage, size_t &storage_size, HaystackIterator haystack, NeedlesIterator needles, OutputIterator output, size_t haystack_size, size_t needles_size, CompareFunction compare_op=CompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 
template<class Config = default_config, class HaystackIterator , class NeedlesIterator , class OutputIterator , class CompareFunction = ::rocprim::less<>>
hipError_t binary_search (void *temporary_storage, size_t &storage_size, HaystackIterator haystack, NeedlesIterator needles, OutputIterator output, size_t haystack_size, size_t needles_size, CompareFunction compare_op=CompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 
template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_even (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram, unsigned int levels, Level lower_level, Level upper_level, hipStream_t stream=0, bool debug_synchronous=false)
 Computes a histogram from a sequence of samples using equal-width bins. More...
 
template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_even (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram, unsigned int levels, Level lower_level, Level upper_level, hipStream_t stream=0, bool debug_synchronous=false)
 Computes a histogram from a two-dimensional region of samples using equal-width bins. More...
 
template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_even (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level lower_level[ActiveChannels], Level upper_level[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false)
 Computes histograms from a sequence of multi-channel samples using equal-width bins. More...
 
template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_even (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level lower_level[ActiveChannels], Level upper_level[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false)
 Computes histograms from a two-dimensional region of multi-channel samples using equal-width bins. More...
 
template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_range (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram, unsigned int levels, Level *level_values, hipStream_t stream=0, bool debug_synchronous=false)
 Computes a histogram from a sequence of samples using the specified bin boundary levels. More...
 
template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_range (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram, unsigned int levels, Level *level_values, hipStream_t stream=0, bool debug_synchronous=false)
 Computes a histogram from a two-dimensional region of samples using the specified bin boundary levels. More...
 
template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_range (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int size, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level *level_values[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false)
 Computes histograms from a sequence of multi-channel samples using the specified bin boundary levels. More...
 
template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_range (void *temporary_storage, size_t &storage_size, SampleIterator samples, unsigned int columns, unsigned int rows, size_t row_stride_bytes, Counter *histogram[ActiveChannels], unsigned int levels[ActiveChannels], Level *level_values[ActiveChannels], hipStream_t stream=0, bool debug_synchronous=false)
 Computes histograms from a two-dimensional region of multi-channel samples using the specified bin boundary levels. More...
 
template<class Config = default_config, class InputIterator1 , class InputIterator2 , class OutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<InputIterator1>::value_type>>
hipError_t merge (void *temporary_storage, size_t &storage_size, InputIterator1 input1, InputIterator2 input2, OutputIterator output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel merge primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator1 , class KeysInputIterator2 , class KeysOutputIterator , class ValuesInputIterator1 , class ValuesInputIterator2 , class ValuesOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator1>::value_type>>
hipError_t merge (void *temporary_storage, size_t &storage_size, KeysInputIterator1 keys_input1, KeysInputIterator2 keys_input2, KeysOutputIterator keys_output, ValuesInputIterator1 values_input1, ValuesInputIterator2 values_input2, ValuesOutputIterator values_output, const size_t input1_size, const size_t input2_size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel merge primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t merge_sort (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, const size_t size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel merge sort primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t merge_sort (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, const size_t size, BinaryFunction compare_function=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending merge sort-by-key primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class FlagIterator , class OutputIterator , class SelectedCountOutputIterator >
hipError_t partition (void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
 Parallel select primitive for device level using range of flags. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class SelectedCountOutputIterator , class UnaryPredicate >
hipError_t partition (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, UnaryPredicate predicate, const hipStream_t stream=0, const bool debug_synchronous=false)
 Parallel select primitive for device level using selection predicate. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_keys (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_keys_desc (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_pairs (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_pairs_desc (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class Key >
hipError_t radix_sort_keys (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort primitive for device level. More...
 
template<class Config = default_config, class Key >
hipError_t radix_sort_keys_desc (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort primitive for device level. More...
 
template<class Config = default_config, class Key , class Value >
hipError_t radix_sort_pairs (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, double_buffer< Value > &values, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class Key , class Value >
hipError_t radix_sort_pairs_desc (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, double_buffer< Value > &values, unsigned int size, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t reduce (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel reduction primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t reduce (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel reduce primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class UniqueOutputIterator , class AggregatesOutputIterator , class UniqueCountOutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t reduce_by_key (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, unsigned int size, UniqueOutputIterator unique_output, AggregatesOutputIterator aggregates_output, UniqueCountOutputIterator unique_count_output, BinaryFunction reduce_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel reduce-by-key primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class UniqueOutputIterator , class CountsOutputIterator , class RunsCountOutputIterator >
hipError_t run_length_encode (void *temporary_storage, size_t &storage_size, InputIterator input, unsigned int size, UniqueOutputIterator unique_output, CountsOutputIterator counts_output, RunsCountOutputIterator runs_count_output, hipStream_t stream=0, bool debug_synchronous=false)
 Parallel run-length encoding for device level. More...
 
template<class Config = default_config, class InputIterator , class OffsetsOutputIterator , class CountsOutputIterator , class RunsCountOutputIterator >
hipError_t run_length_encode_non_trivial_runs (void *temporary_storage, size_t &storage_size, InputIterator input, unsigned int size, OffsetsOutputIterator offsets_output, CountsOutputIterator counts_output, RunsCountOutputIterator runs_count_output, hipStream_t stream=0, bool debug_synchronous=false)
 Parallel run-length encoding of non-trivial runs for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t inclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false, size_t size_limit=size_t(std::numeric_limits< int >::max())+1)
 Parallel inclusive scan primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t exclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false, size_t size_limit=size_t(std::numeric_limits< int >::max())+1)
 Parallel exclusive scan primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class ValuesOutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t inclusive_scan_by_key (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, ValuesOutputIterator values_output, const size_t size, BinaryFunction scan_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), const hipStream_t stream=0, bool debug_synchronous=false, size_t size_limit=size_t(std::numeric_limits< int >::max())+1)
 Parallel inclusive scan-by-key primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class ValuesOutputIterator , class InitialValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t exclusive_scan_by_key (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, ValuesInputIterator values_input, ValuesOutputIterator values_output, const InitialValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), KeyCompareFunction key_compare_op=KeyCompareFunction(), const hipStream_t stream=0, bool debug_synchronous=false, size_t size_limit=size_t(std::numeric_limits< int >::max())+1)
 Parallel exclusive scan-by-key primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_keys (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_keys_desc (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_pairs (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_pairs_desc (void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class Key , class OffsetIterator >
hipError_t segmented_radix_sort_keys (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort primitive for device level. More...
 
template<class Config = default_config, class Key , class OffsetIterator >
hipError_t segmented_radix_sort_keys_desc (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort primitive for device level. More...
 
template<class Config = default_config, class Key , class Value , class OffsetIterator >
hipError_t segmented_radix_sort_pairs (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, double_buffer< Value > &values, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel ascending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class Key , class Value , class OffsetIterator >
hipError_t segmented_radix_sort_pairs_desc (void *temporary_storage, size_t &storage_size, double_buffer< Key > &keys, double_buffer< Value > &values, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel descending radix sort-by-key primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>, class InitValueType = typename std::iterator_traits<InputIterator>::value_type>
hipError_t segmented_reduce (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, BinaryFunction reduce_op=BinaryFunction(), InitValueType initial_value=InitValueType(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel segmented reduction primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_inclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel segmented inclusive scan primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_exclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, const InitValueType initial_value, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel segmented exclusive scan primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class HeadFlagIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_inclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, HeadFlagIterator head_flags, size_t size, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel segmented inclusive scan primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class HeadFlagIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_exclusive_scan (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, HeadFlagIterator head_flags, const InitValueType initial_value, size_t size, BinaryFunction scan_op=BinaryFunction(), hipStream_t stream=0, bool debug_synchronous=false)
 Parallel segmented exclusive scan primitive for device level. More...
 
template<class Config = default_config, class InputIterator , class FlagIterator , class OutputIterator , class SelectedCountOutputIterator >
hipError_t select (void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
 Parallel select primitive for device level using range of flags. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class SelectedCountOutputIterator , class UnaryPredicate >
hipError_t select (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, UnaryPredicate predicate, const hipStream_t stream=0, const bool debug_synchronous=false)
 Parallel select primitive for device level using selection operator. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class UniqueCountOutputIterator , class EqualityOp = ::rocprim::equal_to<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t unique (void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, UniqueCountOutputIterator unique_count_output, const size_t size, EqualityOp equality_op=EqualityOp(), const hipStream_t stream=0, const bool debug_synchronous=false)
 Device-level parallel unique primitive. More...
 
template<class Config = default_config, class InputIterator , class OutputIterator , class UnaryFunction >
hipError_t transform (InputIterator input, OutputIterator output, const size_t size, UnaryFunction transform_op, const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel transform primitive for device level. More...
 
template<class Config = default_config, class InputIterator1 , class InputIterator2 , class OutputIterator , class BinaryFunction >
hipError_t transform (InputIterator1 input1, InputIterator2 input2, OutputIterator output, const size_t size, BinaryFunction transform_op, const hipStream_t stream=0, bool debug_synchronous=false)
 Parallel device-level transform primitive for two inputs. More...
 

Detailed Description

Function Documentation

◆ exclusive_scan()

template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t exclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
const InitValueType  initial_value,
const size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false,
size_t  size_limit = size_t(std::numeric_limits<int>::max()) + 1 
)
inline

Parallel exclusive scan primitive for device level.

exclusive_scan function performs a device-wide exclusive prefix scan operation using binary scan_op operator.

Overview
  • Supports non-commutative scan operators. However, a scan operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input and output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
InitValueType- type of the initial value.
BinaryFunction- type of binary function used for scan. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range. It can be same as input.
[in]initial_value- initial value to start the scan.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel
[in]size_limit- [optional] Set the maximum size which handled at the same time launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level exclusive min-scan operation is performed on an array of integer values (shorts are scanned into ints) using custom operator.

// custom scan function
auto min_op =
[] __device__ (int a, int b) -> int
{
return a < b ? a : b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
int * output; // empty array of 8 elements
int start_value; // e.g., 9
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, start_value, input_size, min_op
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, start_value, input_size, min_op
);
// output: [9, 4, 7, 6, 2, 2, 1, 1]

◆ exclusive_scan_by_key()

template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class ValuesOutputIterator , class InitialValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t exclusive_scan_by_key ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
const InitialValueType  initial_value,
const size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
KeyCompareFunction  key_compare_op = KeyCompareFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false,
size_t  size_limit = size_t(std::numeric_limits<int>::max()) + 1 
)
inline

Parallel exclusive scan-by-key primitive for device level.

inclusive_scan_by_key function performs a device-wide exclusive prefix scan-by-key operation using binary scan_op operator.

Overview
  • Supports non-commutative scan operators. However, a scan operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by keys_input, values_input, and values_output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
InitValueType- type of the initial value.
BinaryFunction- type of binary function used for scan. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
KeyCompareFunction- type of binary function used to determine keys equality. Default type is rocprim::equal_to<T>, where T is a value_type of KeysInputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- iterator to the first element in the range of keys.
[in]values_input- iterator to the first element in the range of values to scan.
[out]values_output- iterator to the first element in the output value range.
[in]initial_value- initial value to start the scan.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scanning input values. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is BinaryFunction().
[in]key_compare_op- binary operation function object that will be used to determine keys equality. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is KeyCompareFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel
[in]size_limit- [optional] Set the maximum size which handled at the same time launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level inclusive sum-by-key operation is performed on an array of integer values (shorts are scanned into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t size; // e.g., 8
int * keys_input; // e.g., [1, 1, 1, 2, 2, 3, 3, 4]
short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int start_value; // e.g., 9
int * values_output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input,
values_output, start_value,
size,rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan-by-key
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input,
values_output, start_value,
size,rocprim::plus<int>()
);
// values_output: [9, 10, 12, 9, 13, 9, 15, 9]

◆ histogram_even() [1/2]

template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_even ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  size,
Counter *  histogram,
unsigned int  levels,
Level  lower_level,
Level  upper_level,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes a histogram from a sequence of samples using equal-width bins.

  • The number of histogram bins is (levels - 1).
  • Bins are evenly-segmented and include the same width of sample values: (upper_level - lower_level) / (levels - 1).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]size- number of elements in the samples range.
[out]histogram- pointer to the first element in the histogram range.
[in]levels- number of boundaries (levels) for histogram bins.
[in]lower_level- lower sample value bound (inclusive) for the first histogram bin.
[in]upper_level- upper sample value bound (exclusive) for the last histogram bin.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level histogram of 5 bins is computed on an array of float samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int size; // e.g., 8
float * samples; // e.g., [-10.0, 0.3, 9.5, 8.1, 1.5, 1.9, 100.0, 5.1]
int * histogram; // empty array of at least 5 elements
unsigned int levels; // e.g., 6 (for 5 bins)
float lower_level; // e.g., 0.0
float upper_level; // e.g., 10.0
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, lower_level, upper_level
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histogram
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, lower_level, upper_level
);
// histogram: [3, 0, 1, 0, 2]

◆ histogram_even() [2/2]

template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_even ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  columns,
unsigned int  rows,
size_t  row_stride_bytes,
Counter *  histogram,
unsigned int  levels,
Level  lower_level,
Level  upper_level,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes a histogram from a two-dimensional region of samples using equal-width bins.

  • The two-dimensional region of interest within samples can be specified using the columns, rows and row_stride_bytes parameters.
  • The row stride must be a whole multiple of the sample data type size, i.e., (row_stride_bytes % sizeof(std::iterator_traits<SampleIterator>::value_type)) == 0.
  • The number of histogram bins is (levels - 1).
  • Bins are evenly-segmented and include the same width of sample values: (upper_level - lower_level) / (levels - 1).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]columns- number of elements in each row of the region.
[in]rows- number of rows of the region.
[in]row_stride_bytes- number of bytes between starts of consecutive rows of the region.
[out]histogram- pointer to the first element in the histogram range.
[in]levels- number of boundaries (levels) for histogram bins.
[in]lower_level- lower sample value bound (inclusive) for the first histogram bin.
[in]upper_level- upper sample value bound (exclusive) for the last histogram bin.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level histogram of 5 bins is computed on an array of float samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int columns; // e.g., 4
unsigned int rows; // e.g., 2
size_t row_stride_bytes; // e.g., 6 * sizeof(float)
float * samples; // e.g., [-10.0, 0.3, 9.5, 8.1, -, -, 1.5, 1.9, 100.0, 5.1, -, -]
int * histogram; // empty array of at least 5 elements
unsigned int levels; // e.g., 6 (for 5 bins)
float lower_level; // e.g., 0.0
float upper_level; // e.g., 10.0
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, lower_level, upper_level
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histogram
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, lower_level, upper_level
);
// histogram: [3, 0, 1, 0, 2]

◆ histogram_range() [1/2]

template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_range ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  size,
Counter *  histogram,
unsigned int  levels,
Level *  level_values,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes a histogram from a sequence of samples using the specified bin boundary levels.

  • The number of histogram bins is (levels - 1).
  • The range for binj is [level_values[j], level_values[j+1]).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]size- number of elements in the samples range.
[out]histogram- pointer to the first element in the histogram range.
[in]levels- number of boundaries (levels) for histogram bins.
[in]level_values- pointer to the array of bin boundaries.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level histogram of 5 bins is computed on an array of float samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int size; // e.g., 8
float * samples; // e.g., [-10.0, 0.3, 9.5, 8.1, 1.5, 1.9, 100.0, 5.1]
int * histogram; // empty array of at least 5 elements
unsigned int levels; // e.g., 6 (for 5 bins)
float * level_values; // e.g., [0.0, 1.0, 5.0, 10.0, 20.0, 50.0]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, level_values
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histogram
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, level_values
);
// histogram: [1, 2, 3, 0, 0]

◆ histogram_range() [2/2]

template<class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t histogram_range ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  columns,
unsigned int  rows,
size_t  row_stride_bytes,
Counter *  histogram,
unsigned int  levels,
Level *  level_values,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes a histogram from a two-dimensional region of samples using the specified bin boundary levels.

  • The two-dimensional region of interest within samples can be specified using the columns, rows and row_stride_bytes parameters.
  • The row stride must be a whole multiple of the sample data type size, i.e., (row_stride_bytes % sizeof(std::iterator_traits<SampleIterator>::value_type)) == 0.
  • The number of histogram bins is (levels - 1).
  • The range for binj is [level_values[j], level_values[j+1]).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]columns- number of elements in each row of the region.
[in]rows- number of rows of the region.
[in]row_stride_bytes- number of bytes between starts of consecutive rows of the region.
[out]histogram- pointer to the first element in the histogram range.
[in]levels- number of boundaries (levels) for histogram bins.
[in]level_values- pointer to the array of bin boundaries.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level histogram of 5 bins is computed on an array of float samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int columns; // e.g., 4
unsigned int rows; // e.g., 2
size_t row_stride_bytes; // e.g., 6 * sizeof(float)
float * samples; // e.g., [-10.0, 0.3, 9.5, 8.1, 1.5, 1.9, 100.0, 5.1]
int * histogram; // empty array of at least 5 elements
unsigned int levels; // e.g., 6 (for 5 bins)
float level_values; // e.g., [0.0, 1.0, 5.0, 10.0, 20.0, 50.0]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, level_values
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histogram
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, level_values
);
// histogram: [1, 2, 3, 0, 0]

◆ inclusive_scan()

template<class Config = default_config, class InputIterator , class OutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t inclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
const size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false,
size_t  size_limit = size_t(std::numeric_limits<int>::max()) + 1 
)
inline

Parallel inclusive scan primitive for device level.

inclusive_scan function performs a device-wide inclusive prefix scan operation using binary scan_op operator.

Overview
  • Supports non-commutative scan operators. However, a scan operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input and output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for scan. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range. It can be same as input.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel
[in]size_limit- [optional] Set the maximum size which handled at the same time launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level inclusive sum operation is performed on an array of integer values (shorts are scanned into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size, rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size, rocprim::plus<int>()
);
// output: [1, 3, 6, 10, 15, 21, 28, 36]

◆ inclusive_scan_by_key()

template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class ValuesOutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t inclusive_scan_by_key ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
const size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
KeyCompareFunction  key_compare_op = KeyCompareFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false,
size_t  size_limit = size_t(std::numeric_limits<int>::max()) + 1 
)
inline

Parallel inclusive scan-by-key primitive for device level.

inclusive_scan_by_key function performs a device-wide inclusive prefix scan-by-key operation using binary scan_op operator.

Overview
  • Supports non-commutative scan operators. However, a scan operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by keys_input, values_input, and values_output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
BinaryFunction- type of binary function used for scan. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
KeyCompareFunction- type of binary function used to determine keys equality. Default type is rocprim::equal_to<T>, where T is a value_type of KeysInputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- iterator to the first element in the range of keys.
[in]values_input- iterator to the first element in the range of values to scan.
[out]values_output- iterator to the first element in the output value range.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scanning input values. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is BinaryFunction().
[in]key_compare_op- binary operation function object that will be used to determine keys equality. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is KeyCompareFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel
[in]size_limit- [optional] Set the maximum size which handled at the same time launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level inclusive sum-by-key operation is performed on an array of integer values (shorts are scanned into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t size; // e.g., 8
int * keys_input; // e.g., [1, 1, 2, 2, 3, 3, 3, 5]
short * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * values_output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input,
values_output, size,
rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan-by-key
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input,
values_output, size,
rocprim::plus<int>()
);
// values_output: [1, 2, 3, 7, 5, 11, 18, 8]

◆ merge() [1/2]

template<class Config = default_config, class InputIterator1 , class InputIterator2 , class OutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<InputIterator1>::value_type>>
hipError_t merge ( void *  temporary_storage,
size_t &  storage_size,
InputIterator1  input1,
InputIterator2  input2,
OutputIterator  output,
const size_t  input1_size,
const size_t  input2_size,
BinaryFunction  compare_function = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel merge primitive for device level.

merge function performs a device-wide merge. Function merges two ordered sets of input values based on comparison function.

Overview
  • The contents of the inputs are not altered by the merging function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Accepts custom compare_functions for merging across the device.
Template Parameters
Config- [optional] configuration of the primitive. It can be merge_config or a custom class with the same members.
InputIterator1- random-access iterator type of the first input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
InputIterator2- random-access iterator type of the second input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input1- iterator to the first element in the first range to merge.
[in]input2- iterator to the first element in the second range to merge.
[out]output- iterator to the first element in the output range.
[in]input1_size- number of element in the first input range.
[in]input2_size- number of element in the second input range.
[in]compare_function- binary operation function object that will be used for comparison. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending merge is performed on an array of int values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size1; // e.g., 4
size_t input_size2; // e.g., 4
int * input1; // e.g., [0, 1, 2, 3]
int * input2; // e.g., [0, 1, 2, 3]
int * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input1, input2, output, input_size1, input_size2
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform merge
temporary_storage_ptr, temporary_storage_size_bytes,
input1, input2, output, input_size1, input_size2
);
// output: [0, 0, 1, 1, 2, 2, 3, 3]

◆ merge() [2/2]

template<class Config = default_config, class KeysInputIterator1 , class KeysInputIterator2 , class KeysOutputIterator , class ValuesInputIterator1 , class ValuesInputIterator2 , class ValuesOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator1>::value_type>>
hipError_t merge ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator1  keys_input1,
KeysInputIterator2  keys_input2,
KeysOutputIterator  keys_output,
ValuesInputIterator1  values_input1,
ValuesInputIterator2  values_input2,
ValuesOutputIterator  values_output,
const size_t  input1_size,
const size_t  input2_size,
BinaryFunction  compare_function = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel merge primitive for device level.

merge function performs a device-wide merge of (key, value) pairs. Function merges two ordered sets of input keys and corresponding values based on key comparison function.

Overview
  • The contents of the inputs are not altered by the merging function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Accepts custom compare_functions for merging across the device.
Template Parameters
Config- [optional] configuration of the primitive. It can be merge_config or a custom class with the same members.
KeysInputIterator1- random-access iterator type of the first keys input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysInputIterator2- random-access iterator type of the second keys input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the keys output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator1- random-access iterator type of the first values input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesInputIterator2- random-access iterator type of the second values input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the values output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input1- iterator to the first key in the first range to merge.
[in]keys_input2- iterator to the first key in the second range to merge.
[out]keys_output- iterator to the first key in the output range.
[in]values_input1- iterator to the first value in the first range to merge.
[in]values_input2- iterator to the first value in the second range to merge.
[out]values_output- iterator to the first value in the output range.
[in]input1_size- number of element in the first input range.
[in]input2_size- number of element in the second input range.
[in]compare_function- binary operation function object that will be used for key comparison. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending merge is performed on an array of int values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size1; // e.g., 4
size_t input_size2; // e.g., 4
int * keys_input1; // e.g., [0, 1, 2, 3]
int * keys_input2; // e.g., [0, 1, 2, 3]
int * keys_output; // empty array of 8 elements
int * values_input1; // e.g., [10, 11, 12, 13]
int * values_input2; // e.g., [20, 21, 22, 23]
int * values_output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input1, keys_input2, keys_output,
values_input1, values_input2, values_output,
// input_size1, input_size2
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform merge
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input1, keys_input2, keys_output,
values_input1, values_input2, values_output,
// input_size1, input_size2
);
// keys_output: [0, 0, 1, 1, 2, 2, 3, 3]
// values_output: [10, 20, 11, 21, 12, 22, 13, 23]

◆ merge_sort() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t merge_sort ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
const size_t  size,
BinaryFunction  compare_function = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel merge sort primitive for device level.

merge_sort function performs a device-wide merge sort of keys. Function sorts input keys based on comparison function.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Accepts custom compare_functions for sorting across the device.
Template Parameters
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]compare_function- binary operation function object that will be used for comparison. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending merge sort is performed on an array of float values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
float * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]

◆ merge_sort() [2/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class BinaryFunction = ::rocprim::less<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t merge_sort ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
const size_t  size,
BinaryFunction  compare_function = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending merge sort-by-key primitive for device level.

merge_sort function performs a device-wide merge sort of (key, value) pairs. Function sorts input pairs based on comparison function.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Accepts custom compare_functions for sorting across the device.
Template Parameters
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]values_input- pointer to the first element in the range to sort.
[out]values_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]compare_function- binary operation function object that will be used for comparison. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending merge sort is performed where input keys are represented by an array of unsigned integers and input values by an array of doubles.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 2, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
unsigned int * keys_output; // empty array of 8 elements
double * values_output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size
);
// keys_output: [ 1, 2, 3, 4, 5, 6, 7, 8]
// values_output: [-1, -2, 2, 3, -4, -5, 7, -8]

◆ multi_histogram_even() [1/2]

template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_even ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  size,
Counter *  histogram[ActiveChannels],
unsigned int  levels[ActiveChannels],
Level  lower_level[ActiveChannels],
Level  upper_level[ActiveChannels],
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes histograms from a sequence of multi-channel samples using equal-width bins.

  • The input is a sequence of pixel structures, where each pixel comprises a record of Channels consecutive data samples (e.g., Channels = 4 for RGBA samples).
  • The first ActiveChannels channels of total Channels channels will be used for computing histograms (e.g., ActiveChannels = 3 for computing histograms of only RGB from RGBA samples).
  • For channeli the number of histogram bins is (levels[i] - 1).
  • For channeli bins are evenly-segmented and include the same width of sample values: (upper_level[i] - lower_level[i]) / (levels[i] - 1).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Channels- number of channels interleaved in the input samples.
ActiveChannels- number of channels being used for computing histograms.
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]size- number of pixels in the samples range.
[out]histogram- pointers to the first element in the histogram range, one for each active channel.
[in]levels- number of boundaries (levels) for histogram bins in each active channel.
[in]lower_level- lower sample value bound (inclusive) for the first histogram bin in each active channel.
[in]upper_level- upper sample value bound (exclusive) for the last histogram bin in each active channel.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example histograms for 3 channels (RGB) are computed on an array of 8-bit RGBA samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int size; // e.g., 8
unsigned char * samples; // e.g., [(3, 1, 5, 255), (3, 1, 5, 255), (4, 2, 6, 127), (3, 2, 6, 127),
// (0, 0, 0, 100), (0, 1, 0, 100), (0, 0, 1, 255), (0, 1, 1, 255)]
int * histogram[3]; // 3 empty arrays of at least 256 elements each
unsigned int levels[3]; // e.g., [257, 257, 257] (for 256 bins)
int lower_level[3]; // e.g., [0, 0, 0]
int upper_level[3]; // e.g., [256, 256, 256]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
rocprim::multi_histogram_even<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, lower_level, upper_level
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histograms
rocprim::multi_histogram_even<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, lower_level, upper_level
);
// histogram: [[4, 0, 0, 3, 1, 0, 0, ..., 0],
// [2, 4, 2, 0, 0, 0, 0, ..., 0],
// [2, 2, 0, 0, 0, 2, 2, ..., 0]]

◆ multi_histogram_even() [2/2]

template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_even ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  columns,
unsigned int  rows,
size_t  row_stride_bytes,
Counter *  histogram[ActiveChannels],
unsigned int  levels[ActiveChannels],
Level  lower_level[ActiveChannels],
Level  upper_level[ActiveChannels],
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes histograms from a two-dimensional region of multi-channel samples using equal-width bins.

  • The two-dimensional region of interest within samples can be specified using the columns, rows and row_stride_bytes parameters.
  • The row stride must be a whole multiple of the sample data type size, i.e., (row_stride_bytes % sizeof(std::iterator_traits<SampleIterator>::value_type)) == 0.
  • The input is a sequence of pixel structures, where each pixel comprises a record of Channels consecutive data samples (e.g., Channels = 4 for RGBA samples).
  • The first ActiveChannels channels of total Channels channels will be used for computing histograms (e.g., ActiveChannels = 3 for computing histograms of only RGB from RGBA samples).
  • For channeli the number of histogram bins is (levels[i] - 1).
  • For channeli bins are evenly-segmented and include the same width of sample values: (upper_level[i] - lower_level[i]) / (levels[i] - 1).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Channels- number of channels interleaved in the input samples.
ActiveChannels- number of channels being used for computing histograms.
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]columns- number of elements in each row of the region.
[in]rows- number of rows of the region.
[in]row_stride_bytes- number of bytes between starts of consecutive rows of the region.
[out]histogram- pointers to the first element in the histogram range, one for each active channel.
[in]levels- number of boundaries (levels) for histogram bins in each active channel.
[in]lower_level- lower sample value bound (inclusive) for the first histogram bin in each active channel.
[in]upper_level- upper sample value bound (exclusive) for the last histogram bin in each active channel.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example histograms for 3 channels (RGB) are computed on an array of 8-bit RGBA samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int columns; // e.g., 4
unsigned int rows; // e.g., 2
size_t row_stride_bytes; // e.g., 5 * sizeof(unsigned char)
unsigned char * samples; // e.g., [(3, 1, 5, 255), (3, 1, 5, 255), (4, 2, 6, 127), (3, 2, 6, 127), (-, -, -, -),
// (0, 0, 0, 100), (0, 1, 0, 100), (0, 0, 1, 255), (0, 1, 1, 255), (-, -, -, -)]
int * histogram[3]; // 3 empty arrays of at least 256 elements each
unsigned int levels[3]; // e.g., [257, 257, 257] (for 256 bins)
int lower_level[3]; // e.g., [0, 0, 0]
int upper_level[3]; // e.g., [256, 256, 256]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
rocprim::multi_histogram_even<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, lower_level, upper_level
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histograms
rocprim::multi_histogram_even<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, lower_level, upper_level
);
// histogram: [[4, 0, 0, 3, 1, 0, 0, ..., 0],
// [2, 4, 2, 0, 0, 0, 0, ..., 0],
// [2, 2, 0, 0, 0, 2, 2, ..., 0]]

◆ multi_histogram_range() [1/2]

template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_range ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  size,
Counter *  histogram[ActiveChannels],
unsigned int  levels[ActiveChannels],
Level *  level_values[ActiveChannels],
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes histograms from a sequence of multi-channel samples using the specified bin boundary levels.

  • The input is a sequence of pixel structures, where each pixel comprises a record of Channels consecutive data samples (e.g., Channels = 4 for RGBA samples).
  • The first ActiveChannels channels of total Channels channels will be used for computing histograms (e.g., ActiveChannels = 3 for computing histograms of only RGB from RGBA samples).
  • For channeli the number of histogram bins is (levels[i] - 1).
  • For channeli the range for binj is [level_values[i][j], level_values[i][j+1]).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Channels- number of channels interleaved in the input samples.
ActiveChannels- number of channels being used for computing histograms.
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]size- number of pixels in the samples range.
[out]histogram- pointers to the first element in the histogram range, one for each active channel.
[in]levels- number of boundaries (levels) for histogram bins in each active channel.
[in]level_values- pointer to the array of bin boundaries for each active channel.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example histograms for 3 channels (RGB) are computed on an array of 8-bit RGBA samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int size; // e.g., 8
unsigned char * samples; // e.g., [(0, 0, 80, 255), (120, 0, 80, 255), (123, 0, 82, 127), (10, 1, 83, 127),
// (51, 1, 8, 100), (52, 1, 8, 100), (53, 0, 81, 255), (54, 50, 81, 255)]
int * histogram[3]; // 3 empty arrays of at least 256 elements each
unsigned int levels[3]; // e.g., [4, 4, 3]
int * level_values[3]; // e.g., [[0, 50, 100, 200], [0, 20, 40, 60], [0, 10, 100]]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
rocprim::multi_histogram_range<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, level_values
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histograms
rocprim::multi_histogram_range<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, size,
histogram, levels, level_values
);
// histogram: [[2, 4, 2], [7, 0, 1], [2, 6]]

◆ multi_histogram_range() [2/2]

template<unsigned int Channels, unsigned int ActiveChannels, class Config = default_config, class SampleIterator , class Counter , class Level >
hipError_t multi_histogram_range ( void *  temporary_storage,
size_t &  storage_size,
SampleIterator  samples,
unsigned int  columns,
unsigned int  rows,
size_t  row_stride_bytes,
Counter *  histogram[ActiveChannels],
unsigned int  levels[ActiveChannels],
Level *  level_values[ActiveChannels],
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Computes histograms from a two-dimensional region of multi-channel samples using the specified bin boundary levels.

  • The two-dimensional region of interest within samples can be specified using the columns, rows and row_stride_bytes parameters.
  • The row stride must be a whole multiple of the sample data type size, i.e., (row_stride_bytes % sizeof(std::iterator_traits<SampleIterator>::value_type)) == 0.
  • The input is a sequence of pixel structures, where each pixel comprises a record of Channels consecutive data samples (e.g., Channels = 4 for RGBA samples).
  • The first ActiveChannels channels of total Channels channels will be used for computing histograms (e.g., ActiveChannels = 3 for computing histograms of only RGB from RGBA samples).
  • For channeli the number of histogram bins is (levels[i] - 1).
  • For channeli the range for binj is [level_values[i][j], level_values[i][j+1]).
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
Template Parameters
Channels- number of channels interleaved in the input samples.
ActiveChannels- number of channels being used for computing histograms.
Config- [optional] configuration of the primitive. It can be histogram_config or a custom class with the same members.
SampleIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
Counter- integer type for histogram bin counters.
Level- type of histogram boundaries (levels)
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]samples- iterator to the first element in the range of input samples.
[in]columns- number of elements in each row of the region.
[in]rows- number of rows of the region.
[in]row_stride_bytes- number of bytes between starts of consecutive rows of the region.
[out]histogram- pointers to the first element in the histogram range, one for each active channel.
[in]levels- number of boundaries (levels) for histogram bins in each active channel.
[in]level_values- pointer to the array of bin boundaries for each active channel.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful histogram operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example histograms for 3 channels (RGB) are computed on an array of 8-bit RGBA samples.

// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int columns; // e.g., 4
unsigned int rows; // e.g., 2
size_t row_stride_bytes; // e.g., 5 * sizeof(unsigned char)
unsigned char * samples; // e.g., [(0, 0, 80, 0), (120, 0, 80, 0), (123, 0, 82, 0), (10, 1, 83, 0), (-, -, -, -),
// (51, 1, 8, 0), (52, 1, 8, 0), (53, 0, 81, 0), (54, 50, 81, 0), (-, -, -, -)]
int * histogram[3]; // 3 empty arrays
unsigned int levels[3]; // e.g., [4, 4, 3]
int * level_values[3]; // e.g., [[0, 50, 100, 200], [0, 20, 40, 60], [0, 10, 100]]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
rocprim::multi_histogram_range<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, level_values
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// compute histograms
rocprim::multi_histogram_range<4, 3>(
temporary_storage_ptr, temporary_storage_size_bytes,
samples, columns, rows, row_stride_bytes,
histogram, levels, level_values
);
// histogram: [[2, 4, 2], [7, 0, 1], [2, 6]]

◆ partition() [1/2]

template<class Config = default_config, class InputIterator , class FlagIterator , class OutputIterator , class SelectedCountOutputIterator >
hipError_t partition ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
FlagIterator  flags,
OutputIterator  output,
SelectedCountOutputIterator  selected_count_output,
const size_t  size,
const hipStream_t  stream = 0,
const bool  debug_synchronous = false 
)
inline

Parallel select primitive for device level using range of flags.

Performs a device-wide partition based on input flags. Partition copies the values from input to output in such a way that all values for which the corresponding items from /p flags are true (or can be implicitly converted to true) precede the elements for which the corresponding items from /p flags are false.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input, flags and output must have at least size elements.
  • Range specified by selected_count_output must have at least 1 element.
  • Values of flag range should be implicitly convertible to bool type.
  • Relative order is preserved for the elements for which the corresponding values from flags are true. Other elements are copied in reverse order.
Template Parameters
Config- [optional] configuration of the primitive. It can be select_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. It can be a simple pointer type.
FlagIterator- random-access iterator type of the flag range. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
SelectedCountOutputIterator- random-access iterator type of the selected_count_output value. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the select operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to select values from.
[in]flags- iterator to the selection flag corresponding to the first element from input range.
[out]output- iterator to the first element in the output range.
[out]selected_count_output- iterator to the total number of selected values (length of output).
[in]size- number of element in the input range.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level partition operation is performed on an array of integer values with array of chars used as flags.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
int * output; // empty array of 8 elements
size_t * output_count; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, flags,
output, output_count,
input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform partition
temporary_storage_ptr, temporary_storage_size_bytes,
input, flags,
output, output_count,
input_size
);
// output: [2, 3, 6, 8, 7, 5, 4, 1]
// output_count: 4

◆ partition() [2/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class SelectedCountOutputIterator , class UnaryPredicate >
hipError_t partition ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
SelectedCountOutputIterator  selected_count_output,
const size_t  size,
UnaryPredicate  predicate,
const hipStream_t  stream = 0,
const bool  debug_synchronous = false 
)
inline

Parallel select primitive for device level using selection predicate.

Performs a device-wide partition using selection predicate. Partition copies the values from input to output in such a way that all values for which the predicate returns true precede the elements for which it returns false.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input, flags and output must have at least size elements.
  • Range specified by selected_count_output must have at least 1 element.
  • Relative order is preserved for the elements for which the predicate returns true. Other elements are copied in reverse order.
Template Parameters
Config- [optional] configuration of the primitive. It can be select_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
SelectedCountOutputIterator- random-access iterator type of the selected_count_output value. It can be a simple pointer type.
UnaryPredicate- type of a unary selection predicate.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the select operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to select values from.
[out]output- iterator to the first element in the output range.
[out]selected_count_output- iterator to the total number of selected values (length of output).
[in]size- number of element in the input range.
[in]predicate- unary function object which returns /p true if the element should be ordered before other elements. The signature of the function should be equivalent to the following: bool f(const T &a);. The signature does not need to have const &, but function object must not modify the object passed to it.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level partition operation is performed on an array of integer values, even values are copied before odd values.

auto predicate =
[] __device__ (int a) -> bool
{
return (a%2) == 0;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * output; // empty array of 8 elements
size_t * output_count; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input,
output, output_count,
input_size,
predicate
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform partition
temporary_storage_ptr, temporary_storage_size_bytes,
input,
output, output_count,
input_size,
predicate
);
// output: [2, 4, 6, 8, 7, 5, 3, 1]
// output_count: 4

◆ radix_sort_keys() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_keys ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort primitive for device level.

radix_sort_keys function performs a device-wide radix sort of keys. Function sorts input keys in ascending order.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input and keys_output must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed on an array of float values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
float * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// keys_output: [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]

◆ radix_sort_keys() [2/2]

template<class Config = default_config, class Key >
hipError_t radix_sort_keys ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort primitive for device level.

radix_sort_keys function performs a device-wide radix sort of keys. Function sorts input keys in ascending order.

Overview
  • The contents of both buffers of keys may be altered by the sorting function.
  • current() of keys is used as the input.
  • The function will update current() of keys to point to the buffer that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed on an array of float values.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
float * tmp; // empty array of 8 elements
// Create double-buffer
rocprim::double_buffer<float> keys(input, tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size
);
// keys.current(): [0.08, 0.2, 0.3, 0.4, 0.6, 0.65, 0.7, 1]

◆ radix_sort_keys_desc() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_keys_desc ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort primitive for device level.

radix_sort_keys_desc function performs a device-wide radix sort of keys. Function sorts input keys in descending order.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input and keys_output must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed on an array of integer values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
int * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size
);
// keys_output: [8, 7, 6, 5, 4, 3, 2, 1]

◆ radix_sort_keys_desc() [2/2]

template<class Config = default_config, class Key >
hipError_t radix_sort_keys_desc ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort primitive for device level.

radix_sort_keys_desc function performs a device-wide radix sort of keys. Function sorts input keys in descending order.

Overview
  • The contents of both buffers of keys may be altered by the sorting function.
  • current() of keys is used as the input.
  • The function will update current() of keys to point to the buffer that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed on an array of integer values.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
int * tmp; // empty array of 8 elements
// Create double-buffer
rocprim::double_buffer<int> keys(input, tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size
);
// keys.current(): [8, 7, 6, 5, 4, 3, 2, 1]

◆ radix_sort_pairs() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_pairs ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort-by-key primitive for device level.

radix_sort_pairs_desc function performs a device-wide radix sort of (key, value) pairs. Function sorts input pairs in ascending order of keys.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input, keys_output, values_input and values_output must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]values_input- pointer to the first element in the range to sort.
[out]values_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed where input keys are represented by an array of unsigned integers and input values by an array of doubles.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
unsigned int * keys_output; // empty array of 8 elements
double * values_output; // empty array of 8 elements
// Keys are in range [0; 8], so we can limit compared bit to bits on indexes
// 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
// is set to 5.
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size, 0, 5
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size, 0, 5
);
// keys_output: [ 1, 1, 3, 4, 5, 6, 7, 8]
// values_output: [-1, -2, 2, 3, -4, -5, 7, -8]

◆ radix_sort_pairs() [2/2]

template<class Config = default_config, class Key , class Value >
hipError_t radix_sort_pairs ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
double_buffer< Value > &  values,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort-by-key primitive for device level.

radix_sort_pairs_desc function performs a device-wide radix sort of (key, value) pairs. Function sorts input pairs in ascending order of keys.

Overview
  • The contents of both buffers of keys and values may be altered by the sorting function.
  • current() of keys and values are used as the input.
  • The function will update current() of keys and values to point to buffers that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Value- value type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in,out]values- reference to the double-buffer of values, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed where input keys are represented by an array of unsigned integers and input values by an array of doubles.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
unsigned int * keys_tmp; // empty array of 8 elements
double* values_tmp; // empty array of 8 elements
// Create double-buffers
rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
rocprim::double_buffer<double> values(values_input, values_tmp);
// Keys are in range [0; 8], so we can limit compared bit to bits on indexes
// 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
// is set to 5.
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
0, 5
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
0, 5
);
// keys.current(): [ 1, 1, 3, 4, 5, 6, 7, 8]
// values.current(): [-1, -2, 2, 3, -4, -5, 7, -8]

◆ radix_sort_pairs_desc() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t radix_sort_pairs_desc ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort-by-key primitive for device level.

radix_sort_pairs_desc function performs a device-wide radix sort of (key, value) pairs. Function sorts input pairs in descending order of keys.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input, keys_output, values_input and values_output must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]values_input- pointer to the first element in the range to sort.
[out]values_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed where input keys are represented by an array of integers and input values by an array of doubles.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
int * keys_output; // empty array of 8 elements
double * values_output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size
);
// keys_output: [ 8, 7, 6, 5, 4, 3, 1, 1]
// values_output: [-8, 7, -5, -4, 3, 2, -1, -2]

◆ radix_sort_pairs_desc() [2/2]

template<class Config = default_config, class Key , class Value >
hipError_t radix_sort_pairs_desc ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
double_buffer< Value > &  values,
unsigned int  size,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort-by-key primitive for device level.

radix_sort_pairs_desc function performs a device-wide radix sort of (key, value) pairs. Function sorts input pairs in descending order of keys.

Overview
  • The contents of both buffers of keys and values may be altered by the sorting function.
  • current() of keys and values are used as the input.
  • The function will update current() of keys and values to point to buffers that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Value- value type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in,out]values- reference to the double-buffer of values, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed where input keys are represented by an array of integers and input values by an array of doubles.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
int * keys_tmp; // empty array of 8 elements
double * values_tmp; // empty array of 8 elements
// Create double-buffers
rocprim::double_buffer<int> keys(keys_input, keys_tmp);
rocprim::double_buffer<double> values(values_input, values_tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size
);
// keys.current(): [ 8, 7, 6, 5, 4, 3, 1, 1]
// values.current(): [-8, 7, -5, -4, 3, 2, -1, -2]

◆ reduce() [1/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t reduce ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
const InitValueType  initial_value,
const size_t  size,
BinaryFunction  reduce_op = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel reduction primitive for device level.

reduce function performs a device-wide reduction operation using binary reduce_op operator.

Overview
  • Does not support non-commutative reduction operators. Reduction operator should also be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input must have at least size elements, while output only needs one element.
Template Parameters
Config- [optional] configuration of the primitive. It can be reduce_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
InitValueType- type of the initial value.
BinaryFunction- type of binary function used for reduction. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to reduce.
[out]output- iterator to the first element in the output range. It can be same as input.
[in]initial_value- initial value to start the reduction.
[in]size- number of element in the input range.
[in]reduce_op- binary operation function object that will be used for reduction. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful reduction; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level min-reduction operation is performed on an array of integer values (shorts are reduced into ints) using custom operator.

// custom reduce function
auto min_op =
[] __device__ (int a, int b) -> int
{
return a < b ? a : b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
int * output; // empty array of 1 element
int start_value; // e.g., 9
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, start_value, input_size, min_op
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform reduce
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, start_value, input_size, min_op
);
// output: [1]

◆ reduce() [2/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t reduce ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
const size_t  size,
BinaryFunction  reduce_op = BinaryFunction(),
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel reduce primitive for device level.

reduce function performs a device-wide reduction operation using binary reduce_op operator.

Overview
  • Does not support non-commutative reduction operators. Reduction operator should also be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input must have at least size elements, while output only needs one element.
Template Parameters
Config- [optional] configuration of the primitive. It can be reduce_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for reduction. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to reduce.
[out]output- iterator to the first element in the output range. It can be same as input.
[in]size- number of element in the input range.
[in]reduce_op- binary operation function object that will be used for reduction. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is BinaryFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful reduction; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level sum operation is performed on an array of integer values (shorts are reduced into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * output; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size, rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform reduce
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size, rocprim::plus<int>()
);
// output: [36]

◆ reduce_by_key()

template<class Config = default_config, class KeysInputIterator , class ValuesInputIterator , class UniqueOutputIterator , class AggregatesOutputIterator , class UniqueCountOutputIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<ValuesInputIterator>::value_type>, class KeyCompareFunction = ::rocprim::equal_to<typename std::iterator_traits<KeysInputIterator>::value_type>>
hipError_t reduce_by_key ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
ValuesInputIterator  values_input,
unsigned int  size,
UniqueOutputIterator  unique_output,
AggregatesOutputIterator  aggregates_output,
UniqueCountOutputIterator  unique_count_output,
BinaryFunction  reduce_op = BinaryFunction(),
KeyCompareFunction  key_compare_op = KeyCompareFunction(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel reduce-by-key primitive for device level.

reduce_by_key function performs a device-wide reduction operation of groups of consecutive values having the same key using binary reduce_op operator. The first key of each group is copied to unique_output and reduction of the group is written to aggregates_output. The total number of group is written to unique_count_output.

Overview
  • Supports non-commutative reduction operators. However, a reduction operator should be associative. When used with non-associative functions the results may be non-deterministic and/or vary in precision.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by keys_input and values_input must have at least size elements.
  • Range specified by unique_count_output must have at least 1 element.
  • Ranges specified by unique_output and aggregates_output must have at least *unique_count_output (i.e. the number of unique keys) elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be reduce_by_key_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
UniqueOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
AggregatesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
UniqueCountOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for reduction. Default type is rocprim::plus<T>, where T is a value_type of ValuesInputIterator.
KeyCompareFunction- type of binary function used to determine keys equality. Default type is rocprim::equal_to<T>, where T is a value_type of KeysInputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- iterator to the first element in the range of keys.
[in]values_input- iterator to the first element in the range of values to reduce.
[in]size- number of element in the input range.
[out]unique_output- iterator to the first element in the output range of unique keys.
[out]aggregates_output- iterator to the first element in the output range of reductions.
[out]unique_count_output- iterator to total number of groups.
[in]reduce_op- binary operation function object that will be used for reduction. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is BinaryFunction().
[in]key_compare_op- binary operation function object that will be used to determine keys equality. The signature of the function should be equivalent to the following: bool f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. Default is KeyCompareFunction().
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful reduction; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level sum operation is performed on an array of integer values and integer keys.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * keys_input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
int * values_input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * unique_output; // empty array of at least 4 elements
int * aggregates_output; // empty array of at least 4 elements
int * unique_count_output; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input, input_size,
unique_output, aggregates_output, unique_count_output
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform reduction
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, values_input, input_size,
unique_output, aggregates_output, unique_count_output
);
// unique_output: [1, 2, 10, 88]
// aggregates_output: [6, 4, 18, 8]
// unique_count_output: [4]

◆ run_length_encode()

template<class Config = default_config, class InputIterator , class UniqueOutputIterator , class CountsOutputIterator , class RunsCountOutputIterator >
hipError_t run_length_encode ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
unsigned int  size,
UniqueOutputIterator  unique_output,
CountsOutputIterator  counts_output,
RunsCountOutputIterator  runs_count_output,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel run-length encoding for device level.

run_length_encode function performs a device-wide run-length encoding of runs (groups) of consecutive values. The first value of each run is copied to unique_output and the length of the run is written to counts_output. The total number of runs is written to runs_count_output.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Range specified by input must have at least size elements.
  • Range specified by runs_count_output must have at least 1 element.
  • Ranges specified by unique_output and counts_output must have at least *runs_count_output (i.e. the number of runs) elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be run_length_encode_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
UniqueOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
CountsOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
RunsCountOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range of values.
[in]size- number of element in the input range.
[out]unique_output- iterator to the first element in the output range of unique values.
[out]counts_output- iterator to the first element in the output range of lenghts.
[out]runs_count_output- iterator to total number of runs.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level run-length encoding operation is performed on an array of integer values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
int * unique_output; // empty array of at least 4 elements
int * counts_output; // empty array of at least 4 elements
int * runs_count_output; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, input_size,
unique_output, counts_output, runs_count_output
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform encoding
temporary_storage_ptr, temporary_storage_size_bytes,
input, input_size,
unique_output, counts_output, runs_count_output
);
// unique_output: [1, 2, 10, 88]
// counts_output: [3, 1, 3, 1]
// runs_count_output: [4]

◆ run_length_encode_non_trivial_runs()

template<class Config = default_config, class InputIterator , class OffsetsOutputIterator , class CountsOutputIterator , class RunsCountOutputIterator >
hipError_t run_length_encode_non_trivial_runs ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
unsigned int  size,
OffsetsOutputIterator  offsets_output,
CountsOutputIterator  counts_output,
RunsCountOutputIterator  runs_count_output,
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel run-length encoding of non-trivial runs for device level.

run_length_encode_non_trivial_runs function performs a device-wide run-length encoding of non-trivial runs (groups) of consecutive values (groups of more than one element). The offset of the first value of each non-trivial run is copied to offsets_output and the length of the run (the count of elements) is written to counts_output. The total number of non-trivial runs is written to runs_count_output.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Range specified by input must have at least size elements.
  • Range specified by runs_count_output must have at least 1 element.
  • Ranges specified by offsets_output and counts_output must have at least *runs_count_output (i.e. the number of non-trivial runs) elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be run_length_encode_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OffsetsOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
CountsOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
RunsCountOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range of values.
[in]size- number of element in the input range.
[out]offsets_output- iterator to the first element in the output range of offsets.
[out]counts_output- iterator to the first element in the output range of lenghts.
[out]runs_count_output- iterator to total number of runs.
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful operation; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level run-length encoding of non-trivial runs is performed on an array of integer values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 1, 1, 2, 10, 10, 10, 88]
int * offsets_output; // empty array of at least 2 elements
int * counts_output; // empty array of at least 2 elements
int * runs_count_output; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, input_size,
offsets_output, counts_output, runs_count_output
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform encoding
temporary_storage_ptr, temporary_storage_size_bytes,
input, input_size,
offsets_output, counts_output, runs_count_output
);
// offsets_output: [0, 4]
// counts_output: [3, 3]
// runs_count_output: [2]

◆ segmented_exclusive_scan() [1/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class InitValueType , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_exclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
const InitValueType  initial_value,
BinaryFunction  scan_op = BinaryFunction(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel segmented exclusive scan primitive for device level.

segmented_exclusive_scan function performs a device-wide exclusive scan operation across multiple sequences from input using binary scan_op operator.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input and output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
InitValueType- type of the initial value.
BinaryFunction- type of binary function used for scan operation. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]initial_value- initial value to start the scan.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level segmented exclusive min-scan operation is performed on an array of integer values (shorts are scanned into ints) using custom operator.

// custom scan function
auto min_op =
[] __device__ (int a, int b) -> int
{
return a < b ? a : b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
int start_value; // e.g., 9
short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
int * output; // empty array of 8 elements
size_t segments; // e.g., 3
int * offsets; // e.g. [0, 2, 4, 8]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, segments, offsets, offsets + 1
start_value, min_op
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, segments, offsets, offsets + 1
start_value, min_op
);
// output: [9, 4, 9, 6, 9, 5, 1, 1]

◆ segmented_exclusive_scan() [2/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class InitValueType , class HeadFlagIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_exclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
HeadFlagIterator  head_flags,
const InitValueType  initial_value,
size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel segmented exclusive scan primitive for device level.

segmented_exclusive_scan function performs a device-wide exclusive scan operation across multiple sequences from input using binary scan_op operator. Beginnings of the segments should be marked by value convertible to true at corresponding position in flags range.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input, output, and flags must have at least size elements.
  • value_type of HeadFlagIterator iterator should be convertible to bool type.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
HeadFlagIterator- random-access iterator type of flags. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
InitValueType- type of the initial value.
BinaryFunction- type of binary function used for scan operation. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range.
[in]head_flags- iterator to the first element in the range of head flags marking beginnings of each segment in the input range.
[in]initial_value- initial value to start the scan.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level segmented exclusive sum operation is performed on an array of integer values (shorts are added into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t size; // e.g., 8
short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
int init; // e.g., 9
int * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, flags, init, size, ::rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, flags, init, size, ::rocprim::plus<int>()
);
// output: [9, 10, 12, 9, 13, 9, 15, 22]

◆ segmented_inclusive_scan() [1/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_inclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
BinaryFunction  scan_op = BinaryFunction(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel segmented inclusive scan primitive for device level.

segmented_inclusive_scan function performs a device-wide inclusive scan operation across multiple sequences from input using binary scan_op operator.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input and output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for scan operation. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level segmented inclusive min-scan operation is performed on an array of integer values (shorts are scanned into ints) using custom operator.

// custom scan function
auto min_op =
[] __device__ (int a, int b) -> int
{
return a < b ? a : b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
int * output; // empty array of 8 elements
size_t segments; // e.g., 3
int * offsets; // e.g. [0, 2, 4, 8]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, segments, offsets, offsets + 1, min_op
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, segments, offsets, offsets + 1, min_op
);
// output: [4, 4, 6, 2, 5, 1, 1, 1]

◆ segmented_inclusive_scan() [2/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class HeadFlagIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t segmented_inclusive_scan ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
HeadFlagIterator  head_flags,
size_t  size,
BinaryFunction  scan_op = BinaryFunction(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel segmented inclusive scan primitive for device level.

segmented_inclusive_scan function performs a device-wide inclusive scan operation across multiple sequences from input using binary scan_op operator. Beginnings of the segments should be marked by value convertible to true at corresponding position in flags range.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input, output, and flags must have at least size elements.
  • value_type of HeadFlagIterator iterator should be convertible to bool type.
Template Parameters
Config- [optional] configuration of the primitive. It can be scan_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
HeadFlagIterator- random-access iterator type of flags. Must meet the requirements of a C++ RandomAccessIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for scan operation. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the scan operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to scan.
[out]output- iterator to the first element in the output range.
[in]head_flags- iterator to the first element in the range of head flags marking beginnings of each segment in the input range.
[in]size- number of element in the input range.
[in]scan_op- binary operation function object that will be used for scan. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful scan; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level segmented inclusive sum operation is performed on an array of integer values (shorts are added into ints).

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t size; // e.g., 8
short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * flags; // e.g., [1, 0, 0, 1, 0, 1, 0, 0]
int * output; // empty array of 8 elements
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, flags, size, ::rocprim::plus<int>()
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform scan
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, flags, size, ::rocprim::plus<int>()
);
// output: [1, 3, 6, 4, 9, 6, 13, 21]

◆ segmented_radix_sort_keys() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_keys ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort primitive for device level.

segmented_radix_sort_keys function performs a device-wide radix sort across multiple, non-overlapping sequences of keys. Function sorts input keys in ascending order.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input and keys_output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed on an array of float values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
float * output; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size,
segments, offsets, offsets + 1
);
// keys_output: [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]

◆ segmented_radix_sort_keys() [2/2]

template<class Config = default_config, class Key , class OffsetIterator >
hipError_t segmented_radix_sort_keys ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort primitive for device level.

segmented_radix_sort_keys function performs a device-wide radix sort across multiple, non-overlapping sequences of keys. Function sorts input keys in ascending order.

Overview
  • The contents of both buffers of keys may be altered by the sorting function.
  • current() of keys is used as the input.
  • The function will update current() of keys to point to the buffer that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed on an array of float values.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
float * input; // e.g., [0.6, 0.3, 0.65, 0.4, 0.2, 0.08, 1, 0.7]
float * tmp; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
// Create double-buffer
rocprim::double_buffer<float> keys(input, tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size,
segments, offsets, offsets + 1
);
// keys.current(): [0.3, 0.6, 0.65, 0.08, 0.2, 0.4, 0.7, 1]

◆ segmented_radix_sort_keys_desc() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_keys_desc ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort primitive for device level.

segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of keys. Function sorts input keys in descending order.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input and keys_output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed on an array of integer values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
int * output; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, input_size,
segments, offsets, offsets + 1
);
// keys_output: [6, 3, 5, 8, 7, 4, 2, 1]

◆ segmented_radix_sort_keys_desc() [2/2]

template<class Config = default_config, class Key , class OffsetIterator >
hipError_t segmented_radix_sort_keys_desc ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort primitive for device level.

segmented_radix_sort_keys_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of keys. Function sorts input keys in descending order.

Overview
  • The contents of both buffers of keys may be altered by the sorting function.
  • current() of keys is used as the input.
  • The function will update current() of keys to point to the buffer that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed on an array of integer values.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [6, 3, 5, 4, 2, 8, 1, 7]
int * tmp; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
// Create double-buffer
rocprim::double_buffer<int> keys(input, tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, input_size,
segments, offsets, offsets + 1
);
// keys.current(): [6, 3, 5, 8, 7, 4, 2, 1]

◆ segmented_radix_sort_pairs() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_pairs ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort-by-key primitive for device level.

segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input, keys_output, values_input and values_output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]values_input- pointer to the first element in the range to sort.
[out]values_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed where input keys are represented by an array of unsigned integers and input values by an array of doubles.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
unsigned int * keys_output; // empty array of 8 elements
double * values_output; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
// Keys are in range [0; 8], so we can limit compared bit to bits on indexes
// 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
// is set to 5.
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output, input_size,
segments, offsets, offsets + 1,
0, 5
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output, input_size,
segments, offsets, offsets + 1,
0, 5
);
// keys_output: [3, 6, 5, 1, 1, 4, 7, 8]
// values_output: [2, -5, -4, -1, -2, 3, 7, -8]

◆ segmented_radix_sort_pairs() [2/2]

template<class Config = default_config, class Key , class Value , class OffsetIterator >
hipError_t segmented_radix_sort_pairs ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
double_buffer< Value > &  values,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel ascending radix sort-by-key primitive for device level.

segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of (key, value) pairs. Function sorts input pairs in ascending order of keys.

Overview
  • The contents of both buffers of keys and values may be altered by the sorting function.
  • current() of keys and values are used as the input.
  • The function will update current() of keys and values to point to buffers that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Value- value type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in,out]values- reference to the double-buffer of values, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level ascending radix sort is performed where input keys are represented by an array of unsigned integers and input values by an array of doubles.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
unsigned int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
unsigned int * keys_tmp; // empty array of 8 elements
double* values_tmp; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
// Create double-buffers
rocprim::double_buffer<unsigned int> keys(keys_input, keys_tmp);
rocprim::double_buffer<double> values(values_input, values_tmp);
// Keys are in range [0; 8], so we can limit compared bit to bits on indexes
// 0, 1, 2, 3, and 4. In order to do this begin_bit is set to 0 and end_bit
// is set to 5.
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
segments, offsets, offsets + 1
0, 5
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
segments, offsets, offsets + 1
0, 5
);
// keys.current(): [3, 6, 5, 1, 1, 4, 7, 8]
// values.current(): [2, -5, -4, -1, -2, 3, 7, -8]

◆ segmented_radix_sort_pairs_desc() [1/2]

template<class Config = default_config, class KeysInputIterator , class KeysOutputIterator , class ValuesInputIterator , class ValuesOutputIterator , class OffsetIterator , class Key = typename std::iterator_traits<KeysInputIterator>::value_type>
hipError_t segmented_radix_sort_pairs_desc ( void *  temporary_storage,
size_t &  storage_size,
KeysInputIterator  keys_input,
KeysOutputIterator  keys_output,
ValuesInputIterator  values_input,
ValuesOutputIterator  values_output,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort-by-key primitive for device level.

segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.

Overview
  • The contents of the inputs are not altered by the sorting function.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Key type (a value_type of KeysInputIterator and KeysOutputIterator) must be an arithmetic type (that is, an integral type or a floating-point type).
  • Ranges specified by keys_input, keys_output, values_input and values_output must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
KeysInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
KeysOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
ValuesInputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
ValuesOutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]keys_input- pointer to the first element in the range to sort.
[out]keys_output- pointer to the first element in the output range.
[in]values_input- pointer to the first element in the range to sort.
[out]values_output- pointer to the first element in the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed where input keys are represented by an array of integers and input values by an array of doubles.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
int * keys_output; // empty array of 8 elements
double * values_output; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys_input, keys_output, values_input, values_output,
input_size,
segments, offsets, offsets + 1
);
// keys_output: [ 6, 3, 5, 8, 7, 4, 1, 1]
// values_output: [-5, 2, -4, -8, 7, 3, -1, -2]

◆ segmented_radix_sort_pairs_desc() [2/2]

template<class Config = default_config, class Key , class Value , class OffsetIterator >
hipError_t segmented_radix_sort_pairs_desc ( void *  temporary_storage,
size_t &  storage_size,
double_buffer< Key > &  keys,
double_buffer< Value > &  values,
unsigned int  size,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
unsigned int  begin_bit = 0,
unsigned int  end_bit = 8 * sizeof(Key),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel descending radix sort-by-key primitive for device level.

segmented_radix_sort_pairs_desc function performs a device-wide radix sort across multiple, non-overlapping sequences of (key, value) pairs. Function sorts input pairs in descending order of keys.

Overview
  • The contents of both buffers of keys and values may be altered by the sorting function.
  • current() of keys and values are used as the input.
  • The function will update current() of keys and values to point to buffers that contains the output range.
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • The function requires small temporary_storage as it does not need a temporary buffer of size elements.
  • Key type must be an arithmetic type (that is, an integral type or a floating-point type).
  • Buffers of keys must have at least size elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
  • If Key is an integer type and the range of keys is known in advance, the performance can be improved by setting begin_bit and end_bit, for example if all keys are in range [100, 10000], begin_bit = 0 and end_bit = 14 will cover the whole range.
Template Parameters
Config- [optional] configuration of the primitive. It can be segmented_radix_sort_config or a custom class with the same members.
Key- key type. Must be an integral type or a floating-point type.
Value- value type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the sort operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in,out]keys- reference to the double-buffer of keys, its current() contains the input range and will be updated to point to the output range.
[in,out]values- reference to the double-buffer of values, its current() contains the input range and will be updated to point to the output range.
[in]size- number of element in the input range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]begin_bit- [optional] index of the first (least significant) bit used in key comparison. Must be in range [0; 8 * sizeof(Key)). Default value: 0.
[in]end_bit- [optional] past-the-end index (most significant) bit used in key comparison. Must be in range (begin_bit; 8 * sizeof(Key)]. Default value: 8 * sizeof(Key).
[in]stream- [optional] HIP stream object. Default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. Default value is false.
Returns
hipSuccess (0) after successful sort; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level descending radix sort is performed where input keys are represented by an array of integers and input values by an array of doubles.

// Prepare input and tmp (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * keys_input; // e.g., [ 6, 3, 5, 4, 1, 8, 1, 7]
double * values_input; // e.g., [-5, 2, -4, 3, -1, -8, -2, 7]
int * keys_tmp; // empty array of 8 elements
double * values_tmp; // empty array of 8 elements
unsigned int segments; // e.g., 3
int * offsets; // e.g. [0, 2, 3, 8]
// Create double-buffers
rocprim::double_buffer<int> keys(keys_input, keys_tmp);
rocprim::double_buffer<double> values(values_input, values_tmp);
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
segments, offsets, offsets + 1
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform sort
temporary_storage_ptr, temporary_storage_size_bytes,
keys, values, input_size,
segments, offsets, offsets + 1
);
// keys.current(): [ 6, 3, 5, 8, 7, 4, 1, 1]
// values.current(): [-5, 2, -4, -8, 7, 3, -1, -2]

◆ segmented_reduce()

template<class Config = default_config, class InputIterator , class OutputIterator , class OffsetIterator , class BinaryFunction = ::rocprim::plus<typename std::iterator_traits<InputIterator>::value_type>, class InitValueType = typename std::iterator_traits<InputIterator>::value_type>
hipError_t segmented_reduce ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
unsigned int  segments,
OffsetIterator  begin_offsets,
OffsetIterator  end_offsets,
BinaryFunction  reduce_op = BinaryFunction(),
InitValueType  initial_value = InitValueType(),
hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel segmented reduction primitive for device level.

segmented_reduce function performs a device-wide reduction operation across multiple sequences using binary reduce_op operator.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input must have at least size elements, output must have segments elements.
  • Ranges specified by begin_offsets and end_offsets must have at least segments elements. They may use the same sequence offsets of at least segments + 1 elements: offsets for begin_offsets and offsets + 1 for end_offsets.
Template Parameters
Config- [optional] configuration of the primitive. It can be reduce_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
OffsetIterator- random-access iterator type of segment offsets. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for reduction. Default type is rocprim::plus<T>, where T is a value_type of InputIterator.
InitValueType- type of the initial value.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the reduction operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to reduce.
[out]output- iterator to the first element in the output range.
[in]segments- number of segments in the input range.
[in]begin_offsets- iterator to the first element in the range of beginning offsets.
[in]end_offsets- iterator to the first element in the range of ending offsets.
[in]initial_value- initial value to start the reduction.
[in]reduce_op- binary operation function object that will be used for reduction. The signature of the function should be equivalent to the following: T f(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the objects passed to it. The default value is BinaryFunction().
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Returns
hipSuccess (0) after successful reduction; otherwise a HIP runtime error of type hipError_t.
Example

In this example a device-level segmented min-reduction operation is performed on an array of integer values (shorts are reduced into ints) using custom operator.

// custom reduce function
auto min_op =
[] __device__ (int a, int b) -> int
{
return a < b ? a : b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
unsigned int segments; // e.g., 3
short * input; // e.g., [4, 7, 6, 2, 5, 1, 3, 8]
int * output; // empty array of 3 elements
int * offsets; // e.g. [0, 2, 3, 8]
int init_value; // e.g., 9
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output,
segments, offsets, offsets + 1,
min_op, init_value
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform segmented reduction
temporary_storage_ptr, temporary_storage_size_bytes,
input, output,
segments, offsets, offsets + 1,
min_op, init_value
);
// output: [4, 6, 1]

◆ select() [1/2]

template<class Config = default_config, class InputIterator , class FlagIterator , class OutputIterator , class SelectedCountOutputIterator >
hipError_t select ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
FlagIterator  flags,
OutputIterator  output,
SelectedCountOutputIterator  selected_count_output,
const size_t  size,
const hipStream_t  stream = 0,
const bool  debug_synchronous = false 
)
inline

Parallel select primitive for device level using range of flags.

Performs a device-wide selection based on input flags. If a value from input should be selected and copied into output range the corresponding item from flags range should be set to such value that can be implicitly converted to true (bool type).

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Ranges specified by input and flags must have at least size elements.
  • Range specified by output must have at least so many elements, that all positively flagged values can be copied into it.
  • Range specified by selected_count_output must have at least 1 element.
  • Values of flag range should be implicitly convertible to bool type.
Template Parameters
Config- [optional] configuration of the primitive. It can be select_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. It can be a simple pointer type.
FlagIterator- random-access iterator type of the flag range. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
SelectedCountOutputIterator- random-access iterator type of the selected_count_output value. It can be a simple pointer type.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the select operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to select values from.
[in]flags- iterator to the selection flag corresponding to the first element from input range.
[out]output- iterator to the first element in the output range.
[out]selected_count_output- iterator to the total number of selected values (length of output).
[in]size- number of element in the input range.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level select operation is performed on an array of integer values with array of chars used as flags.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
char * flags; // e.g., [0, 1, 1, 0, 0, 1, 0, 1]
int * output; // empty array of 8 elements
size_t * output_count; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, flags,
output, output_count,
input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform selection
temporary_storage_ptr, temporary_storage_size_bytes,
input, flags,
output, output_count,
input_size
);
// output: [2, 3, 6, 8]
// output_count: 4

◆ select() [2/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class SelectedCountOutputIterator , class UnaryPredicate >
hipError_t select ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
SelectedCountOutputIterator  selected_count_output,
const size_t  size,
UnaryPredicate  predicate,
const hipStream_t  stream = 0,
const bool  debug_synchronous = false 
)
inline

Parallel select primitive for device level using selection operator.

Performs a device-wide selection using selection operator. If a value x from input should be selected and copied into output range, then predicate(x) has to return true.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Range specified by input must have at least size elements.
  • Range specified by output must have at least so many elements, that all selected values can be copied into it.
  • Range specified by selected_count_output must have at least 1 element.
Template Parameters
Config- [optional] configuration of the primitive. It can be select_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
SelectedCountOutputIterator- random-access iterator type of the selected_count_output value. It can be a simple pointer type.
UnaryPredicate- type of a unary selection predicate.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the select operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to select values from.
[out]output- iterator to the first element in the output range.
[out]selected_count_output- iterator to the total number of selected values (length of output).
[in]size- number of element in the input range.
[in]predicate- unary function object that will be used for selecting values. The signature of the function should be equivalent to the following: bool f(const T &a);. The signature does not need to have const &, but function object must not modify the object passed to it.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level select operation is performed on an array of integer values, only even values are selected.

auto predicate =
[] __device__ (int a) -> bool
{
return (a%2) == 0;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * output; // empty array of 8 elements
size_t * output_count; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, output_count,
predicate, input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform selection
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, output_count,
predicate, input_size
);
// output: [2, 4, 6, 8]
// output_count: 4

◆ transform() [1/2]

template<class Config = default_config, class InputIterator , class OutputIterator , class UnaryFunction >
hipError_t transform ( InputIterator  input,
OutputIterator  output,
const size_t  size,
UnaryFunction  transform_op,
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel transform primitive for device level.

transform function performs a device-wide transformation operation using unary transform_op operator.

Overview
  • Ranges specified by input and output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be transform_config or a custom class with the same members.
InputIterator- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
UnaryFunction- type of unary function used for transform.
Parameters
[in]input- iterator to the first element in the range to transform.
[out]output- iterator to the first element in the output range.
[in]size- number of element in the input range.
[in]transform_op- unary operation function object that will be used for transform. The signature of the function should be equivalent to the following: U f(const T &a);. The signature does not need to have const &, but function object must not modify the object passed to it.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level transform operation is performed on an array of integer values (shorts are transformed into ints).

// custom transform function
[] __device__ (int a) -> int
{
return a + 5;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
short * input; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int * output; // empty array of 8 elements
// perform transform
input, output, input_size, transform_op
);
// output: [6, 7, 8, 9, 10, 11, 12, 13]

◆ transform() [2/2]

template<class Config = default_config, class InputIterator1 , class InputIterator2 , class OutputIterator , class BinaryFunction >
hipError_t transform ( InputIterator1  input1,
InputIterator2  input2,
OutputIterator  output,
const size_t  size,
BinaryFunction  transform_op,
const hipStream_t  stream = 0,
bool  debug_synchronous = false 
)
inline

Parallel device-level transform primitive for two inputs.

transform function performs a device-wide transformation operation on two input ranges using binary transform_op operator.

Overview
  • Ranges specified by input1, input2, and output must have at least size elements.
Template Parameters
Config- [optional] configuration of the primitive. It can be transform_config or a custom class with the same members.
InputIterator1- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
InputIterator2- random-access iterator type of the input range. Must meet the requirements of a C++ InputIterator concept. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. Must meet the requirements of a C++ OutputIterator concept. It can be a simple pointer type.
BinaryFunction- type of binary function used for transform.
Parameters
[in]input1- iterator to the first element in the 1st range to transform.
[in]input2- iterator to the first element in the 2nd range to transform.
[out]output- iterator to the first element in the output range.
[in]size- number of element in the input range.
[in]transform_op- binary operation function object that will be used for transform. The signature of the function should be equivalent to the following: U f(const T1& a, const T2& b);. The signature does not need to have const &, but function object must not modify the object passed to it.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced. Default value is false.
Example

In this example a device-level transform operation is performed on two arrays of integer values (element-wise sum is performed).

// custom transform function
[] __device__ (int a, int b) -> int
{
return a + b;
};
// Prepare input and output (declare pointers, allocate device memory etc.)
size_t size; // e.g., 8
int* input1; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int* input2; // e.g., [1, 2, 3, 4, 5, 6, 7, 8]
int* output; // empty array of 8 elements
// perform transform
input1, input2, output, input1.size(), transform_op
);
// output: [2, 4, 6, 8, 10, 12, 14, 16]

◆ unique()

template<class Config = default_config, class InputIterator , class OutputIterator , class UniqueCountOutputIterator , class EqualityOp = ::rocprim::equal_to<typename std::iterator_traits<InputIterator>::value_type>>
hipError_t unique ( void *  temporary_storage,
size_t &  storage_size,
InputIterator  input,
OutputIterator  output,
UniqueCountOutputIterator  unique_count_output,
const size_t  size,
EqualityOp  equality_op = EqualityOp(),
const hipStream_t  stream = 0,
const bool  debug_synchronous = false 
)
inline

Device-level parallel unique primitive.

From given input range unique primitive eliminates all but the first element from every consecutive group of equivalent elements and copies them into output.

Overview
  • Returns the required size of temporary_storage in storage_size if temporary_storage in a null pointer.
  • Range specified by input must have at least size elements.
  • Range specified by output must have at least so many elements, that all selected values can be copied into it.
  • Range specified by unique_count_output must have at least 1 element.
  • By default InputIterator::value_type's equality operator is used to check if elements are equivalent.
Template Parameters
InputIterator- random-access iterator type of the input range. It can be a simple pointer type.
OutputIterator- random-access iterator type of the output range. It can be a simple pointer type.
UniqueCountOutputIterator- random-access iterator type of the unique_count_output value used to return number of unique values. It can be a simple pointer type.
EqualityOp- type of an binary operator used to compare values for equality.
Parameters
[in]temporary_storage- pointer to a device-accessible temporary storage. When a null pointer is passed, the required allocation size (in bytes) is written to storage_size and function returns without performing the unique operation.
[in,out]storage_size- reference to a size (in bytes) of temporary_storage.
[in]input- iterator to the first element in the range to select values from.
[out]output- iterator to the first element in the output range.
[out]unique_count_output- iterator to the total number of selected values (length of output).
[in]size- number of element in the input range.
[in]equality_op- [optional] binary function object used to compare input values for equality. The signature of the function should be equivalent to the following: bool equal_to(const T &a, const T &b);. The signature does not need to have const &, but function object must not modify the object passed to it.
[in]stream- [optional] HIP stream object. The default is 0 (default stream).
[in]debug_synchronous- [optional] If true, synchronization after every kernel launch is forced in order to check for errors. The default value is false.
Example

In this example a device-level unique operation is performed on an array of integer values.

// Prepare input and output (declare pointers, allocate device memory etc.)
size_t input_size; // e.g., 8
int * input; // e.g., [1, 4, 2, 4, 4, 7, 7, 7]
int * output; // empty array of 8 elements
size_t * output_count; // empty array of 1 element
size_t temporary_storage_size_bytes;
void * temporary_storage_ptr = nullptr;
// Get required size of the temporary storage
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, output_count,
input_size
);
// allocate temporary storage
hipMalloc(&temporary_storage_ptr, temporary_storage_size_bytes);
// perform unique operation
temporary_storage_ptr, temporary_storage_size_bytes,
input, output, output_count,
input_size
);
// output: [1, 4, 2, 4, 7]
// output_count: 5