21 #ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_ 22 #define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_ 26 #include <type_traits> 29 #include "../config.hpp" 30 #include "../detail/various.hpp" 31 #include "../detail/radix_sort.hpp" 33 #include "../intrinsics.hpp" 34 #include "../functional.hpp" 35 #include "../types.hpp" 37 #include "../block/block_load.hpp" 38 #include "../iterator/counting_iterator.hpp" 39 #include "../iterator/reverse_iterator.hpp" 40 #include "detail/device_segmented_radix_sort.hpp" 41 #include "device_partition.hpp" 42 #include "device_segmented_radix_sort_config.hpp" 47 BEGIN_ROCPRIM_NAMESPACE
55 unsigned int BlockSize,
56 class KeysInputIterator,
57 class KeysOutputIterator,
58 class ValuesInputIterator,
59 class ValuesOutputIterator,
63 __launch_bounds__(BlockSize)
64 void segmented_sort_kernel(KeysInputIterator keys_input,
65 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
66 KeysOutputIterator keys_output,
67 ValuesInputIterator values_input,
68 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
69 ValuesOutputIterator values_output,
71 OffsetIterator begin_offsets,
72 OffsetIterator end_offsets,
73 unsigned int long_iterations,
74 unsigned int short_iterations,
75 unsigned int begin_bit,
78 segmented_sort<Config, Descending>(
79 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
81 begin_offsets, end_offsets,
82 long_iterations, short_iterations,
90 unsigned int BlockSize,
91 class KeysInputIterator,
92 class KeysOutputIterator,
93 class ValuesInputIterator,
94 class ValuesOutputIterator,
95 class SegmentIndexIterator,
99 __launch_bounds__(BlockSize)
100 void segmented_sort_large_kernel(KeysInputIterator keys_input,
101 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
102 KeysOutputIterator keys_output,
103 ValuesInputIterator values_input,
104 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
105 ValuesOutputIterator values_output,
107 SegmentIndexIterator segment_indices,
108 OffsetIterator begin_offsets,
109 OffsetIterator end_offsets,
110 unsigned int long_iterations,
111 unsigned int short_iterations,
112 unsigned int begin_bit,
113 unsigned int end_bit)
115 segmented_sort_large<Config, Descending>(
116 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
117 to_output, segment_indices,
118 begin_offsets, end_offsets,
119 long_iterations, short_iterations,
124 template<
class Config,
126 unsigned int BlockSize,
127 class KeysInputIterator,
128 class KeysOutputIterator,
129 class ValuesInputIterator,
130 class ValuesOutputIterator,
131 class SegmentIndexIterator,
132 class OffsetIterator>
133 ROCPRIM_KERNEL __launch_bounds__(BlockSize)
void segmented_sort_small_or_medium_kernel(
134 KeysInputIterator keys_input,
135 typename std::iterator_traits<KeysInputIterator>::value_type* keys_tmp,
136 KeysOutputIterator keys_output,
137 ValuesInputIterator values_input,
138 typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
139 ValuesOutputIterator values_output,
141 unsigned int num_segments,
142 SegmentIndexIterator segment_indices,
143 OffsetIterator begin_offsets,
144 OffsetIterator end_offsets,
145 unsigned int begin_bit,
146 unsigned int end_bit)
148 segmented_sort_small<Config, Descending>(
149 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
150 to_output, num_segments, segment_indices,
151 begin_offsets, end_offsets,
156 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ 158 auto _error = hipGetLastError(); \ 159 if(_error != hipSuccess) return _error; \ 160 if(debug_synchronous) \ 162 std::cout << name << "(" << size << ")"; \ 163 auto __error = hipStreamSynchronize(stream); \ 164 if(__error != hipSuccess) return __error; \ 165 auto _end = std::chrono::high_resolution_clock::now(); \ 166 auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \ 167 std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ 173 template<
typename InputIterator,
174 typename FirstOutputIterator,
175 typename SecondOutputIterator,
176 typename UnselectedOutputIterator,
177 typename SelectedCountOutputIterator,
178 typename FirstUnaryPredicate,
179 typename SecondUnaryPredicate>
180 hipError_t operator()(
void* temporary_storage,
181 size_t& storage_size,
183 FirstOutputIterator output_first_part,
184 SecondOutputIterator ,
185 UnselectedOutputIterator ,
186 SelectedCountOutputIterator selected_count_output,
188 FirstUnaryPredicate select_first_part_op,
189 SecondUnaryPredicate ,
190 const hipStream_t stream,
191 const bool debug_synchronous)
197 selected_count_output,
199 select_first_part_op,
207 template<
typename InputIterator,
208 typename FirstOutputIterator,
209 typename SecondOutputIterator,
210 typename UnselectedOutputIterator,
211 typename SelectedCountOutputIterator,
212 typename FirstUnaryPredicate,
213 typename SecondUnaryPredicate>
214 hipError_t operator()(
void* temporary_storage,
215 size_t& storage_size,
217 FirstOutputIterator output_first_part,
218 SecondOutputIterator output_second_part,
219 UnselectedOutputIterator output_unselected,
220 SelectedCountOutputIterator selected_count_output,
222 FirstUnaryPredicate select_first_part_op,
223 SecondUnaryPredicate select_second_part_op,
224 const hipStream_t stream,
225 const bool debug_synchronous)
233 selected_count_output,
235 select_first_part_op,
236 select_second_part_op,
245 class KeysInputIterator,
246 class KeysOutputIterator,
247 class ValuesInputIterator,
248 class ValuesOutputIterator,
252 hipError_t segmented_radix_sort_impl(
void * temporary_storage,
253 size_t& storage_size,
254 KeysInputIterator keys_input,
255 typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
256 KeysOutputIterator keys_output,
257 ValuesInputIterator values_input,
258 typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
259 ValuesOutputIterator values_output,
261 bool& is_result_in_output,
262 unsigned int segments,
263 OffsetIterator begin_offsets,
264 OffsetIterator end_offsets,
265 unsigned int begin_bit,
266 unsigned int end_bit,
268 bool debug_synchronous)
270 using key_type =
typename std::iterator_traits<KeysInputIterator>::value_type;
271 using value_type =
typename std::iterator_traits<ValuesInputIterator>::value_type;
272 using segment_index_type =
unsigned int;
276 std::is_same<key_type,
typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
277 "KeysInputIterator and KeysOutputIterator must have the same value_type" 280 std::is_same<value_type,
typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
281 "ValuesInputIterator and ValuesOutputIterator must have the same value_type" 284 using config = default_or_custom_config<
289 static constexpr
bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
290 static constexpr
bool partitioning_allowed =
291 !std::is_same<typename config::warp_sort_config, DisabledWarpSortConfig>::value;
292 static constexpr
unsigned int max_small_segment_length
293 = config::warp_sort_config::items_per_thread_small
294 * config::warp_sort_config::logical_warp_size_small;
295 static constexpr
unsigned int small_segments_per_block
296 = config::warp_sort_config::block_size_small
297 / config::warp_sort_config::logical_warp_size_small;
298 static constexpr
unsigned int max_medium_segment_length
299 = config::warp_sort_config::items_per_thread_medium
300 * config::warp_sort_config::logical_warp_size_medium;
301 static constexpr
unsigned int medium_segments_per_block
302 = config::warp_sort_config::block_size_medium
303 / config::warp_sort_config::logical_warp_size_medium;
305 max_small_segment_length <= max_medium_segment_length,
306 "The max length of small segments cannot be higher than the max length of medium segments");
308 static constexpr
bool three_way_partitioning
309 = max_small_segment_length < max_medium_segment_length;
310 using partitioner_type
311 = std::conditional_t<three_way_partitioning, ThreeWayPartitioner, TwoWayPartitioner>;
312 partitioner_type partitioner;
314 const auto large_segment_selector = [=](
const unsigned int segment_index)
mutable ->
bool 316 const unsigned int segment_length
317 = end_offsets[segment_index] - begin_offsets[segment_index];
318 return segment_length > max_medium_segment_length;
320 const auto medium_segment_selector = [=](
const unsigned int segment_index)
mutable ->
bool 322 const unsigned int segment_length = end_offsets[segment_index] - begin_offsets[segment_index];
323 return segment_length > max_small_segment_length;
326 const bool with_double_buffer = keys_tmp !=
nullptr;
327 const unsigned int bits = end_bit - begin_bit;
328 const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
329 const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
330 is_result_in_output = (iterations % 2 == 0) != to_output;
331 const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
332 const unsigned int short_iterations = radix_bits_diff != 0
333 ?
::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff)
335 const unsigned int long_iterations = iterations - short_iterations;
336 const bool do_partitioning = partitioning_allowed
337 && segments >= config::warp_sort_config::partitioning_threshold;
339 const size_t medium_segment_indices_size = three_way_partitioning ? segments : 0;
340 static constexpr
size_t segment_count_output_size = three_way_partitioning ? 2 : 1;
341 const size_t segment_count_output_bytes
342 = segment_count_output_size *
sizeof(segment_index_type);
344 segment_index_type* large_segment_indices_output{};
347 auto small_segment_indices_output
349 key_type* keys_tmp_storage;
350 value_type* values_tmp_storage;
351 segment_index_type* medium_segment_indices_output{};
352 segment_index_type* segment_count_output{};
353 size_t partition_storage_size{};
354 void* partition_temporary_storage{};
356 const auto partitioner_result = partitioner(
nullptr,
357 partition_storage_size,
358 segment_index_iterator{},
359 large_segment_indices_output,
360 medium_segment_indices_output,
361 small_segment_indices_output,
362 segment_count_output,
364 large_segment_selector,
365 medium_segment_selector,
368 if(hipSuccess != partitioner_result)
370 return partitioner_result;
376 detail::temp_storage::make_linear_partition(
378 detail::temp_storage::ptr_aligned_array(&large_segment_indices_output, segments),
379 detail::temp_storage::ptr_aligned_array(&medium_segment_indices_output,
380 medium_segment_indices_size),
381 detail::temp_storage::ptr_aligned_array(&segment_count_output,
382 segment_count_output_size),
383 detail::temp_storage::make_union_partition(
385 detail::temp_storage::make_partition(&partition_temporary_storage,
386 partition_storage_size),
388 detail::temp_storage::make_linear_partition(
389 detail::temp_storage::ptr_aligned_array(&keys_tmp_storage,
390 !with_double_buffer ? size : 0),
391 detail::temp_storage::ptr_aligned_array(
393 !with_double_buffer && with_values ? size : 0)))));
394 if(partition_result != hipSuccess || temporary_storage ==
nullptr)
396 return partition_result;
403 if(debug_synchronous)
405 std::cout <<
"begin_bit " << begin_bit <<
'\n';
406 std::cout <<
"end_bit " << end_bit <<
'\n';
407 std::cout <<
"bits " << bits <<
'\n';
408 std::cout <<
"segments " << segments <<
'\n';
409 std::cout <<
"radix_bits_diff " << radix_bits_diff <<
'\n';
410 std::cout <<
"storage_size " << storage_size <<
'\n';
411 std::cout <<
"iterations " << iterations <<
'\n';
412 std::cout <<
"long_iterations " << long_iterations <<
'\n';
413 std::cout <<
"short_iterations " << short_iterations <<
'\n';
414 std::cout <<
"do_partitioning " << do_partitioning <<
'\n';
415 std::cout <<
"config::sort::block_size: " << config::sort::block_size <<
'\n';
416 std::cout <<
"config::sort::items_per_thread: " << config::sort::items_per_thread <<
'\n';
417 hipError_t error = hipStreamSynchronize(stream);
418 if(error != hipSuccess)
return error;
421 if(!with_double_buffer)
423 keys_tmp = keys_tmp_storage;
424 values_tmp = values_tmp_storage;
430 hipError_t result = partitioner(partition_temporary_storage,
431 partition_storage_size,
432 segment_index_iterator{},
433 large_segment_indices_output,
434 medium_segment_indices_output,
435 small_segment_indices_output,
436 segment_count_output,
438 large_segment_selector,
439 medium_segment_selector,
442 if(hipSuccess != result)
446 segment_index_type segment_counts[segment_count_output_size]{};
448 segment_count_output,
449 segment_count_output_bytes,
450 hipMemcpyDeviceToHost,
452 if(hipSuccess != result)
456 const auto large_segment_count = segment_counts[0];
457 const auto medium_segment_count = three_way_partitioning ? segment_counts[1] : 0;
458 const auto small_segment_count = segments - large_segment_count - medium_segment_count;
459 if(debug_synchronous)
461 std::cout <<
"large_segment_count " << large_segment_count <<
'\n';
462 std::cout <<
"medium_segment_count " << medium_segment_count <<
'\n';
463 std::cout <<
"small_segment_count " << small_segment_count <<
'\n';
465 if(large_segment_count > 0)
467 std::chrono::high_resolution_clock::time_point start;
468 if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
470 HIP_KERNEL_NAME(segmented_sort_large_kernel<config, Descending, config::sort::block_size>),
471 dim3(large_segment_count), dim3(config::sort::block_size), 0, stream,
472 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
473 to_output, large_segment_indices_output,
474 begin_offsets, end_offsets,
475 long_iterations, short_iterations,
478 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:large_segments",
482 if(three_way_partitioning && medium_segment_count > 0)
484 const auto medium_segment_grid_size
485 = ::rocprim::detail::ceiling_div(medium_segment_count, medium_segments_per_block);
486 std::chrono::high_resolution_clock::time_point start;
487 if(debug_synchronous)
488 start = std::chrono::high_resolution_clock::now();
491 segmented_sort_small_or_medium_kernel<
492 select_warp_sort_helper_config_medium_t<typename config::warp_sort_config>,
494 config::warp_sort_config::block_size_medium>),
495 dim3(medium_segment_grid_size),
496 dim3(config::warp_sort_config::block_size_medium),
506 medium_segment_count,
507 medium_segment_indices_output,
512 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:medium_segments",
513 medium_segment_count,
516 if(small_segment_count > 0)
518 const auto small_segment_grid_size = ::rocprim::detail::ceiling_div(small_segment_count,
519 small_segments_per_block);
520 std::chrono::high_resolution_clock::time_point start;
521 if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
524 segmented_sort_small_or_medium_kernel<
525 select_warp_sort_helper_config_small_t<typename config::warp_sort_config>,
527 config::warp_sort_config::block_size_small>),
528 dim3(small_segment_grid_size),
529 dim3(config::warp_sort_config::block_size_small),
540 small_segment_indices_output,
545 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:small_segments",
552 std::chrono::high_resolution_clock::time_point start;
553 if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
555 HIP_KERNEL_NAME(segmented_sort_kernel<config, Descending, config::sort::block_size>),
556 dim3(segments), dim3(config::sort::block_size), 0, stream,
557 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
559 begin_offsets, end_offsets,
560 long_iterations, short_iterations,
563 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort", segments, start)
568 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR 662 class KeysInputIterator,
663 class KeysOutputIterator,
664 class OffsetIterator,
665 class Key =
typename std::iterator_traits<KeysInputIterator>::value_type
669 size_t& storage_size,
670 KeysInputIterator keys_input,
671 KeysOutputIterator keys_output,
673 unsigned int segments,
674 OffsetIterator begin_offsets,
675 OffsetIterator end_offsets,
676 unsigned int begin_bit = 0,
677 unsigned int end_bit = 8 *
sizeof(Key),
678 hipStream_t stream = 0,
679 bool debug_synchronous =
false)
683 return detail::segmented_radix_sort_impl<Config, false>(
684 temporary_storage, storage_size,
685 keys_input,
nullptr, keys_output,
686 values,
nullptr, values,
688 segments, begin_offsets, end_offsets,
690 stream, debug_synchronous
784 class KeysInputIterator,
785 class KeysOutputIterator,
786 class OffsetIterator,
787 class Key =
typename std::iterator_traits<KeysInputIterator>::value_type
791 size_t& storage_size,
792 KeysInputIterator keys_input,
793 KeysOutputIterator keys_output,
795 unsigned int segments,
796 OffsetIterator begin_offsets,
797 OffsetIterator end_offsets,
798 unsigned int begin_bit = 0,
799 unsigned int end_bit = 8 *
sizeof(Key),
800 hipStream_t stream = 0,
801 bool debug_synchronous =
false)
805 return detail::segmented_radix_sort_impl<Config, true>(
806 temporary_storage, storage_size,
807 keys_input,
nullptr, keys_output,
808 values,
nullptr, values,
810 segments, begin_offsets, end_offsets,
812 stream, debug_synchronous
922 class KeysInputIterator,
923 class KeysOutputIterator,
924 class ValuesInputIterator,
925 class ValuesOutputIterator,
926 class OffsetIterator,
927 class Key =
typename std::iterator_traits<KeysInputIterator>::value_type
931 size_t& storage_size,
932 KeysInputIterator keys_input,
933 KeysOutputIterator keys_output,
934 ValuesInputIterator values_input,
935 ValuesOutputIterator values_output,
937 unsigned int segments,
938 OffsetIterator begin_offsets,
939 OffsetIterator end_offsets,
940 unsigned int begin_bit = 0,
941 unsigned int end_bit = 8 *
sizeof(Key),
942 hipStream_t stream = 0,
943 bool debug_synchronous =
false)
946 return detail::segmented_radix_sort_impl<Config, false>(
947 temporary_storage, storage_size,
948 keys_input,
nullptr, keys_output,
949 values_input,
nullptr, values_output,
951 segments, begin_offsets, end_offsets,
953 stream, debug_synchronous
1059 class KeysInputIterator,
1060 class KeysOutputIterator,
1061 class ValuesInputIterator,
1062 class ValuesOutputIterator,
1063 class OffsetIterator,
1064 class Key =
typename std::iterator_traits<KeysInputIterator>::value_type
1068 size_t& storage_size,
1069 KeysInputIterator keys_input,
1070 KeysOutputIterator keys_output,
1071 ValuesInputIterator values_input,
1072 ValuesOutputIterator values_output,
1074 unsigned int segments,
1075 OffsetIterator begin_offsets,
1076 OffsetIterator end_offsets,
1077 unsigned int begin_bit = 0,
1078 unsigned int end_bit = 8 *
sizeof(Key),
1079 hipStream_t stream = 0,
1080 bool debug_synchronous =
false)
1083 return detail::segmented_radix_sort_impl<Config, true>(
1084 temporary_storage, storage_size,
1085 keys_input,
nullptr, keys_output,
1086 values_input,
nullptr, values_output,
1088 segments, begin_offsets, end_offsets,
1090 stream, debug_synchronous
1189 class OffsetIterator
1193 size_t& storage_size,
1196 unsigned int segments,
1197 OffsetIterator begin_offsets,
1198 OffsetIterator end_offsets,
1199 unsigned int begin_bit = 0,
1200 unsigned int end_bit = 8 *
sizeof(Key),
1201 hipStream_t stream = 0,
1202 bool debug_synchronous =
false)
1205 bool is_result_in_output;
1206 hipError_t error = detail::segmented_radix_sort_impl<Config, false>(
1207 temporary_storage, storage_size,
1209 values, values, values,
1210 size, is_result_in_output,
1211 segments, begin_offsets, end_offsets,
1213 stream, debug_synchronous
1215 if(temporary_storage !=
nullptr && is_result_in_output)
1317 class OffsetIterator
1321 size_t& storage_size,
1324 unsigned int segments,
1325 OffsetIterator begin_offsets,
1326 OffsetIterator end_offsets,
1327 unsigned int begin_bit = 0,
1328 unsigned int end_bit = 8 *
sizeof(Key),
1329 hipStream_t stream = 0,
1330 bool debug_synchronous =
false)
1333 bool is_result_in_output;
1334 hipError_t error = detail::segmented_radix_sort_impl<Config, true>(
1335 temporary_storage, storage_size,
1337 values, values, values,
1338 size, is_result_in_output,
1339 segments, begin_offsets, end_offsets,
1341 stream, debug_synchronous
1343 if(temporary_storage !=
nullptr && is_result_in_output)
1459 class OffsetIterator
1463 size_t& storage_size,
1467 unsigned int segments,
1468 OffsetIterator begin_offsets,
1469 OffsetIterator end_offsets,
1470 unsigned int begin_bit = 0,
1471 unsigned int end_bit = 8 *
sizeof(Key),
1472 hipStream_t stream = 0,
1473 bool debug_synchronous =
false)
1475 bool is_result_in_output;
1476 hipError_t error = detail::segmented_radix_sort_impl<Config, false>(
1477 temporary_storage, storage_size,
1480 size, is_result_in_output,
1481 segments, begin_offsets, end_offsets,
1483 stream, debug_synchronous
1485 if(temporary_storage !=
nullptr && is_result_in_output)
1596 class OffsetIterator
1600 size_t& storage_size,
1604 unsigned int segments,
1605 OffsetIterator begin_offsets,
1606 OffsetIterator end_offsets,
1607 unsigned int begin_bit = 0,
1608 unsigned int end_bit = 8 *
sizeof(Key),
1609 hipStream_t stream = 0,
1610 bool debug_synchronous =
false)
1612 bool is_result_in_output;
1613 hipError_t error = detail::segmented_radix_sort_impl<Config, true>(
1614 temporary_storage, storage_size,
1617 size, is_result_in_output,
1618 segments, begin_offsets, end_offsets,
1620 stream, debug_synchronous
1622 if(temporary_storage !=
nullptr && is_result_in_output)
1630 END_ROCPRIM_NAMESPACE
1635 #endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_ Empty type used as a placeholder, usually used to flag that given template parameter should not be us...
Definition: types.hpp:135
Definition: device_segmented_radix_sort_config.hpp:344
hipError_t segmented_radix_sort_pairs_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel descending radix sort-by-key primitive for device level.
Definition: device_segmented_radix_sort.hpp:1067
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags.
Definition: device_partition.hpp:721
hipError_t memcpy_and_sync(void *dst, const void *src, size_t size_bytes, hipMemcpyKind kind, hipStream_t stream)
Copy data from src to dest with stream ordering and synchronization.
Definition: various.hpp:286
This class provides an convenient way to do double buffering.
Definition: double_buffer.hpp:37
hipError_t segmented_radix_sort_keys_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel descending radix sort primitive for device level.
Definition: device_segmented_radix_sort.hpp:790
ROCPRIM_HOST_DEVICE reverse_iterator< SourceIterator > make_reverse_iterator(SourceIterator source_iterator)
make_reverse_iterator creates a reverse_iterator wrapping source_iterator.
Definition: reverse_iterator.hpp:204
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
hipError_t segmented_radix_sort_keys(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel ascending radix sort primitive for device level.
Definition: device_segmented_radix_sort.hpp:668
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
ROCPRIM_HOST_DEVICE T * current() const
Returns a pointer to the current buffer.
Definition: double_buffer.hpp:69
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: device_segmented_radix_sort.hpp:171
A random-access input (read-only) iterator over a sequence of consecutive integer values...
Definition: counting_iterator.hpp:51
hipError_t segmented_radix_sort_pairs(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel ascending radix sort-by-key primitive for device level.
Definition: device_segmented_radix_sort.hpp:930
Definition: device_segmented_radix_sort.hpp:205
hipError_t partition_three_way(void *temporary_storage, size_t &storage_size, InputIterator input, FirstOutputIterator output_first_part, SecondOutputIterator output_second_part, UnselectedOutputIterator output_unselected, SelectedCountOutputIterator selected_count_output, const size_t size, FirstUnaryPredicate select_first_part_op, SecondUnaryPredicate select_second_part_op, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using two selection predicates.
Definition: device_partition.hpp:1029
ROCPRIM_HOST_DEVICE T * alternate() const
Returns a pointer to the alternate buffer.
Definition: double_buffer.hpp:76
ROCPRIM_HOST_DEVICE void swap()
Swaps the current and alternate buffers.
Definition: double_buffer.hpp:83