21 #ifndef ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_    22 #define ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_    26 #include <type_traits>    29 #include "../config.hpp"    30 #include "../detail/various.hpp"    31 #include "../detail/radix_sort.hpp"    33 #include "../intrinsics.hpp"    34 #include "../functional.hpp"    35 #include "../types.hpp"    37 #include "../block/block_load.hpp"    38 #include "../iterator/counting_iterator.hpp"    39 #include "../iterator/reverse_iterator.hpp"    40 #include "detail/device_segmented_radix_sort.hpp"    41 #include "device_partition.hpp"    42 #include "device_segmented_radix_sort_config.hpp"    47 BEGIN_ROCPRIM_NAMESPACE
    55     unsigned int BlockSize,
    56     class KeysInputIterator,
    57     class KeysOutputIterator,
    58     class ValuesInputIterator,
    59     class ValuesOutputIterator,
    63 __launch_bounds__(BlockSize)
    64 void segmented_sort_kernel(KeysInputIterator keys_input,
    65                            typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
    66                            KeysOutputIterator keys_output,
    67                            ValuesInputIterator values_input,
    68                            typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
    69                            ValuesOutputIterator values_output,
    71                            OffsetIterator begin_offsets,
    72                            OffsetIterator end_offsets,
    73                            unsigned int long_iterations,
    74                            unsigned int short_iterations,
    75                            unsigned int begin_bit,
    78     segmented_sort<Config, Descending>(
    79         keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
    81         begin_offsets, end_offsets,
    82         long_iterations, short_iterations,
    90     unsigned int BlockSize,
    91     class KeysInputIterator,
    92     class KeysOutputIterator,
    93     class ValuesInputIterator,
    94     class ValuesOutputIterator,
    95     class SegmentIndexIterator,
    99 __launch_bounds__(BlockSize)
   100 void segmented_sort_large_kernel(KeysInputIterator keys_input,
   101                                  typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
   102                                  KeysOutputIterator keys_output,
   103                                  ValuesInputIterator values_input,
   104                                  typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
   105                                  ValuesOutputIterator values_output,
   107                                  SegmentIndexIterator segment_indices,
   108                                  OffsetIterator begin_offsets,
   109                                  OffsetIterator end_offsets,
   110                                  unsigned int long_iterations,
   111                                  unsigned int short_iterations,
   112                                  unsigned int begin_bit,
   113                                  unsigned int end_bit)
   115     segmented_sort_large<Config, Descending>(
   116         keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
   117         to_output, segment_indices,
   118         begin_offsets, end_offsets,
   119         long_iterations, short_iterations,
   124 template<
class Config,
   126          unsigned int BlockSize,
   127          class KeysInputIterator,
   128          class KeysOutputIterator,
   129          class ValuesInputIterator,
   130          class ValuesOutputIterator,
   131          class SegmentIndexIterator,
   132          class OffsetIterator>
   133 ROCPRIM_KERNEL __launch_bounds__(BlockSize) 
void segmented_sort_small_or_medium_kernel(
   134     KeysInputIterator                                               keys_input,
   135     typename std::iterator_traits<KeysInputIterator>::value_type*   keys_tmp,
   136     KeysOutputIterator                                              keys_output,
   137     ValuesInputIterator                                             values_input,
   138     typename std::iterator_traits<ValuesInputIterator>::value_type* values_tmp,
   139     ValuesOutputIterator                                            values_output,
   141     unsigned int                                                    num_segments,
   142     SegmentIndexIterator                                            segment_indices,
   143     OffsetIterator                                                  begin_offsets,
   144     OffsetIterator                                                  end_offsets,
   145     unsigned int                                                    begin_bit,
   146     unsigned int                                                    end_bit)
   148     segmented_sort_small<Config, Descending>(
   149         keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
   150         to_output, num_segments, segment_indices,
   151         begin_offsets, end_offsets,
   156 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \   158         auto _error = hipGetLastError(); \   159         if(_error != hipSuccess) return _error; \   160         if(debug_synchronous) \   162             std::cout << name << "(" << size << ")"; \   163             auto __error = hipStreamSynchronize(stream); \   164             if(__error != hipSuccess) return __error; \   165             auto _end = std::chrono::high_resolution_clock::now(); \   166             auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \   167             std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \   173     template<
typename InputIterator,
   174              typename FirstOutputIterator,
   175              typename SecondOutputIterator,
   176              typename UnselectedOutputIterator,
   177              typename SelectedCountOutputIterator,
   178              typename FirstUnaryPredicate,
   179              typename SecondUnaryPredicate>
   180     hipError_t operator()(
void*               temporary_storage,
   181                           size_t&             storage_size,
   183                           FirstOutputIterator output_first_part,
   184                           SecondOutputIterator ,
   185                           UnselectedOutputIterator ,
   186                           SelectedCountOutputIterator selected_count_output,
   188                           FirstUnaryPredicate         select_first_part_op,
   189                           SecondUnaryPredicate ,
   190                           const hipStream_t stream,
   191                           const bool        debug_synchronous)
   197                          selected_count_output,
   199                          select_first_part_op,
   207     template<
typename InputIterator,
   208              typename FirstOutputIterator,
   209              typename SecondOutputIterator,
   210              typename UnselectedOutputIterator,
   211              typename SelectedCountOutputIterator,
   212              typename FirstUnaryPredicate,
   213              typename SecondUnaryPredicate>
   214     hipError_t operator()(
void*                       temporary_storage,
   215                           size_t&                     storage_size,
   217                           FirstOutputIterator         output_first_part,
   218                           SecondOutputIterator        output_second_part,
   219                           UnselectedOutputIterator    output_unselected,
   220                           SelectedCountOutputIterator selected_count_output,
   222                           FirstUnaryPredicate         select_first_part_op,
   223                           SecondUnaryPredicate        select_second_part_op,
   224                           const hipStream_t           stream,
   225                           const bool                  debug_synchronous)
   233                                    selected_count_output,
   235                                    select_first_part_op,
   236                                    select_second_part_op,
   245     class KeysInputIterator,
   246     class KeysOutputIterator,
   247     class ValuesInputIterator,
   248     class ValuesOutputIterator,
   252 hipError_t segmented_radix_sort_impl(
void * temporary_storage,
   253                                      size_t& storage_size,
   254                                      KeysInputIterator keys_input,
   255                                      typename std::iterator_traits<KeysInputIterator>::value_type * keys_tmp,
   256                                      KeysOutputIterator keys_output,
   257                                      ValuesInputIterator values_input,
   258                                      typename std::iterator_traits<ValuesInputIterator>::value_type * values_tmp,
   259                                      ValuesOutputIterator values_output,
   261                                      bool& is_result_in_output,
   262                                      unsigned int segments,
   263                                      OffsetIterator begin_offsets,
   264                                      OffsetIterator end_offsets,
   265                                      unsigned int begin_bit,
   266                                      unsigned int end_bit,
   268                                      bool debug_synchronous)
   270     using key_type = 
typename std::iterator_traits<KeysInputIterator>::value_type;
   271     using value_type = 
typename std::iterator_traits<ValuesInputIterator>::value_type;
   272     using segment_index_type = 
unsigned int;
   276         std::is_same<key_type, 
typename std::iterator_traits<KeysOutputIterator>::value_type>::value,
   277         "KeysInputIterator and KeysOutputIterator must have the same value_type"   280         std::is_same<value_type, 
typename std::iterator_traits<ValuesOutputIterator>::value_type>::value,
   281         "ValuesInputIterator and ValuesOutputIterator must have the same value_type"   284     using config = default_or_custom_config<
   289     static constexpr 
bool with_values = !std::is_same<value_type, ::rocprim::empty_type>::value;
   290     static constexpr 
bool partitioning_allowed =
   291         !std::is_same<typename config::warp_sort_config, DisabledWarpSortConfig>::value;
   292     static constexpr 
unsigned int max_small_segment_length
   293         = config::warp_sort_config::items_per_thread_small
   294           * config::warp_sort_config::logical_warp_size_small;
   295     static constexpr 
unsigned int small_segments_per_block
   296         = config::warp_sort_config::block_size_small
   297           / config::warp_sort_config::logical_warp_size_small;
   298     static constexpr 
unsigned int max_medium_segment_length
   299         = config::warp_sort_config::items_per_thread_medium
   300           * config::warp_sort_config::logical_warp_size_medium;
   301     static constexpr 
unsigned int medium_segments_per_block
   302         = config::warp_sort_config::block_size_medium
   303           / config::warp_sort_config::logical_warp_size_medium;
   305         max_small_segment_length <= max_medium_segment_length,
   306         "The max length of small segments cannot be higher than the max length of medium segments");
   308     static constexpr 
bool three_way_partitioning
   309         = max_small_segment_length < max_medium_segment_length;
   310     using partitioner_type
   311         = std::conditional_t<three_way_partitioning, ThreeWayPartitioner, TwoWayPartitioner>;
   312     partitioner_type partitioner;
   314     const auto large_segment_selector = [=](
const unsigned int segment_index) 
mutable -> 
bool   316         const unsigned int segment_length
   317             = end_offsets[segment_index] - begin_offsets[segment_index];
   318         return segment_length > max_medium_segment_length;
   320     const auto medium_segment_selector = [=](
const unsigned int segment_index) 
mutable -> 
bool   322         const unsigned int segment_length = end_offsets[segment_index] - begin_offsets[segment_index];
   323         return segment_length > max_small_segment_length;
   326     const bool with_double_buffer = keys_tmp != 
nullptr;
   327     const unsigned int bits = end_bit - begin_bit;
   328     const unsigned int iterations = ::rocprim::detail::ceiling_div(bits, config::long_radix_bits);
   329     const bool to_output = with_double_buffer || (iterations - 1) % 2 == 0;
   330     is_result_in_output = (iterations % 2 == 0) != to_output;
   331     const unsigned int radix_bits_diff = config::long_radix_bits - config::short_radix_bits;
   332     const unsigned int short_iterations = radix_bits_diff != 0
   333         ? 
::rocprim::min(iterations, (config::long_radix_bits * iterations - bits) / radix_bits_diff)
   335     const unsigned int long_iterations = iterations - short_iterations;
   336     const bool do_partitioning = partitioning_allowed
   337         && segments >= config::warp_sort_config::partitioning_threshold;
   339     const size_t            medium_segment_indices_size = three_way_partitioning ? segments : 0;
   340     static constexpr 
size_t segment_count_output_size = three_way_partitioning ? 2 : 1;
   341     const size_t            segment_count_output_bytes
   342         = segment_count_output_size * 
sizeof(segment_index_type);
   344     segment_index_type* large_segment_indices_output{};
   347     auto small_segment_indices_output
   349     key_type*           keys_tmp_storage;
   350     value_type*         values_tmp_storage;
   351     segment_index_type* medium_segment_indices_output{};
   352     segment_index_type* segment_count_output{};
   353     size_t              partition_storage_size{};
   354     void*               partition_temporary_storage{};
   356     const auto partitioner_result = partitioner(
nullptr,
   357                                                 partition_storage_size,
   358                                                 segment_index_iterator{},
   359                                                 large_segment_indices_output,
   360                                                 medium_segment_indices_output,
   361                                                 small_segment_indices_output,
   362                                                 segment_count_output,
   364                                                 large_segment_selector,
   365                                                 medium_segment_selector,
   368     if(hipSuccess != partitioner_result)
   370         return partitioner_result;
   376         detail::temp_storage::make_linear_partition(
   378             detail::temp_storage::ptr_aligned_array(&large_segment_indices_output, segments),
   379             detail::temp_storage::ptr_aligned_array(&medium_segment_indices_output,
   380                                                     medium_segment_indices_size),
   381             detail::temp_storage::ptr_aligned_array(&segment_count_output,
   382                                                     segment_count_output_size),
   383             detail::temp_storage::make_union_partition(
   385                 detail::temp_storage::make_partition(&partition_temporary_storage,
   386                                                      partition_storage_size),
   388                 detail::temp_storage::make_linear_partition(
   389                     detail::temp_storage::ptr_aligned_array(&keys_tmp_storage,
   390                                                             !with_double_buffer ? size : 0),
   391                     detail::temp_storage::ptr_aligned_array(
   393                         !with_double_buffer && with_values ? size : 0)))));
   394     if(partition_result != hipSuccess || temporary_storage == 
nullptr)
   396         return partition_result;
   403     if(debug_synchronous)
   405         std::cout << 
"begin_bit " << begin_bit << 
'\n';
   406         std::cout << 
"end_bit " << end_bit << 
'\n';
   407         std::cout << 
"bits " << bits << 
'\n';
   408         std::cout << 
"segments " << segments << 
'\n';
   409         std::cout << 
"radix_bits_diff " << radix_bits_diff << 
'\n';
   410         std::cout << 
"storage_size " << storage_size << 
'\n';
   411         std::cout << 
"iterations " << iterations << 
'\n';
   412         std::cout << 
"long_iterations " << long_iterations << 
'\n';
   413         std::cout << 
"short_iterations " << short_iterations << 
'\n';
   414         std::cout << 
"do_partitioning " << do_partitioning << 
'\n';
   415         std::cout << 
"config::sort::block_size: " << config::sort::block_size << 
'\n';
   416         std::cout << 
"config::sort::items_per_thread: " << config::sort::items_per_thread << 
'\n';
   417         hipError_t error = hipStreamSynchronize(stream);
   418         if(error != hipSuccess) 
return error;
   421     if(!with_double_buffer)
   423         keys_tmp   = keys_tmp_storage;
   424         values_tmp = values_tmp_storage;
   430         hipError_t result = partitioner(partition_temporary_storage,
   431                                         partition_storage_size,
   432                                         segment_index_iterator{},
   433                                         large_segment_indices_output,
   434                                         medium_segment_indices_output,
   435                                         small_segment_indices_output,
   436                                         segment_count_output,
   438                                         large_segment_selector,
   439                                         medium_segment_selector,
   442         if(hipSuccess != result)
   446         segment_index_type segment_counts[segment_count_output_size]{};
   448                                          segment_count_output,
   449                                          segment_count_output_bytes,
   450                                          hipMemcpyDeviceToHost,
   452         if(hipSuccess != result)
   456         const auto large_segment_count  = segment_counts[0];
   457         const auto medium_segment_count = three_way_partitioning ? segment_counts[1] : 0;
   458         const auto small_segment_count  = segments - large_segment_count - medium_segment_count;
   459         if(debug_synchronous)
   461             std::cout << 
"large_segment_count " << large_segment_count << 
'\n';
   462             std::cout << 
"medium_segment_count " << medium_segment_count << 
'\n';
   463             std::cout << 
"small_segment_count " << small_segment_count << 
'\n';
   465         if(large_segment_count > 0)
   467             std::chrono::high_resolution_clock::time_point start;
   468             if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
   470                 HIP_KERNEL_NAME(segmented_sort_large_kernel<config, Descending, config::sort::block_size>),
   471                 dim3(large_segment_count), dim3(config::sort::block_size), 0, stream,
   472                 keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
   473                 to_output, large_segment_indices_output,
   474                 begin_offsets, end_offsets,
   475                 long_iterations, short_iterations,
   478             ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:large_segments",
   482         if(three_way_partitioning && medium_segment_count > 0)
   484             const auto medium_segment_grid_size
   485                 = ::rocprim::detail::ceiling_div(medium_segment_count, medium_segments_per_block);
   486             std::chrono::high_resolution_clock::time_point start;
   487             if(debug_synchronous)
   488                 start = std::chrono::high_resolution_clock::now();
   491                     segmented_sort_small_or_medium_kernel<
   492                         select_warp_sort_helper_config_medium_t<typename config::warp_sort_config>,
   494                         config::warp_sort_config::block_size_medium>),
   495                 dim3(medium_segment_grid_size),
   496                 dim3(config::warp_sort_config::block_size_medium),
   506                 medium_segment_count,
   507                 medium_segment_indices_output,
   512             ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:medium_segments",
   513                                                         medium_segment_count,
   516         if(small_segment_count > 0)
   518             const auto small_segment_grid_size = ::rocprim::detail::ceiling_div(small_segment_count,
   519                                                                                 small_segments_per_block);
   520             std::chrono::high_resolution_clock::time_point start;
   521             if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
   524                     segmented_sort_small_or_medium_kernel<
   525                         select_warp_sort_helper_config_small_t<typename config::warp_sort_config>,
   527                         config::warp_sort_config::block_size_small>),
   528                 dim3(small_segment_grid_size),
   529                 dim3(config::warp_sort_config::block_size_small),
   540                 small_segment_indices_output,
   545             ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort:small_segments",
   552         std::chrono::high_resolution_clock::time_point start;
   553         if(debug_synchronous) start = std::chrono::high_resolution_clock::now();
   555             HIP_KERNEL_NAME(segmented_sort_kernel<config, Descending, config::sort::block_size>),
   556             dim3(segments), dim3(config::sort::block_size), 0, stream,
   557             keys_input, keys_tmp, keys_output, values_input, values_tmp, values_output,
   559             begin_offsets, end_offsets,
   560             long_iterations, short_iterations,
   563         ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
"segmented_sort", segments, start)
   568 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR   662     class KeysInputIterator,
   663     class KeysOutputIterator,
   664     class OffsetIterator,
   665     class Key = 
typename std::iterator_traits<KeysInputIterator>::value_type
   669                                      size_t& storage_size,
   670                                      KeysInputIterator keys_input,
   671                                      KeysOutputIterator keys_output,
   673                                      unsigned int segments,
   674                                      OffsetIterator begin_offsets,
   675                                      OffsetIterator end_offsets,
   676                                      unsigned int begin_bit = 0,
   677                                      unsigned int end_bit = 8 * 
sizeof(Key),
   678                                      hipStream_t stream = 0,
   679                                      bool debug_synchronous = 
false)
   683     return detail::segmented_radix_sort_impl<Config, false>(
   684         temporary_storage, storage_size,
   685         keys_input, 
nullptr, keys_output,
   686         values, 
nullptr, values,
   688         segments, begin_offsets, end_offsets,
   690         stream, debug_synchronous
   784     class KeysInputIterator,
   785     class KeysOutputIterator,
   786     class OffsetIterator,
   787     class Key = 
typename std::iterator_traits<KeysInputIterator>::value_type
   791                                           size_t& storage_size,
   792                                           KeysInputIterator keys_input,
   793                                           KeysOutputIterator keys_output,
   795                                           unsigned int segments,
   796                                           OffsetIterator begin_offsets,
   797                                           OffsetIterator end_offsets,
   798                                           unsigned int begin_bit = 0,
   799                                           unsigned int end_bit = 8 * 
sizeof(Key),
   800                                           hipStream_t stream = 0,
   801                                           bool debug_synchronous = 
false)
   805     return detail::segmented_radix_sort_impl<Config, true>(
   806         temporary_storage, storage_size,
   807         keys_input, 
nullptr, keys_output,
   808         values, 
nullptr, values,
   810         segments, begin_offsets, end_offsets,
   812         stream, debug_synchronous
   922     class KeysInputIterator,
   923     class KeysOutputIterator,
   924     class ValuesInputIterator,
   925     class ValuesOutputIterator,
   926     class OffsetIterator,
   927     class Key = 
typename std::iterator_traits<KeysInputIterator>::value_type
   931                                       size_t& storage_size,
   932                                       KeysInputIterator keys_input,
   933                                       KeysOutputIterator keys_output,
   934                                       ValuesInputIterator values_input,
   935                                       ValuesOutputIterator values_output,
   937                                       unsigned int segments,
   938                                       OffsetIterator begin_offsets,
   939                                       OffsetIterator end_offsets,
   940                                       unsigned int begin_bit = 0,
   941                                       unsigned int end_bit = 8 * 
sizeof(Key),
   942                                       hipStream_t stream = 0,
   943                                       bool debug_synchronous = 
false)
   946     return detail::segmented_radix_sort_impl<Config, false>(
   947         temporary_storage, storage_size,
   948         keys_input, 
nullptr, keys_output,
   949         values_input, 
nullptr, values_output,
   951         segments, begin_offsets, end_offsets,
   953         stream, debug_synchronous
  1059     class KeysInputIterator,
  1060     class KeysOutputIterator,
  1061     class ValuesInputIterator,
  1062     class ValuesOutputIterator,
  1063     class OffsetIterator,
  1064     class Key = 
typename std::iterator_traits<KeysInputIterator>::value_type
  1068                                            size_t& storage_size,
  1069                                            KeysInputIterator keys_input,
  1070                                            KeysOutputIterator keys_output,
  1071                                            ValuesInputIterator values_input,
  1072                                            ValuesOutputIterator values_output,
  1074                                            unsigned int segments,
  1075                                            OffsetIterator begin_offsets,
  1076                                            OffsetIterator end_offsets,
  1077                                            unsigned int begin_bit = 0,
  1078                                            unsigned int end_bit = 8 * 
sizeof(Key),
  1079                                            hipStream_t stream = 0,
  1080                                            bool debug_synchronous = 
false)
  1083     return detail::segmented_radix_sort_impl<Config, true>(
  1084         temporary_storage, storage_size,
  1085         keys_input, 
nullptr, keys_output,
  1086         values_input, 
nullptr, values_output,
  1088         segments, begin_offsets, end_offsets,
  1090         stream, debug_synchronous
  1189     class OffsetIterator
  1193                                      size_t& storage_size,
  1196                                      unsigned int segments,
  1197                                      OffsetIterator begin_offsets,
  1198                                      OffsetIterator end_offsets,
  1199                                      unsigned int begin_bit = 0,
  1200                                      unsigned int end_bit = 8 * 
sizeof(Key),
  1201                                      hipStream_t stream = 0,
  1202                                      bool debug_synchronous = 
false)
  1205     bool is_result_in_output;
  1206     hipError_t error = detail::segmented_radix_sort_impl<Config, false>(
  1207         temporary_storage, storage_size,
  1209         values, values, values,
  1210         size, is_result_in_output,
  1211         segments, begin_offsets, end_offsets,
  1213         stream, debug_synchronous
  1215     if(temporary_storage != 
nullptr && is_result_in_output)
  1317     class OffsetIterator
  1321                                           size_t& storage_size,
  1324                                           unsigned int segments,
  1325                                           OffsetIterator begin_offsets,
  1326                                           OffsetIterator end_offsets,
  1327                                           unsigned int begin_bit = 0,
  1328                                           unsigned int end_bit = 8 * 
sizeof(Key),
  1329                                           hipStream_t stream = 0,
  1330                                           bool debug_synchronous = 
false)
  1333     bool is_result_in_output;
  1334     hipError_t error = detail::segmented_radix_sort_impl<Config, true>(
  1335         temporary_storage, storage_size,
  1337         values, values, values,
  1338         size, is_result_in_output,
  1339         segments, begin_offsets, end_offsets,
  1341         stream, debug_synchronous
  1343     if(temporary_storage != 
nullptr && is_result_in_output)
  1459     class OffsetIterator
  1463                                       size_t& storage_size,
  1467                                       unsigned int segments,
  1468                                       OffsetIterator begin_offsets,
  1469                                       OffsetIterator end_offsets,
  1470                                       unsigned int begin_bit = 0,
  1471                                       unsigned int end_bit = 8 * 
sizeof(Key),
  1472                                       hipStream_t stream = 0,
  1473                                       bool debug_synchronous = 
false)
  1475     bool is_result_in_output;
  1476     hipError_t error = detail::segmented_radix_sort_impl<Config, false>(
  1477         temporary_storage, storage_size,
  1480         size, is_result_in_output,
  1481         segments, begin_offsets, end_offsets,
  1483         stream, debug_synchronous
  1485     if(temporary_storage != 
nullptr && is_result_in_output)
  1596     class OffsetIterator
  1600                                            size_t& storage_size,
  1604                                            unsigned int segments,
  1605                                            OffsetIterator begin_offsets,
  1606                                            OffsetIterator end_offsets,
  1607                                            unsigned int begin_bit = 0,
  1608                                            unsigned int end_bit = 8 * 
sizeof(Key),
  1609                                            hipStream_t stream = 0,
  1610                                            bool debug_synchronous = 
false)
  1612     bool is_result_in_output;
  1613     hipError_t error = detail::segmented_radix_sort_impl<Config, true>(
  1614         temporary_storage, storage_size,
  1617         size, is_result_in_output,
  1618         segments, begin_offsets, end_offsets,
  1620         stream, debug_synchronous
  1622     if(temporary_storage != 
nullptr && is_result_in_output)
  1630 END_ROCPRIM_NAMESPACE
  1635 #endif // ROCPRIM_DEVICE_DEVICE_SEGMENTED_RADIX_SORT_HPP_ Empty type used as a placeholder, usually used to flag that given template parameter should not be us...
Definition: types.hpp:135
Definition: device_segmented_radix_sort_config.hpp:344
hipError_t segmented_radix_sort_pairs_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel descending radix sort-by-key primitive for device level. 
Definition: device_segmented_radix_sort.hpp:1067
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags. 
Definition: device_partition.hpp:721
hipError_t memcpy_and_sync(void *dst, const void *src, size_t size_bytes, hipMemcpyKind kind, hipStream_t stream)
Copy data from src to dest with stream ordering and synchronization. 
Definition: various.hpp:286
This class provides an convenient way to do double buffering. 
Definition: double_buffer.hpp:37
hipError_t segmented_radix_sort_keys_desc(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel descending radix sort primitive for device level. 
Definition: device_segmented_radix_sort.hpp:790
ROCPRIM_HOST_DEVICE reverse_iterator< SourceIterator > make_reverse_iterator(SourceIterator source_iterator)
make_reverse_iterator creates a reverse_iterator wrapping source_iterator. 
Definition: reverse_iterator.hpp:204
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
hipError_t segmented_radix_sort_keys(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel ascending radix sort primitive for device level. 
Definition: device_segmented_radix_sort.hpp:668
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments. 
Definition: functional.hpp:63
ROCPRIM_HOST_DEVICE T * current() const
Returns a pointer to the current buffer. 
Definition: double_buffer.hpp:69
Deprecated: Configuration of device-level scan primitives. 
Definition: block_histogram.hpp:62
Definition: device_segmented_radix_sort.hpp:171
A random-access input (read-only) iterator over a sequence of consecutive integer values...
Definition: counting_iterator.hpp:51
hipError_t segmented_radix_sort_pairs(void *temporary_storage, size_t &storage_size, KeysInputIterator keys_input, KeysOutputIterator keys_output, ValuesInputIterator values_input, ValuesOutputIterator values_output, unsigned int size, unsigned int segments, OffsetIterator begin_offsets, OffsetIterator end_offsets, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key), hipStream_t stream=0, bool debug_synchronous=false)
Parallel ascending radix sort-by-key primitive for device level. 
Definition: device_segmented_radix_sort.hpp:930
Definition: device_segmented_radix_sort.hpp:205
hipError_t partition_three_way(void *temporary_storage, size_t &storage_size, InputIterator input, FirstOutputIterator output_first_part, SecondOutputIterator output_second_part, UnselectedOutputIterator output_unselected, SelectedCountOutputIterator selected_count_output, const size_t size, FirstUnaryPredicate select_first_part_op, SecondUnaryPredicate select_second_part_op, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using two selection predicates. 
Definition: device_partition.hpp:1029
ROCPRIM_HOST_DEVICE T * alternate() const
Returns a pointer to the alternate buffer. 
Definition: double_buffer.hpp:76
ROCPRIM_HOST_DEVICE void swap()
Swaps the current and alternate buffers. 
Definition: double_buffer.hpp:83