21 #ifndef ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_ 22 #define ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_ 24 #include <type_traits> 26 #include "../config.hpp" 27 #include "../detail/various.hpp" 28 #include "../detail/radix_sort.hpp" 29 #include "../warp/detail/warp_scan_crosslane.hpp" 31 #include "../intrinsics.hpp" 32 #include "../functional.hpp" 33 #include "../types.hpp" 35 #include "block_exchange.hpp" 36 #include "block_radix_rank.hpp" 41 BEGIN_ROCPRIM_NAMESPACE
91 unsigned int BlockSizeX,
92 unsigned int ItemsPerThread,
94 unsigned int BlockSizeY = 1,
95 unsigned int BlockSizeZ = 1
99 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
100 static constexpr
bool with_values = !std::is_same<Value, empty_type>::value;
101 static constexpr
unsigned int radix_bits_per_pass = 4;
103 using bit_key_type = typename ::rocprim::detail::radix_key_codec<Key>::bit_key_type;
104 using block_rank_type = ::rocprim::block_radix_rank<BlockSizeX,
109 using bit_keys_exchange_type = ::rocprim::block_exchange<bit_key_type, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
110 using values_exchange_type
111 = ::rocprim::block_exchange<Value, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
116 typename bit_keys_exchange_type::storage_type bit_keys_exchange;
117 typename values_exchange_type::storage_type values_exchange;
118 typename block_rank_type::storage_type rank;
131 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 178 ROCPRIM_DEVICE ROCPRIM_INLINE
179 void sort(Key (&keys)[ItemsPerThread],
181 unsigned int begin_bit = 0,
182 unsigned int end_bit = 8 *
sizeof(Key))
185 sort_impl<false>(keys, values, storage, begin_bit, end_bit);
200 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
201 void sort(Key (&keys)[ItemsPerThread],
202 unsigned int begin_bit = 0,
203 unsigned int end_bit = 8 *
sizeof(Key))
206 sort(keys, storage, begin_bit, end_bit);
250 ROCPRIM_DEVICE ROCPRIM_INLINE
253 unsigned int begin_bit = 0,
254 unsigned int end_bit = 8 *
sizeof(Key))
257 sort_impl<true>(keys, values, storage, begin_bit, end_bit);
272 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
274 unsigned int begin_bit = 0,
275 unsigned int end_bit = 8 *
sizeof(Key))
278 sort_desc(keys, storage, begin_bit, end_bit);
330 template<
bool WithValues = with_values>
331 ROCPRIM_DEVICE ROCPRIM_INLINE
332 void sort(Key (&keys)[ItemsPerThread],
333 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
335 unsigned int begin_bit = 0,
336 unsigned int end_bit = 8 *
sizeof(Key))
338 sort_impl<false>(keys, values, storage, begin_bit, end_bit);
357 template<
bool WithValues = with_values>
358 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
359 void sort(Key (&keys)[ItemsPerThread],
360 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
361 unsigned int begin_bit = 0,
362 unsigned int end_bit = 8 *
sizeof(Key))
365 sort(keys, values, storage, begin_bit, end_bit);
417 template<
bool WithValues = with_values>
418 ROCPRIM_DEVICE ROCPRIM_INLINE
420 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
422 unsigned int begin_bit = 0,
423 unsigned int end_bit = 8 *
sizeof(Key))
425 sort_impl<true>(keys, values, storage, begin_bit, end_bit);
444 template<
bool WithValues = with_values>
445 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
447 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
448 unsigned int begin_bit = 0,
449 unsigned int end_bit = 8 *
sizeof(Key))
452 sort_desc(keys, values, storage, begin_bit, end_bit);
497 ROCPRIM_DEVICE ROCPRIM_INLINE
500 unsigned int begin_bit = 0,
501 unsigned int end_bit = 8 *
sizeof(Key))
504 sort_impl<false, true>(keys, values, storage, begin_bit, end_bit);
520 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
522 unsigned int begin_bit = 0,
523 unsigned int end_bit = 8 *
sizeof(Key))
571 ROCPRIM_DEVICE ROCPRIM_INLINE
574 unsigned int begin_bit = 0,
575 unsigned int end_bit = 8 *
sizeof(Key))
578 sort_impl<true, true>(keys, values, storage, begin_bit, end_bit);
594 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
596 unsigned int begin_bit = 0,
597 unsigned int end_bit = 8 *
sizeof(Key))
652 template<
bool WithValues = with_values>
653 ROCPRIM_DEVICE ROCPRIM_INLINE
655 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
657 unsigned int begin_bit = 0,
658 unsigned int end_bit = 8 *
sizeof(Key))
660 sort_impl<false, true>(keys, values, storage, begin_bit, end_bit);
677 template<
bool WithValues = with_values>
678 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
680 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
681 unsigned int begin_bit = 0,
682 unsigned int end_bit = 8 *
sizeof(Key))
737 template<
bool WithValues = with_values>
738 ROCPRIM_DEVICE ROCPRIM_INLINE
740 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
742 unsigned int begin_bit = 0,
743 unsigned int end_bit = 8 *
sizeof(Key))
745 sort_impl<true, true>(keys, values, storage, begin_bit, end_bit);
762 template<
bool WithValues = with_values>
763 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
765 typename std::enable_if<WithValues, Value>::type (&values)[ItemsPerThread],
766 unsigned int begin_bit = 0,
767 unsigned int end_bit = 8 *
sizeof(Key))
775 template<
bool Descending,
bool ToStriped = false,
class SortedValue>
776 ROCPRIM_DEVICE ROCPRIM_INLINE
777 void sort_impl(Key (&keys)[ItemsPerThread],
778 SortedValue (&values)[ItemsPerThread],
780 unsigned int begin_bit,
781 unsigned int end_bit)
783 using key_codec = ::rocprim::detail::radix_key_codec<Key, Descending>;
785 bit_key_type bit_keys[ItemsPerThread];
787 for(
unsigned int i = 0; i < ItemsPerThread; i++)
789 bit_keys[i] = key_codec::encode(keys[i]);
794 const int pass_bits =
min(radix_bits_per_pass, end_bit - begin_bit);
796 unsigned int ranks[ItemsPerThread];
797 block_rank_type().rank_keys(
801 [begin_bit, pass_bits](
const bit_key_type& key)
802 {
return key_codec::extract_digit(key, begin_bit, pass_bits); });
803 begin_bit += radix_bits_per_pass;
805 exchange_keys(storage, bit_keys, ranks);
806 exchange_values(storage, values, ranks);
808 if(begin_bit >= end_bit)
815 if ROCPRIM_IF_CONSTEXPR(ToStriped)
817 to_striped_keys(storage, bit_keys);
818 to_striped_values(storage, values);
822 for(
unsigned int i = 0; i < ItemsPerThread; i++)
824 keys[i] = key_codec::decode(bit_keys[i]);
828 ROCPRIM_DEVICE ROCPRIM_INLINE
830 bit_key_type (&bit_keys)[ItemsPerThread],
831 const unsigned int (&ranks)[ItemsPerThread])
833 storage_type_& storage_ = storage.get();
835 bit_keys_exchange_type().scatter_to_blocked(bit_keys, bit_keys, ranks, storage_.bit_keys_exchange);
838 template<
class SortedValue>
839 ROCPRIM_DEVICE ROCPRIM_INLINE
841 SortedValue (&values)[ItemsPerThread],
842 const unsigned int (&ranks)[ItemsPerThread])
844 storage_type_& storage_ = storage.get();
846 values_exchange_type().scatter_to_blocked(values, values, ranks, storage_.values_exchange);
849 ROCPRIM_DEVICE ROCPRIM_INLINE
852 const unsigned int (&ranks)[ItemsPerThread])
859 ROCPRIM_DEVICE ROCPRIM_INLINE
861 bit_key_type (&bit_keys)[ItemsPerThread])
863 storage_type_& storage_ = storage.get();
865 bit_keys_exchange_type().blocked_to_striped(bit_keys, bit_keys, storage_.bit_keys_exchange);
868 template<
class SortedValue>
869 ROCPRIM_DEVICE ROCPRIM_INLINE
871 SortedValue (&values)[ItemsPerThread])
873 storage_type_& storage_ = storage.get();
875 values_exchange_type().blocked_to_striped(values, values, storage_.values_exchange);
878 ROCPRIM_DEVICE ROCPRIM_INLINE
887 END_ROCPRIM_NAMESPACE
892 #endif // ROCPRIM_BLOCK_BLOCK_RADIX_SORT_HPP_ Empty type used as a placeholder, usually used to flag that given template parameter should not be us...
Definition: types.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_desc_to_striped(Key(&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs descending radix sort over keys partitioned across threads in a block, results are saved in ...
Definition: block_radix_sort.hpp:572
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_to_striped(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:679
The basic block radix rank algorithm, configured to memoize intermediate values.
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_to_striped(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs ascending radix sort over key-value pairs partitioned across threads in a block...
Definition: block_radix_sort.hpp:654
ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs ascending radix sort over key-value pairs partitioned across threads in a block...
Definition: block_radix_sort.hpp:332
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
The block_radix_sort class is a block level parallel primitive which provides methods for sorting of ...
Definition: block_radix_sort.hpp:97
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_desc_to_striped(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs descending radix sort over key-value pairs partitioned across threads in a block...
Definition: block_radix_sort.hpp:739
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_desc(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:446
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_to_striped(Key(&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs ascending radix sort over keys partitioned across threads in a block, results are saved in a...
Definition: block_radix_sort.hpp:498
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key(&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs ascending radix sort over keys partitioned across threads in a block.
Definition: block_radix_sort.hpp:179
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_desc(Key(&keys)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs descending radix sort over keys partitioned across threads in a block.
Definition: block_radix_sort.hpp:251
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort(Key(&keys)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:201
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_desc_to_striped(Key(&keys)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:595
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_to_striped(Key(&keys)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:521
ROCPRIM_DEVICE ROCPRIM_INLINE void sort_desc(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], storage_type &storage, unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
Performs descending radix sort over key-value pairs partitioned across threads in a block...
Definition: block_radix_sort.hpp:419
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:359
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_desc(Key(&keys)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:273
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void sort_desc_to_striped(Key(&keys)[ItemsPerThread], typename std::enable_if< WithValues, Value >::type(&values)[ItemsPerThread], unsigned int begin_bit=0, unsigned int end_bit=8 *sizeof(Key))
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: block_radix_sort.hpp:764