rocPRIM
warp_sort_stable.hpp
1 // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE
20 
21 #ifndef ROCPRIM_WARP_DETAIL_WARP_SORT_STABLE_HPP_
22 #define ROCPRIM_WARP_DETAIL_WARP_SORT_STABLE_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../functional.hpp"
30 #include "../../intrinsics.hpp"
31 
32 BEGIN_ROCPRIM_NAMESPACE
33 
34 namespace detail
35 {
36 
37 template<typename Key,
38  unsigned int BlockSize,
39  unsigned int WarpSize,
40  unsigned int ItemsPerThread,
41  typename Value>
43 {
44 private:
45  constexpr static unsigned int items_per_block = BlockSize * ItemsPerThread;
46  constexpr static bool with_values = !std::is_same<Value, rocprim::empty_type>::value;
47 
48  struct storage_type_keys
49  {
50  Key keys[items_per_block];
51  };
52 
53  struct storage_type_keys_values
54  {
55  Key keys[items_per_block];
56  Value values[items_per_block];
57  };
58 
59  using storage_type_
60  = std::conditional_t<with_values, storage_type_keys_values, storage_type_keys>;
61 
63  template<bool is_incomplete, typename CompareFunction>
64  ROCPRIM_DEVICE ROCPRIM_INLINE void thread_sort(Key (&thread_keys)[ItemsPerThread],
65  CompareFunction compare_function,
66  const unsigned int input_size = items_per_block)
67  {
68  const auto thread_offset = rocprim::flat_block_thread_id() * ItemsPerThread;
69  const auto thread_input_size = thread_offset > input_size ? 0 : input_size - thread_offset;
70 
71  ROCPRIM_UNROLL
72  for(auto i = 0u; i < ItemsPerThread; ++i)
73  {
74  ROCPRIM_UNROLL
75  for(auto j = i & 1u; j < ItemsPerThread - 1u; j += 2u)
76  {
77  if(j + 1 < thread_input_size
78  && compare_function(thread_keys[j + 1], thread_keys[j]))
79  {
80  ::rocprim::swap(thread_keys[j + 1], thread_keys[j]);
81  }
82  }
83  }
84  }
85 
87  template<bool is_incomplete, typename CompareFunction>
88  ROCPRIM_DEVICE ROCPRIM_INLINE void thread_sort(Key (&thread_keys)[ItemsPerThread],
89  Value (&thread_values)[ItemsPerThread],
90  CompareFunction compare_function,
91  const unsigned int input_size = items_per_block)
92  {
93  const auto thread_offset = rocprim::flat_block_thread_id() * ItemsPerThread;
94  const auto thread_input_size = thread_offset > input_size ? 0 : input_size - thread_offset;
95 
96  ROCPRIM_UNROLL
97  for(auto i = 0u; i < ItemsPerThread; ++i)
98  {
99  ROCPRIM_UNROLL
100  for(auto j = i & 1u; j < ItemsPerThread - 1u; j += 2u)
101  {
102  if(j + 1 < thread_input_size
103  && compare_function(thread_keys[j + 1], thread_keys[j]))
104  {
105  ::rocprim::swap(thread_keys[j + 1], thread_keys[j]);
106  ::rocprim::swap(thread_values[j + 1], thread_values[j]);
107  }
108  }
109  }
110  }
111 
112  template<bool is_incomplete, class BinaryFunction>
113  ROCPRIM_DEVICE ROCPRIM_INLINE void merge_path_merge(Key (&thread_keys)[ItemsPerThread],
114  storage_type_& storage,
115  BinaryFunction compare_function,
116  const unsigned int input_size
117  = items_per_block)
118  {
119  const auto lane = lane_id();
120  const auto warp = warp_id();
121 
122  const auto warp_offset = warp * ItemsPerThread * device_warp_size();
123  const auto warp_input_size = warp_offset > input_size ? 0 : input_size - warp_offset;
124  const auto shared_keys = &storage.keys[warp_offset];
125 
126  ROCPRIM_UNROLL
127  for(auto partition_size = 1u; partition_size < WarpSize; partition_size <<= 1u)
128  {
129  ROCPRIM_UNROLL
130  for(auto i = 0u; i < ItemsPerThread; ++i)
131  {
132  shared_keys[ItemsPerThread * lane + i] = thread_keys[i];
133  }
134 
135  wave_barrier();
136 
137  const auto size = partition_size * ItemsPerThread;
138  const auto mask = (partition_size * 2) - 1;
139 
140  const auto start = lane & ~mask;
141  const auto keys1_begin = start * ItemsPerThread;
142  const auto keys1_end = std::min(keys1_begin + size, warp_input_size);
143  const auto keys2_begin = keys1_end;
144  const auto keys2_end = std::min(keys2_begin + size, warp_input_size);
145 
146  const auto diag = std::min(ItemsPerThread * (mask & lane), warp_input_size);
147  const auto partition = merge_path(&shared_keys[keys1_begin],
148  &shared_keys[keys2_begin],
149  keys1_end - keys1_begin,
150  keys2_end - keys2_begin,
151  diag,
152  compare_function);
153 
154  const auto keys1_merge_begin = keys1_begin + partition;
155  const auto keys2_merge_begin = keys2_begin + diag - partition;
156 
157  const range_t range = {
158  keys1_merge_begin,
159  keys1_end,
160  keys2_merge_begin,
161  keys2_end,
162  };
163 
164  serial_merge(shared_keys, thread_keys, range, compare_function);
165 
166  wave_barrier();
167  }
168  }
169 
170  template<bool is_incomplete, class BinaryFunction>
171  ROCPRIM_DEVICE ROCPRIM_INLINE void merge_path_merge(Key (&thread_keys)[ItemsPerThread],
172  Value (&thread_values)[ItemsPerThread],
173  storage_type_& storage,
174  BinaryFunction compare_function,
175  const unsigned int input_size
176  = items_per_block)
177  {
178  const auto lane = lane_id();
179  const auto warp = warp_id();
180 
181  const auto warp_offset = warp * ItemsPerThread * device_warp_size();
182  const auto warp_input_size = warp_offset > input_size ? 0 : input_size - warp_offset;
183  const auto shared_keys = &storage.keys[warp_offset];
184  const auto shared_values = &storage.values[warp_offset];
185 
186  ROCPRIM_UNROLL
187  for(auto partition_size = 1u; partition_size < WarpSize; partition_size <<= 1u)
188  {
189  ROCPRIM_UNROLL
190  for(auto i = 0u; i < ItemsPerThread; ++i)
191  {
192  shared_keys[ItemsPerThread * lane + i] = thread_keys[i];
193  shared_values[ItemsPerThread * lane + i] = thread_values[i];
194  }
195 
196  wave_barrier();
197 
198  const auto size = partition_size * ItemsPerThread;
199  const auto mask = (partition_size * 2) - 1;
200 
201  const auto start = lane & ~mask;
202  const auto keys1_begin = start * ItemsPerThread;
203  const auto keys1_end = std::min(keys1_begin + size, warp_input_size);
204  const auto keys2_begin = keys1_end;
205  const auto keys2_end = std::min(keys2_begin + size, warp_input_size);
206 
207  const auto diag = std::min(ItemsPerThread * (mask & lane), warp_input_size);
208  const auto partition = merge_path(&shared_keys[keys1_begin],
209  &shared_keys[keys2_begin],
210  keys1_end - keys1_begin,
211  keys2_end - keys2_begin,
212  diag,
213  compare_function);
214 
215  const auto keys1_merge_begin = keys1_begin + partition;
216  const auto keys2_merge_begin = keys2_begin + diag - partition;
217 
218  const range_t range = {
219  keys1_merge_begin,
220  keys1_end,
221  keys2_merge_begin,
222  keys2_end,
223  };
224 
225  serial_merge(shared_keys,
226  thread_keys,
227  shared_values,
228  thread_values,
229  range,
230  compare_function);
231 
232  wave_barrier();
233  }
234  }
235 
236 public:
237  static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2");
238 
240 
241  template<class BinaryFunction>
242  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key& thread_key, BinaryFunction compare_function)
243  {
244  ROCPRIM_SHARED_MEMORY storage_type storage;
245  sort(thread_key, storage, compare_function);
246  }
247 
248  template<class BinaryFunction>
249  ROCPRIM_DEVICE ROCPRIM_INLINE void
250  sort(Key& thread_key, storage_type& storage, BinaryFunction compare_function)
251  {
252  Key thread_keys[] = {thread_key};
253  sort(thread_keys, storage, compare_function);
254  }
255 
256  template<class BinaryFunction>
257  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
258  BinaryFunction compare_function)
259  {
260  ROCPRIM_SHARED_MEMORY storage_type storage;
261  sort(thread_keys, storage, compare_function);
262  }
263 
264  template<class BinaryFunction>
265  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
266  storage_type& storage,
267  BinaryFunction compare_function)
268  {
269  thread_sort<false>(thread_keys, compare_function);
270 
271  merge_path_merge<false>(thread_keys, storage.get(), compare_function);
272  syncthreads();
273  }
274 
275  template<class BinaryFunction, class V = Value>
276  ROCPRIM_DEVICE ROCPRIM_INLINE void
277  sort(Key& thread_key, Value& thread_value, BinaryFunction compare_function)
278  {
279  Key thread_keys[] = {thread_key};
280  Value thread_values[] = {thread_value};
281  sort(thread_keys, thread_values, compare_function);
282  }
283 
284  template<class BinaryFunction>
285  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key& thread_key,
286  Value& thread_value,
287  storage_type& storage,
288  BinaryFunction compare_function)
289  {
290  Key thread_keys[] = {thread_key};
291  Value thread_values[] = {thread_value};
292  sort(thread_keys, thread_values, storage, compare_function);
293  }
294 
295  template<class BinaryFunction>
296  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
297  Value (&thread_values)[ItemsPerThread],
298  BinaryFunction compare_function)
299  {
300  ROCPRIM_SHARED_MEMORY storage_type storage;
301  sort(thread_keys, thread_values, storage, compare_function);
302  }
303 
304  template<class BinaryFunction>
305  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
306  storage_type& storage,
307  const unsigned int input_size,
308  BinaryFunction compare_function)
309  {
310  thread_sort<true>(thread_keys, compare_function, input_size);
311 
312  merge_path_merge<true>(thread_keys, storage.get(), compare_function, input_size);
313 
314  syncthreads();
315  }
316 
317  template<class BinaryFunction>
318  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
319  Value (&thread_values)[ItemsPerThread],
320  storage_type& storage,
321  BinaryFunction compare_function)
322  {
323  thread_sort<false>(thread_keys, thread_values, compare_function);
324 
325  merge_path_merge<false>(thread_keys, thread_values, storage.get(), compare_function);
326  syncthreads();
327  }
328 
329  template<class BinaryFunction>
330  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[ItemsPerThread],
331  Value (&thread_values)[ItemsPerThread],
332  storage_type& storage,
333  const unsigned int input_size,
334  BinaryFunction compare_function)
335  {
336  thread_sort<true>(thread_keys, thread_values, compare_function, input_size);
337 
338  merge_path_merge<true>(thread_keys,
339  thread_values,
340  storage.get(),
341  compare_function,
342  input_size);
343 
344  syncthreads();
345  }
346 };
347 
348 template<typename Key, unsigned int BlockSize, unsigned int WarpSize, typename Value>
349 class warp_sort_stable<Key, BlockSize, WarpSize, 1, Value>
350 {
351 private:
352  constexpr static unsigned items_per_thread = 1;
364  template<bool is_incomplete, typename BinaryFunction>
365  ROCPRIM_DEVICE ROCPRIM_INLINE int merge_rank(const unsigned int m,
366  Key& thread_key,
367  BinaryFunction compare_function,
368  const unsigned int valid_items = BlockSize)
369  {
370  // The thread's index in the current warp.
371  const auto lane = lane_id();
372  // The size of each merged subsequence.
373  const auto n = m * 2;
374  // The thread's index in its (merged) subsequence.
375  const auto index = lane % n;
376  // Whether the thread is in the lower- or upper-half of the merged range.
377  const auto is_lower = index < m;
378  // The starting offset of the (merged) subsequence that the thread is in.
379  const auto base = lane - index;
380 
381  // The starting index of the to-be-searched subsequence of elements. If in the lower
382  // half, this points to the first element of the upper half, and vice versa.
383  auto begin = base + (is_lower ? m : 0);
384  // The past-ending index of the to-be-searched subsequence of elements.
385  auto end = begin + m;
386 
387  // Note: we cannot use a while loop here because all threads need to be active during the
388  // shuffle.
389  ROCPRIM_UNROLL
390  for(auto i = 1u; i <= m; i <<= 1u)
391  {
392  const auto mid = (begin + end) / 2;
393  // Swap keys if in the lower half to eliminate a more expensive divergent branch in the comparator.
394  // Note: this needs to be done in order to achieve stability, in the left subsequence we want the index
395  // to be before any equal elements, but in the right subsequence it must be after.
396  auto key_a = thread_key;
397  auto key_b = warp_shuffle(thread_key, mid);
398  if(is_lower)
399  ::rocprim::swap(key_a, key_b);
400 
401  const auto mid_smaller = ((!is_incomplete || (lane < valid_items && mid < valid_items))
402  && compare_function(key_a, key_b))
403  == is_lower;
404 
405  if(mid_smaller && begin != end)
406  begin = mid + 1;
407  else
408  end = mid;
409  }
410 
411  // The rank of an item in the merged sequence is given by
412  // rank(merged) = rank(left) + rank(right).
413  // The rank in one of the subsequences is given by `begin`, and the other is given by `index`.
414  // Note that for the left subsequence `begin` is offset by `m`, and for the right subsequence
415  // `index` is offset by `m`. Subtracting `m` for the result this gives the correct final rank.
416  return index + begin - m;
417  }
418 
419 public:
420  static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2");
421 
423 
424  template<class BinaryFunction>
425  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key& thread_key, BinaryFunction compare_function)
426  {
427  ROCPRIM_UNROLL
428  for(auto i = 1u; i < WarpSize; i <<= 1u)
429  {
430  const auto thread_rank = merge_rank<false>(i, thread_key, compare_function);
431  thread_key = warp_permute(thread_key, thread_rank);
432  }
433  }
434 
435  template<class BinaryFunction>
436  ROCPRIM_DEVICE ROCPRIM_INLINE void
437  sort(Key& thread_key, storage_type& storage, BinaryFunction compare_function)
438  {
439  (void)storage;
440  sort(thread_key, compare_function);
441  }
442 
443  template<class BinaryFunction>
444  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
445  BinaryFunction compare_function)
446  {
447  sort(thread_keys[0], compare_function);
448  }
449 
450  template<class BinaryFunction>
451  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
452  storage_type& storage,
453  BinaryFunction compare_function)
454  {
455  sort(thread_keys[0], storage, compare_function);
456  }
457 
458  template<class BinaryFunction>
459  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
460  storage_type& storage,
461  const unsigned int input_size,
462  BinaryFunction compare_function)
463  {
464  sort(thread_keys[0], storage, input_size, compare_function);
465  }
466 
467  template<class BinaryFunction>
468  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key& thread_key,
469  storage_type& storage,
470  const unsigned int input_size,
471  BinaryFunction compare_function)
472  {
473  (void)storage;
474 
475  const auto warp_offset = warp_id() * device_warp_size();
476  const auto warp_input_size = warp_offset > input_size ? 0 : input_size - warp_offset;
477 
478  ROCPRIM_UNROLL
479  for(auto i = 1u; i < WarpSize; i <<= 1u)
480  {
481  const auto thread_rank
482  = merge_rank<true>(i, thread_key, compare_function, warp_input_size);
483  thread_key = warp_permute(thread_key, thread_rank);
484  }
485  }
486 
487  template<class BinaryFunction, class V = Value>
488  ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if<(sizeof(V) <= sizeof(int))>::type
489  sort(Key& thread_key, V& thread_value, BinaryFunction compare_function)
490  {
491  ROCPRIM_UNROLL
492  for(auto i = 1u; i < WarpSize; i <<= 1u)
493  {
494  const auto thread_rank = merge_rank<false>(i, thread_key, compare_function);
495  thread_key = warp_permute(thread_key, thread_rank);
496  thread_value = warp_permute(thread_value, thread_rank);
497  }
498  }
499 
500  template<class BinaryFunction, class V = Value>
501  ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if<!(sizeof(V) <= sizeof(int))>::type
502  sort(Key& thread_key, V& thread_value, BinaryFunction compare_function)
503  {
504  // Use indices to reduce the amount of permutations.
505  auto value_index = lane_id();
506  sort(thread_key, value_index, compare_function);
507  // Perform a shuffle to get the final value.
508  thread_value = warp_shuffle(thread_value, value_index);
509  }
510 
511  template<class BinaryFunction>
512  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key& thread_key,
513  Value& thread_value,
514  storage_type& storage,
515  BinaryFunction compare_function)
516  {
517  (void)storage;
518  sort(compare_function, thread_key, thread_value);
519  }
520 
521  template<class BinaryFunction>
522  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
523  Value (&thread_values)[items_per_thread],
524  BinaryFunction compare_function)
525  {
526  sort(thread_keys[0], thread_values[0], compare_function);
527  }
528 
529  template<class BinaryFunction>
530  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
531  Value (&thread_values)[items_per_thread],
532  storage_type& storage,
533  BinaryFunction compare_function)
534  {
535  (void)storage;
536  sort(thread_keys[0], thread_values[0], compare_function);
537  }
538 
539  template<class BinaryFunction>
540  ROCPRIM_DEVICE ROCPRIM_INLINE void sort(Key (&thread_keys)[items_per_thread],
541  Value (&thread_values)[items_per_thread],
542  storage_type& storage,
543  unsigned int input_size,
544  BinaryFunction compare_function)
545  {
546  (void)storage;
547  sort(thread_keys[0], thread_values[0], storage, input_size, compare_function);
548  }
549 
550  template<class BinaryFunction, typename V = Value>
551  ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if<(sizeof(V) <= sizeof(int))>::type
552  sort(Key& thread_key,
553  V& thread_value,
554  storage_type& storage,
555  unsigned int input_size,
556  BinaryFunction compare_function)
557  {
558  (void)storage;
559 
560  const auto warp_offset = warp_id() * device_warp_size();
561  const auto warp_input_size = warp_offset > input_size ? 0 : input_size - warp_offset;
562 
563  ROCPRIM_UNROLL
564  for(auto i = 1u; i < WarpSize; i <<= 1u)
565  {
566  const auto thread_rank
567  = merge_rank<true>(i, thread_key, compare_function, warp_input_size);
568  thread_key = warp_permute(thread_key, thread_rank);
569  thread_value = warp_permute(thread_value, thread_rank);
570  }
571  }
572 
573  template<class BinaryFunction, typename V = Value>
574  ROCPRIM_DEVICE ROCPRIM_INLINE typename std::enable_if<!(sizeof(V) <= sizeof(int))>::type
575  sort(Key& thread_key,
576  V& thread_value,
577  storage_type& storage,
578  unsigned int input_size,
579  BinaryFunction compare_function)
580  {
581  // Use indices to reduce the amount of permutations.
582  auto value_index = lane_id();
583  sort(thread_key, value_index, storage, input_size, compare_function);
584  // Perform a shuffle to get the final value.
585  thread_value = warp_shuffle(thread_value, value_index);
586  }
587 };
588 
589 } // end namespace detail
590 
591 END_ROCPRIM_NAMESPACE
592 
593 #endif // ROCPRIM_WARP_DETAIL_WARP_SORT_STABLE_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int flat_block_thread_id()
Returns flat (linear, 1D) thread identifier in a multidimensional block (tile).
Definition: thread.hpp:106
Definition: warp_sort_stable.hpp:42
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_permute(const T &input, const int dst_lane, const int width=device_warp_size())
Permute items across the threads in a warp.
Definition: warp_shuffle.hpp:273
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags.
Definition: device_partition.hpp:721
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_HOST_DEVICE constexpr T min(const T &a, const T &b)
Returns the minimum of its arguments.
Definition: functional.hpp:63
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronize all threads in the wavefront.
Definition: thread.hpp:235
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
const unsigned int warp_id
Returns warp id in a block (tile).
Definition: benchmark_warp_exchange.cpp:153
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
Definition: merge_path.hpp:33
ROCPRIM_HOST_DEVICE void swap(T &a, T &b)
Swaps two values.
Definition: functional.hpp:71
Definition: various.hpp:52
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int lane_id()
Returns thread identifier in a warp.
Definition: thread.hpp:93