rocPRIM
warp_scan.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_WARP_SCAN_HPP_
22 #define ROCPRIM_WARP_WARP_SCAN_HPP_
23 
24 #include <type_traits>
25 
26 #include "../config.hpp"
27 #include "../detail/various.hpp"
28 
29 #include "../intrinsics.hpp"
30 #include "../functional.hpp"
31 #include "../types.hpp"
32 
33 #include "detail/warp_scan_crosslane.hpp"
34 #include "detail/warp_scan_shared_mem.hpp"
35 
38 
39 BEGIN_ROCPRIM_NAMESPACE
40 
41 namespace detail
42 {
43 
44 // Select warp_scan implementation based on WarpSize
45 template<class T, unsigned int WarpSize>
47 {
48  typedef typename std::conditional<
49  // can we use crosslane (DPP or shuffle-based) implementation?
51  detail::warp_scan_crosslane<T, WarpSize>, // yes
53  >::type type;
54 };
55 
56 } // end namespace detail
57 
107 template<
108  class T,
109  unsigned int WarpSize = device_warp_size()
110 >
112 #ifndef DOXYGEN_SHOULD_SKIP_THIS
113  : private detail::select_warp_scan_impl<T, WarpSize>::type
114 #endif
115 {
116  using base_type = typename detail::select_warp_scan_impl<T, WarpSize>::type;
117 
118  // Check if WarpSize is valid for the targets
119  static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE, "WarpSize can't be greater than hardware warp size.");
120 
121 public:
130  using storage_type = typename base_type::storage_type;
131 
180  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
181  ROCPRIM_DEVICE ROCPRIM_INLINE
182  auto inclusive_scan(T input,
183  T& output,
184  storage_type& storage,
185  BinaryFunction scan_op = BinaryFunction())
186  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
187  {
188  base_type::inclusive_scan(input, output, storage, scan_op);
189  }
190 
193  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
194  ROCPRIM_DEVICE ROCPRIM_INLINE
195  auto inclusive_scan(T ,
196  T& ,
197  storage_type& ,
198  BinaryFunction scan_op = BinaryFunction())
199  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
200  {
201  (void) scan_op;
202  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
203  return;
204  }
205 
255  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
256  ROCPRIM_DEVICE ROCPRIM_INLINE
257  auto inclusive_scan(T input,
258  T& output,
259  T& reduction,
260  storage_type& storage,
261  BinaryFunction scan_op = BinaryFunction())
262  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
263  {
264  base_type::inclusive_scan(input, output, reduction, storage, scan_op);
265  }
266 
269  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
270  ROCPRIM_DEVICE ROCPRIM_INLINE
271  auto inclusive_scan(T ,
272  T& ,
273  T& ,
274  storage_type& ,
275  BinaryFunction scan_op = BinaryFunction())
276  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
277  {
278  (void) scan_op;
279  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
280  return;
281  }
282 
335  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
336  ROCPRIM_DEVICE ROCPRIM_INLINE
337  auto exclusive_scan(T input,
338  T& output,
339  T init,
340  storage_type& storage,
341  BinaryFunction scan_op = BinaryFunction())
342  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
343  {
344  base_type::exclusive_scan(input, output, init, storage, scan_op);
345  }
346 
349  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
350  ROCPRIM_DEVICE ROCPRIM_INLINE
351  auto exclusive_scan(T ,
352  T& ,
353  T ,
354  storage_type& ,
355  BinaryFunction scan_op = BinaryFunction())
356  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
357  {
358  (void) scan_op;
359  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
360  return;
361  }
362 
416  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
417  ROCPRIM_DEVICE ROCPRIM_INLINE
418  auto exclusive_scan(T input,
419  T& output,
420  T init,
421  T& reduction,
422  storage_type& storage,
423  BinaryFunction scan_op = BinaryFunction())
424  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
425  {
426  base_type::exclusive_scan(input, output, init, reduction, storage, scan_op);
427  }
428 
431  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
432  ROCPRIM_DEVICE ROCPRIM_INLINE
433  auto exclusive_scan(T ,
434  T& ,
435  T ,
436  T& ,
437  storage_type& ,
438  BinaryFunction scan_op = BinaryFunction())
439  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
440  {
441  (void) scan_op;
442  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
443  return;
444  }
445 
504  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
505  ROCPRIM_DEVICE ROCPRIM_INLINE
506  auto scan(T input,
507  T& inclusive_output,
508  T& exclusive_output,
509  T init,
510  storage_type& storage,
511  BinaryFunction scan_op = BinaryFunction())
512  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
513  {
514  base_type::scan(input, inclusive_output, exclusive_output, init, storage, scan_op);
515  }
516 
519  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
520  ROCPRIM_DEVICE ROCPRIM_INLINE
521  auto scan(T ,
522  T& ,
523  T& ,
524  T ,
525  storage_type& ,
526  BinaryFunction scan_op = BinaryFunction())
527  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
528  {
529  (void) scan_op;
530  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
531  return;
532  }
533 
592  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
593  ROCPRIM_DEVICE ROCPRIM_INLINE
594  auto scan(T input,
595  T& inclusive_output,
596  T& exclusive_output,
597  T init,
598  T& reduction,
599  storage_type& storage,
600  BinaryFunction scan_op = BinaryFunction())
601  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
602  {
603  base_type::scan(
604  input, inclusive_output, exclusive_output, init, reduction,
605  storage, scan_op
606  );
607  }
608 
611  template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
612  ROCPRIM_DEVICE ROCPRIM_INLINE
613  auto scan(T ,
614  T& ,
615  T& ,
616  T ,
617  T& ,
618  storage_type& ,
619  BinaryFunction scan_op = BinaryFunction())
620  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
621  {
622  (void) scan_op;
623  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size . Aborting warp sort.");
624  return;
625  }
626 
636  template<unsigned int FunctionWarpSize = WarpSize>
637  ROCPRIM_DEVICE ROCPRIM_INLINE
638  auto broadcast(T input,
639  const unsigned int src_lane,
640  storage_type& storage)
641  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), T>::type
642  {
643  return base_type::broadcast(input, src_lane, storage);
644  }
645 
648  template<unsigned int FunctionWarpSize = WarpSize>
649  ROCPRIM_DEVICE ROCPRIM_INLINE
650  auto broadcast(T ,
651  const unsigned int ,
652  storage_type& )
653  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T>::type
654  {
655  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort.");
656  return T();
657  }
658 
659 #ifndef DOXYGEN_SHOULD_SKIP_THIS
660 protected:
661 
662  template<unsigned int FunctionWarpSize = WarpSize>
663  ROCPRIM_DEVICE ROCPRIM_INLINE
664  auto to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage)
665  -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
666  {
667  return base_type::to_exclusive(inclusive_input, exclusive_output, storage);
668  }
669 
670  template<unsigned int FunctionWarpSize = WarpSize>
671  ROCPRIM_DEVICE ROCPRIM_INLINE
672  auto to_exclusive(T , T& , storage_type&)
673  -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
674  {
675  ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp sort.");
676  return;
677  }
678 #endif
679 };
680 
681 END_ROCPRIM_NAMESPACE
682 
684 // end of group warpmodule
685 
686 #endif // ROCPRIM_WARP_WARP_SCAN_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T &inclusive_output, T &exclusive_output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp...
Definition: warp_scan.hpp:594
The warp_scan class is a warp level parallel primitive which provides methods for performing inclusiv...
Definition: warp_scan.hpp:111
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T &output, T init, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:418
typename base_type::storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: warp_scan.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:351
ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T, const unsigned int, storage_type &) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), T >::type
Broadcasts value from one thread to all threads in logical warp.
Definition: warp_scan.hpp:650
Definition: warp_scan.hpp:46
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T input, T &inclusive_output, T &exclusive_output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations across threads in a logical warp.
Definition: warp_scan.hpp:506
hipError_t exclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel exclusive scan primitive for device level.
Definition: device_scan.hpp:651
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T, T &, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:271
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:433
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T, T &, T &, T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations, and reduction across threads in a logical warp (invalid warp size overload).
Definition: warp_scan.hpp:613
ROCPRIM_DEVICE ROCPRIM_INLINE auto scan(T, T &, T &, T, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive and exclusive scan operations across threads in a logical warp (invalid warp size overload).
Definition: warp_scan.hpp:521
ROCPRIM_DEVICE ROCPRIM_INLINE auto exclusive_scan(T input, T &output, T init, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs exclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:337
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T &output, T &reduction, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan and reduction across threads in a logical warp.
Definition: warp_scan.hpp:257
Definition: warp_scan_shared_mem.hpp:41
hipError_t inclusive_scan(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const size_t size, BinaryFunction scan_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel inclusive scan primitive for device level.
Definition: device_scan.hpp:539
#define ROCPRIM_PRINT_ERROR_ONCE(message)
Prints the supplied error message only once (using only one of the active threads).
Definition: functional.hpp:42
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T, T &, storage_type &, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:195
ROCPRIM_DEVICE ROCPRIM_INLINE auto broadcast(T input, const unsigned int src_lane, storage_type &storage) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), T >::type
Broadcasts value from one thread to all threads in logical warp.
Definition: warp_scan.hpp:638
ROCPRIM_DEVICE ROCPRIM_INLINE auto inclusive_scan(T input, T &output, storage_type &storage, BinaryFunction scan_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs inclusive scan across threads in a logical warp.
Definition: warp_scan.hpp:182
Definition: various.hpp:108