rocPRIM
warp_reduce_dpp.hpp
1 // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_DETAIL_WARP_REDUCE_DPP_HPP_
22 #define ROCPRIM_WARP_DETAIL_WARP_REDUCE_DPP_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../intrinsics.hpp"
28 #include "../../types.hpp"
29 #include "../../detail/various.hpp"
30 
31 #include "warp_reduce_shuffle.hpp"
32 
33 BEGIN_ROCPRIM_NAMESPACE
34 
35 namespace detail
36 {
37 
38 template<
39  class T,
40  unsigned int WarpSize,
41  bool UseAllReduce
42 >
44 {
45 public:
46  static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2");
47 
49 
50  template<class BinaryFunction>
51  ROCPRIM_DEVICE ROCPRIM_INLINE
52  void reduce(T input, T& output, BinaryFunction reduce_op)
53  {
54  output = input;
55 
56  if(WarpSize > 1)
57  {
58  // quad_perm:[1,0,3,2] -> 10110001
59  output = reduce_op(warp_move_dpp<T, 0xb1>(output), output);
60  }
61  if(WarpSize > 2)
62  {
63  // quad_perm:[2,3,0,1] -> 01001110
64  output = reduce_op(warp_move_dpp<T, 0x4e>(output), output);
65  }
66  if(WarpSize > 4)
67  {
68  // row_shr:4
69  output = reduce_op(warp_move_dpp<T, 0x114>(output), output);
70  }
71  if(WarpSize > 8)
72  {
73  // row_shr:8
74  output = reduce_op(warp_move_dpp<T, 0x118>(output), output);
75  }
76 #if ROCPRIM_NAVI
77  if(WarpSize > 16)
78  {
79  // row_bcast:15
80  output = reduce_op(warp_swizzle<T, 0x1e0>(output), output);
81  }
82 #else
83  if(WarpSize > 16)
84  {
85  // row_bcast:15
86  output = reduce_op(warp_move_dpp<T, 0x142>(output), output);
87  }
88  if(WarpSize > 32)
89  {
90  // row_bcast:31
91  output = reduce_op(warp_move_dpp<T, 0x143>(output), output);
92  }
93 #endif
94  // Read the result from the last lane of the logical warp
95  output = warp_shuffle(output, WarpSize - 1, WarpSize);
96  }
97 
98  template<class BinaryFunction>
99  ROCPRIM_DEVICE ROCPRIM_INLINE
100  void reduce(T input, T& output, storage_type& storage, BinaryFunction reduce_op)
101  {
102  (void) storage; // disables unused parameter warning
103  this->reduce(input, output, reduce_op);
104  }
105 
106  template<class BinaryFunction>
107  ROCPRIM_DEVICE ROCPRIM_INLINE
108  void reduce(T input, T& output, unsigned int valid_items, BinaryFunction reduce_op)
109  {
110  // Fallback to shuffle-based implementation
112  .reduce(input, output, valid_items, reduce_op);
113  }
114 
115  template<class BinaryFunction>
116  ROCPRIM_DEVICE ROCPRIM_INLINE
117  void reduce(T input, T& output, unsigned int valid_items,
118  storage_type& storage, BinaryFunction reduce_op)
119  {
120  (void) storage; // disables unused parameter warning
121  this->reduce(input, output, valid_items, reduce_op);
122  }
123 
124  template<class Flag, class BinaryFunction>
125  ROCPRIM_DEVICE ROCPRIM_INLINE
126  void head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op)
127  {
128  // Fallback to shuffle-based implementation
130  .head_segmented_reduce(input, output, flag, reduce_op);
131  }
132 
133  template<class Flag, class BinaryFunction>
134  ROCPRIM_DEVICE ROCPRIM_INLINE
135  void tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op)
136  {
137  // Fallback to shuffle-based implementation
139  .tail_segmented_reduce(input, output, flag, reduce_op);
140  }
141 
142  template<class Flag, class BinaryFunction>
143  ROCPRIM_DEVICE ROCPRIM_INLINE
144  void head_segmented_reduce(T input, T& output, Flag flag,
145  storage_type& storage, BinaryFunction reduce_op)
146  {
147  // Fallback to shuffle-based implementation
149  .head_segmented_reduce(input, output, flag, storage, reduce_op);
150  }
151 
152  template<class Flag, class BinaryFunction>
153  ROCPRIM_DEVICE ROCPRIM_INLINE
154  void tail_segmented_reduce(T input, T& output, Flag flag,
155  storage_type& storage, BinaryFunction reduce_op)
156  {
157  // Fallback to shuffle-based implementation
159  .tail_segmented_reduce(input, output, flag, storage, reduce_op);
160  }
161 };
162 
163 } // end namespace detail
164 
165 END_ROCPRIM_NAMESPACE
166 
167 #endif // ROCPRIM_WARP_DETAIL_WARP_REDUCE_DPP_HPP_
Definition: warp_reduce_dpp.hpp:43
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
Definition: benchmark_block_reduce.cpp:63
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: warp_reduce_shuffle.hpp:43
Definition: various.hpp:52