rocPRIM
warp_scan_shuffle.hpp
1 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_DETAIL_WARP_SCAN_SHUFFLE_HPP_
22 #define ROCPRIM_WARP_DETAIL_WARP_SCAN_SHUFFLE_HPP_
23 
24 #include <type_traits>
25 
26 #include "../../config.hpp"
27 #include "../../detail/various.hpp"
28 
29 #include "../../intrinsics.hpp"
30 #include "../../types.hpp"
31 
32 BEGIN_ROCPRIM_NAMESPACE
33 
34 namespace detail
35 {
36 
37 template<
38  class T,
39  unsigned int WarpSize
40 >
42 {
43 public:
44  static_assert(detail::is_power_of_two(WarpSize), "WarpSize must be power of 2");
45 
47 
48  template<class BinaryFunction>
49  ROCPRIM_DEVICE ROCPRIM_INLINE
50  void inclusive_scan(T input, T& output, BinaryFunction scan_op)
51  {
52  output = input;
53 
54  T value;
55  const unsigned int id = detail::logical_lane_id<WarpSize>();
56  ROCPRIM_UNROLL
57  for(unsigned int offset = 1; offset < WarpSize; offset *= 2)
58  {
59  value = warp_shuffle_up(output, offset, WarpSize);
60  if(id >= offset) output = scan_op(value, output);
61  }
62  }
63 
64  template<class BinaryFunction>
65  ROCPRIM_DEVICE ROCPRIM_INLINE
66  void inclusive_scan(T input, T& output,
67  storage_type& storage, BinaryFunction scan_op)
68  {
69  (void) storage; // disables unused parameter warning
70  inclusive_scan(input, output, scan_op);
71  }
72 
73  template<class BinaryFunction>
74  ROCPRIM_DEVICE ROCPRIM_INLINE
75  void inclusive_scan(T input, T& output, T& reduction,
76  BinaryFunction scan_op)
77  {
78  inclusive_scan(input, output, scan_op);
79  // Broadcast value from the last thread in warp
80  reduction = warp_shuffle(output, WarpSize-1, WarpSize);
81  }
82 
83  template<class BinaryFunction>
84  ROCPRIM_DEVICE ROCPRIM_INLINE
85  void inclusive_scan(T input, T& output, T& reduction,
86  storage_type& storage, BinaryFunction scan_op)
87  {
88  (void) storage;
89  inclusive_scan(input, output, reduction, scan_op);
90  }
91 
92  template<class BinaryFunction>
93  ROCPRIM_DEVICE ROCPRIM_INLINE
94  void exclusive_scan(T input, T& output, T init, BinaryFunction scan_op)
95  {
96  inclusive_scan(input, output, scan_op);
97  // Convert inclusive scan result to exclusive
98  to_exclusive(output, output, init, scan_op);
99  }
100 
101  template<class BinaryFunction>
102  ROCPRIM_DEVICE ROCPRIM_INLINE
103  void exclusive_scan(T input, T& output, T init,
104  storage_type& storage, BinaryFunction scan_op)
105  {
106  (void) storage; // disables unused parameter warning
107  exclusive_scan(input, output, init, scan_op);
108  }
109 
110  template<class BinaryFunction>
111  ROCPRIM_DEVICE ROCPRIM_INLINE
112  void exclusive_scan(T input, T& output,
113  storage_type& storage, BinaryFunction scan_op)
114  {
115  (void) storage; // disables unused parameter warning
116  inclusive_scan(input, output, scan_op);
117  // Convert inclusive scan result to exclusive
118  to_exclusive(output, output);
119  }
120 
121  template<class BinaryFunction>
122  ROCPRIM_DEVICE ROCPRIM_INLINE
123  void exclusive_scan(T input, T& output, T init, T& reduction,
124  BinaryFunction scan_op)
125  {
126  inclusive_scan(input, output, scan_op);
127  // Broadcast value from the last thread in warp
128  reduction = warp_shuffle(output, WarpSize-1, WarpSize);
129  // Convert inclusive scan result to exclusive
130  to_exclusive(output, output, init, scan_op);
131  }
132 
133  template<class BinaryFunction>
134  ROCPRIM_DEVICE ROCPRIM_INLINE
135  void exclusive_scan(T input, T& output, T init, T& reduction,
136  storage_type& storage, BinaryFunction scan_op)
137  {
138  (void) storage;
139  exclusive_scan(input, output, init, reduction, scan_op);
140  }
141 
142  template<class BinaryFunction>
143  ROCPRIM_DEVICE ROCPRIM_INLINE
144  void scan(T input, T& inclusive_output, T& exclusive_output, T init,
145  BinaryFunction scan_op)
146  {
147  inclusive_scan(input, inclusive_output, scan_op);
148  // Convert inclusive scan result to exclusive
149  to_exclusive(inclusive_output, exclusive_output, init, scan_op);
150  }
151 
152  template<class BinaryFunction>
153  ROCPRIM_DEVICE ROCPRIM_INLINE
154  void scan(T input, T& inclusive_output, T& exclusive_output, T init,
155  storage_type& storage, BinaryFunction scan_op)
156  {
157  (void) storage; // disables unused parameter warning
158  scan(input, inclusive_output, exclusive_output, init, scan_op);
159  }
160 
161  template<class BinaryFunction>
162  ROCPRIM_DEVICE ROCPRIM_INLINE
163  void scan(T input, T& inclusive_output, T& exclusive_output,
164  storage_type& storage, BinaryFunction scan_op)
165  {
166  (void) storage; // disables unused parameter warning
167  inclusive_scan(input, inclusive_output, scan_op);
168  // Convert inclusive scan result to exclusive
169  to_exclusive(inclusive_output, exclusive_output);
170  }
171 
172  template<class BinaryFunction>
173  ROCPRIM_DEVICE ROCPRIM_INLINE
174  void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction,
175  BinaryFunction scan_op)
176  {
177  inclusive_scan(input, inclusive_output, scan_op);
178  // Broadcast value from the last thread in warp
179  reduction = warp_shuffle(inclusive_output, WarpSize-1, WarpSize);
180  // Convert inclusive scan result to exclusive
181  to_exclusive(inclusive_output, exclusive_output, init, scan_op);
182  }
183 
184  template<class BinaryFunction>
185  ROCPRIM_DEVICE ROCPRIM_INLINE
186  void scan(T input, T& inclusive_output, T& exclusive_output, T init, T& reduction,
187  storage_type& storage, BinaryFunction scan_op)
188  {
189  (void) storage;
190  scan(input, inclusive_output, exclusive_output, init, reduction, scan_op);
191  }
192 
193  ROCPRIM_DEVICE ROCPRIM_INLINE
194  T broadcast(T input, const unsigned int src_lane, storage_type& storage)
195  {
196  (void) storage;
197  return warp_shuffle(input, src_lane, WarpSize);
198  }
199 
200 protected:
201  ROCPRIM_DEVICE ROCPRIM_INLINE
202  void to_exclusive(T inclusive_input, T& exclusive_output, storage_type& storage)
203  {
204  (void) storage;
205  return to_exclusive(inclusive_input, exclusive_output);
206  }
207 
208 private:
209  // Changes inclusive scan results to exclusive scan results
210  template<class BinaryFunction>
211  ROCPRIM_DEVICE ROCPRIM_INLINE
212  void to_exclusive(T inclusive_input, T& exclusive_output, T init,
213  BinaryFunction scan_op)
214  {
215  // include init value in scan results
216  exclusive_output = scan_op(init, inclusive_input);
217  // get exclusive results
218  exclusive_output = warp_shuffle_up(exclusive_output, 1, WarpSize);
219  if(detail::logical_lane_id<WarpSize>() == 0)
220  {
221  exclusive_output = init;
222  }
223  }
224 
225  ROCPRIM_DEVICE ROCPRIM_INLINE
226  void to_exclusive(T inclusive_input, T& exclusive_output)
227  {
228  // shift to get exclusive results
229  exclusive_output = warp_shuffle_up(inclusive_input, 1, WarpSize);
230  }
231 };
232 
233 } // end namespace detail
234 
235 END_ROCPRIM_NAMESPACE
236 
237 #endif // ROCPRIM_WARP_DETAIL_WARP_SCAN_SHUFFLE_HPP_
Definition: benchmark_block_scan.cpp:63
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_up(const T &input, const unsigned int delta, const int width=device_warp_size())
Shuffle up for any data type.
Definition: warp_shuffle.hpp:197
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: warp_scan_shuffle.hpp:41
Definition: various.hpp:52
Definition: benchmark_block_scan.cpp:100