rocPRIM
warp_exchange.hpp
1 // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_WARP_EXCHANGE_HPP_
22 #define ROCPRIM_WARP_WARP_EXCHANGE_HPP_
23 
24 #include "../config.hpp"
25 #include "../detail/various.hpp"
26 
27 #include "../intrinsics.hpp"
28 #include "../intrinsics/warp_shuffle.hpp"
29 #include "../functional.hpp"
30 #include "../types.hpp"
31 
34 
35 BEGIN_ROCPRIM_NAMESPACE
36 
75 template<
76  class T,
77  unsigned int ItemsPerThread,
78  unsigned int WarpSize = ::rocprim::device_warp_size()
79 >
81 {
82  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
83  "Logical warp size must be a power of two.");
84  static_assert(WarpSize <= ::rocprim::device_warp_size(),
85  "Logical warp size cannot be larger than physical warp size.");
86 
87  // Struct used for creating a raw_storage object for this primitive's temporary storage.
88  struct storage_type_
89  {
90  T buffer[WarpSize * ItemsPerThread];
91  };
92 
93 public:
94 
103  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
105  #else
106  using storage_type = storage_type_; // only for Doxygen
107  #endif
108 
143  template<class U>
144  ROCPRIM_DEVICE ROCPRIM_INLINE
145  void blocked_to_striped(const T (&input)[ItemsPerThread],
146  U (&output)[ItemsPerThread],
147  storage_type& storage)
148  {
149  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
150  storage_type_& storage_ = storage.get();
151 
152  ROCPRIM_UNROLL
153  for(unsigned int i = 0; i < ItemsPerThread; i++)
154  {
155  storage_.buffer[flat_id * ItemsPerThread + i] = input[i];
156  }
158 
159  ROCPRIM_UNROLL
160  for(unsigned int i = 0; i < ItemsPerThread; i++)
161  {
162  output[i] = storage_.buffer[i * WarpSize + flat_id];
163  }
164  }
165 
195  template<class U>
196  ROCPRIM_DEVICE ROCPRIM_INLINE
197  void blocked_to_striped_shuffle(const T (&input)[ItemsPerThread],
198  U (&output)[ItemsPerThread])
199  {
200  static_assert(WarpSize % ItemsPerThread == 0,
201  "ItemsPerThread must be a divisor of WarpSize to use blocked_to_striped_shuffle");
202  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
203  U work_array[ItemsPerThread];
204 
205  ROCPRIM_UNROLL
206  for(unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
207  {
208  ROCPRIM_UNROLL
209  for(unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
210  {
211  const auto value = ::rocprim::warp_shuffle(
212  input[src_idx],
213  flat_id / ItemsPerThread + dst_idx * (WarpSize / ItemsPerThread)
214  );
215  if(src_idx == flat_id % ItemsPerThread)
216  {
217  work_array[dst_idx] = value;
218  }
219  }
220  }
221 
222  ROCPRIM_UNROLL
223  for(unsigned int i = 0; i < ItemsPerThread; i++)
224  {
225  output[i] = work_array[i];
226  }
227  }
228 
263  template<class U>
264  ROCPRIM_DEVICE ROCPRIM_INLINE
265  void striped_to_blocked(const T (&input)[ItemsPerThread],
266  U (&output)[ItemsPerThread],
267  storage_type& storage)
268  {
269  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
270  storage_type_& storage_ = storage.get();
271 
272  ROCPRIM_UNROLL
273  for(unsigned int i = 0; i < ItemsPerThread; i++)
274  {
275  storage_.buffer[i * WarpSize + flat_id] = input[i];
276  }
278 
279  ROCPRIM_UNROLL
280  for(unsigned int i = 0; i < ItemsPerThread; i++)
281  {
282  output[i] = storage_.buffer[flat_id * ItemsPerThread + i];
283  }
284  }
285 
315  template<class U>
316  ROCPRIM_DEVICE ROCPRIM_INLINE
317  void striped_to_blocked_shuffle(const T (&input)[ItemsPerThread],
318  U (&output)[ItemsPerThread])
319  {
320  static_assert(WarpSize % ItemsPerThread == 0,
321  "ItemsPerThread must be a divisor of WarpSize to use striped_to_blocked_shuffle");
322  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
323  U work_array[ItemsPerThread];
324 
325  ROCPRIM_UNROLL
326  for(unsigned int dst_idx = 0; dst_idx < ItemsPerThread; dst_idx++)
327  {
328  ROCPRIM_UNROLL
329  for(unsigned int src_idx = 0; src_idx < ItemsPerThread; src_idx++)
330  {
331  const auto value = ::rocprim::warp_shuffle(
332  input[src_idx],
333  (ItemsPerThread * flat_id + dst_idx) % WarpSize
334  );
335  if(flat_id / (WarpSize / ItemsPerThread) == src_idx)
336  {
337  work_array[dst_idx] = value;
338  }
339  }
340  }
341 
342  ROCPRIM_UNROLL
343  for(unsigned int i = 0; i < ItemsPerThread; i++)
344  {
345  output[i] = work_array[i];
346  }
347  }
348 
388  template<class U, class OffsetT>
389  ROCPRIM_DEVICE ROCPRIM_INLINE
391  const T (&input)[ItemsPerThread],
392  U (&output)[ItemsPerThread],
393  const OffsetT (&ranks)[ItemsPerThread],
394  storage_type& storage)
395  {
396  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
397  storage_type_& storage_ = storage.get();
398 
399  ROCPRIM_UNROLL
400  for (unsigned int i = 0; i < ItemsPerThread; i++)
401  {
402  storage_.buffer[ranks[i]] = input[i];
403  }
405 
406  ROCPRIM_UNROLL
407  for (unsigned int i = 0; i < ItemsPerThread; i++)
408  {
409  unsigned int item_offset = (i * WarpSize) + flat_id;
410  output[i] = storage_.buffer[item_offset];
411  }
412  }
413 };
414 
415 END_ROCPRIM_NAMESPACE
416 
418 // end of group warpmodule
419 
420 #endif // ROCPRIM_WARP_WARP_EXCHANGE_HPP_
The warp_exchange class is a warp level parallel primitive which provides methods for rearranging items partitioned across threads in a warp.
Definition: warp_exchange.hpp:80
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const OffsetT(&ranks)[ItemsPerThread], storage_type &storage)
Orders input values according to ranks using temporary storage, then writes the values to output in a striped manner.
Definition: warp_exchange.hpp:390
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronize all threads in the wavefront.
Definition: thread.hpp:235
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the warp, using warp shuffle operations.
Definition: warp_exchange.hpp:197
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a blocked arrangement of items to a striped arrangement across the warp, using temporary storage.
Definition: warp_exchange.hpp:145
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked_shuffle(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a striped arrangement of items to a blocked arrangement across the warp, using warp shuffle operations.
Definition: warp_exchange.hpp:317
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
Transposes a striped arrangement of items to a blocked arrangement across the warp, using temporary storage.
Definition: warp_exchange.hpp:265