rocPRIM
warp_load.hpp
1 // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_WARP_LOAD_HPP_
22 #define ROCPRIM_WARP_WARP_LOAD_HPP_
23 
24 #include "../config.hpp"
25 #include "../intrinsics.hpp"
26 #include "../detail/various.hpp"
27 
28 #include "warp_exchange.hpp"
29 #include "../block/block_load_func.hpp"
30 
33 
34 BEGIN_ROCPRIM_NAMESPACE
35 
/// \brief Enumerates the methods available to warp_load for loading data
/// from continuous memory into an arrangement of items across the warp.
enum class warp_load_method
{
    /// Data from continuous memory is loaded into a blocked arrangement of items.
    warp_load_direct,

    /// A striped arrangement of data is read directly from memory.
    warp_load_striped,

    /// Data from continuous memory is loaded into a blocked arrangement of items
    /// using vectorization as an optimization.
    warp_load_vectorize,

    /// A striped arrangement of data from continuous memory is locally
    /// transposed into a blocked arrangement of items.
    warp_load_transpose,

    /// Alias for the method used when none is requested explicitly.
    default_method = warp_load_direct
};
76 
116 template<
117  class T,
118  unsigned int ItemsPerThread,
119  unsigned int WarpSize = ::rocprim::device_warp_size(),
121 >
123 {
124  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
125  "Logical warp size must be a power of two.");
126  static_assert(WarpSize <= ::rocprim::device_warp_size(),
127  "Logical warp size cannot be larger than physical warp size.");
128 
129 private:
130  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
131 
132 public:
141  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
142  using storage_type = typename ::rocprim::detail::empty_storage_type;
143  #else
144  using storage_type = storage_type_; // only for Doxygen
145  #endif
146 
160  template<class InputIterator>
161  ROCPRIM_DEVICE ROCPRIM_INLINE
162  void load(InputIterator input,
163  T (&items)[ItemsPerThread],
164  storage_type& /*storage*/)
165  {
166  using value_type = typename std::iterator_traits<InputIterator>::value_type;
167  static_assert(std::is_convertible<value_type, T>::value,
168  "The type T must be such that an object of type InputIterator "
169  "can be dereferenced and then implicitly converted to T.");
170  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
171  block_load_direct_blocked(flat_id, input, items);
172  }
173 
188  template<class InputIterator>
189  ROCPRIM_DEVICE ROCPRIM_INLINE
190  void load(InputIterator input,
191  T (&items)[ItemsPerThread],
192  unsigned int valid,
193  storage_type& /*storage*/)
194  {
195  using value_type = typename std::iterator_traits<InputIterator>::value_type;
196  static_assert(std::is_convertible<value_type, T>::value,
197  "The type T must be such that an object of type InputIterator "
198  "can be dereferenced and then implicitly converted to T.");
199  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
200  block_load_direct_blocked(flat_id, input, items, valid);
201  }
202 
218  template<
219  class InputIterator,
220  class Default
221  >
222  ROCPRIM_DEVICE ROCPRIM_INLINE
223  void load(InputIterator input,
224  T (&items)[ItemsPerThread],
225  unsigned int valid,
226  Default out_of_bounds,
227  storage_type& /*storage*/)
228  {
229  using value_type = typename std::iterator_traits<InputIterator>::value_type;
230  static_assert(std::is_convertible<value_type, T>::value,
231  "The type T must be such that an object of type InputIterator "
232  "can be dereferenced and then implicitly converted to T.");
233  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
234  block_load_direct_blocked(flat_id, input, items, valid,
235  out_of_bounds);
236  }
237 };
238 
239 #ifndef DOXYGEN_SHOULD_SKIP_THIS
240 
241 template<
242  class T,
243  unsigned int ItemsPerThread,
244  unsigned int WarpSize
245 >
246 class warp_load<T, ItemsPerThread, WarpSize, warp_load_method::warp_load_striped>
247 {
248  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
249  "Logical warp size must be a power of two.");
250  static_assert(WarpSize <= ::rocprim::device_warp_size(),
251  "Logical warp size cannot be larger than physical warp size.");
252 
253 public:
254  using storage_type = typename ::rocprim::detail::empty_storage_type;
255 
256  template<class InputIterator>
257  ROCPRIM_DEVICE ROCPRIM_INLINE
258  void load(InputIterator input,
259  T (&items)[ItemsPerThread],
260  storage_type& /*storage*/)
261  {
262  using value_type = typename std::iterator_traits<InputIterator>::value_type;
263  static_assert(std::is_convertible<value_type, T>::value,
264  "The type T must be such that an object of type InputIterator "
265  "can be dereferenced and then implicitly converted to T.");
266  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
267  block_load_direct_warp_striped<WarpSize>(flat_id, input, items);
268  }
269 
270  template<class InputIterator>
271  ROCPRIM_DEVICE ROCPRIM_INLINE
272  void load(InputIterator input,
273  T (&items)[ItemsPerThread],
274  unsigned int valid,
275  storage_type& /*storage*/)
276  {
277  using value_type = typename std::iterator_traits<InputIterator>::value_type;
278  static_assert(std::is_convertible<value_type, T>::value,
279  "The type T must be such that an object of type InputIterator "
280  "can be dereferenced and then implicitly converted to T.");
281  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
282  block_load_direct_warp_striped<WarpSize>(flat_id, input, items, valid);
283  }
284 
285  template<
286  class InputIterator,
287  class Default
288  >
289  ROCPRIM_DEVICE ROCPRIM_INLINE
290  void load(InputIterator input,
291  T (&items)[ItemsPerThread],
292  unsigned int valid,
293  Default out_of_bounds,
294  storage_type& /*storage*/)
295  {
296  using value_type = typename std::iterator_traits<InputIterator>::value_type;
297  static_assert(std::is_convertible<value_type, T>::value,
298  "The type T must be such that an object of type InputIterator "
299  "can be dereferenced and then implicitly converted to T.");
300  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
301  block_load_direct_warp_striped<WarpSize>(flat_id, input, items, valid,
302  out_of_bounds);
303  }
304 };
305 
306 template<
307  class T,
308  unsigned int ItemsPerThread,
309  unsigned int WarpSize
310 >
311 class warp_load<T, ItemsPerThread, WarpSize, warp_load_method::warp_load_vectorize>
312 {
313  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
314  "Logical warp size must be a power of two.");
315  static_assert(WarpSize <= ::rocprim::device_warp_size(),
316  "Logical warp size cannot be larger than physical warp size.");
317 
318 public:
319  using storage_type = typename ::rocprim::detail::empty_storage_type;
320 
321  ROCPRIM_DEVICE ROCPRIM_INLINE
322  void load(T* input,
323  T (&items)[ItemsPerThread],
324  storage_type& /*storage*/)
325  {
326  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
327  block_load_direct_blocked_vectorized(flat_id, input, items);
328  }
329 
330  template<class InputIterator>
331  ROCPRIM_DEVICE ROCPRIM_INLINE
332  void load(InputIterator input,
333  T (&items)[ItemsPerThread],
334  storage_type& /*storage*/)
335  {
336  using value_type = typename std::iterator_traits<InputIterator>::value_type;
337  static_assert(std::is_convertible<value_type, T>::value,
338  "The type T must be such that an object of type InputIterator "
339  "can be dereferenced and then implicitly converted to T.");
340  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
341  block_load_direct_blocked(flat_id, input, items);
342  }
343 
344  template<class InputIterator>
345  ROCPRIM_DEVICE ROCPRIM_INLINE
346  void load(InputIterator input,
347  T (&items)[ItemsPerThread],
348  unsigned int valid,
349  storage_type& /*storage*/)
350  {
351  using value_type = typename std::iterator_traits<InputIterator>::value_type;
352  static_assert(std::is_convertible<value_type, T>::value,
353  "The type T must be such that an object of type InputIterator "
354  "can be dereferenced and then implicitly converted to T.");
355  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
356  block_load_direct_blocked(flat_id, input, items, valid);
357  }
358 
359  template<
360  class InputIterator,
361  class Default
362  >
363  ROCPRIM_DEVICE ROCPRIM_INLINE
364  void load(InputIterator input,
365  T (&items)[ItemsPerThread],
366  unsigned int valid,
367  Default out_of_bounds,
368  storage_type& /*storage*/)
369  {
370  using value_type = typename std::iterator_traits<InputIterator>::value_type;
371  static_assert(std::is_convertible<value_type, T>::value,
372  "The type T must be such that an object of type InputIterator "
373  "can be dereferenced and then implicitly converted to T.");
374  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
375  block_load_direct_blocked(flat_id, input, items, valid,
376  out_of_bounds);
377  }
378 };
379 
380 template<
381  class T,
382  unsigned int ItemsPerThread,
383  unsigned int WarpSize
384 >
385 class warp_load<T, ItemsPerThread, WarpSize, warp_load_method::warp_load_transpose>
386 {
387  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
388  "Logical warp size must be a power of two.");
389  static_assert(WarpSize <= ::rocprim::device_warp_size(),
390  "Logical warp size cannot be larger than physical warp size.");
391 
392 private:
393  using exchange_type = ::rocprim::warp_exchange<T, ItemsPerThread, WarpSize>;
394 
395 public:
396  using storage_type = typename exchange_type::storage_type;
397 
398  template<class InputIterator>
399  ROCPRIM_DEVICE ROCPRIM_INLINE
400  void load(InputIterator input,
401  T (&items)[ItemsPerThread],
402  storage_type& storage)
403  {
404  using value_type = typename std::iterator_traits<InputIterator>::value_type;
405  static_assert(std::is_convertible<value_type, T>::value,
406  "The type T must be such that an object of type InputIterator "
407  "can be dereferenced and then implicitly converted to T.");
408  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
409  block_load_direct_warp_striped<WarpSize>(flat_id, input, items);
410  exchange_type().striped_to_blocked(items, items, storage);
411  }
412 
413  template<class InputIterator>
414  ROCPRIM_DEVICE ROCPRIM_INLINE
415  void load(InputIterator input,
416  T (&items)[ItemsPerThread],
417  unsigned int valid,
418  storage_type& storage)
419  {
420  using value_type = typename std::iterator_traits<InputIterator>::value_type;
421  static_assert(std::is_convertible<value_type, T>::value,
422  "The type T must be such that an object of type InputIterator "
423  "can be dereferenced and then implicitly converted to T.");
424  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
425  block_load_direct_warp_striped<WarpSize>(flat_id, input, items, valid);
426  exchange_type().striped_to_blocked(items, items, storage);
427  }
428 
429  template<
430  class InputIterator,
431  class Default
432  >
433  ROCPRIM_DEVICE ROCPRIM_INLINE
434  void load(InputIterator input,
435  T (&items)[ItemsPerThread],
436  unsigned int valid,
437  Default out_of_bounds,
438  storage_type& storage)
439  {
440  using value_type = typename std::iterator_traits<InputIterator>::value_type;
441  static_assert(std::is_convertible<value_type, T>::value,
442  "The type T must be such that an object of type InputIterator "
443  "can be dereferenced and then implicitly converted to T.");
444  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
445  block_load_direct_warp_striped<WarpSize>(flat_id, input, items, valid,
446  out_of_bounds);
447  exchange_type().striped_to_blocked(items, items, storage);
448  }
449 };
450 
451 #endif // DOXYGEN_SHOULD_SKIP_THIS
452 
453 END_ROCPRIM_NAMESPACE
454 
456 // end of group warpmodule
457 
458 #endif // ROCPRIM_WARP_WARP_LOAD_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds, storage_type &)
Loads data from continuous memory into an arrangement of items across the warp.
Definition: warp_load.hpp:223
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator input, T(&items)[ItemsPerThread], unsigned int valid, storage_type &)
Loads data from continuous memory into an arrangement of items across the warp.
Definition: warp_load.hpp:190
Data from continuous memory is loaded into a blocked arrangement of items.
A striped arrangement of data is read directly from memory.
Defaults to warp_load_direct.
ROCPRIM_DEVICE ROCPRIM_INLINE auto block_load_direct_blocked_vectorized(unsigned int flat_id, T *block_input, U(&items)[ItemsPerThread]) -> typename std::enable_if< detail::is_vectorizable< T, ItemsPerThread >::value >::type
Loads data from continuous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:186
typename ::rocprim::detail::empty_storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: warp_load.hpp:142
warp_load_method
warp_load_method enumerates the methods available to load data from continuous memory into a blocked/...
Definition: warp_load.hpp:38
ROCPRIM_DEVICE ROCPRIM_INLINE void load(InputIterator input, T(&items)[ItemsPerThread], storage_type &)
Loads data from continuous memory into an arrangement of items across the warp.
Definition: warp_load.hpp:162
Data from continuous memory is loaded into a blocked arrangement of items using vectorization as an o...
A striped arrangement of data from continuous memory is locally transposed into a blocked arrangement...
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread])
Loads data from continuous memory into a blocked arrangement of items across the thread block...
Definition: block_load_func.hpp:58
The warp_load class is a warp level parallel primitive which provides methods for loading data from c...
Definition: warp_load.hpp:122