rocPRIM
warp_store.hpp
1 // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_WARP_WARP_STORE_HPP_
22 #define ROCPRIM_WARP_WARP_STORE_HPP_
23 
24 #include "../config.hpp"
25 #include "../intrinsics.hpp"
26 #include "../detail/various.hpp"
27 
28 #include "warp_exchange.hpp"
29 #include "../block/block_store_func.hpp"
30 
33 
34 BEGIN_ROCPRIM_NAMESPACE
35 
39 {
46 
50 
65 
74 
77 };
78 
118 template<
119  class T,
120  unsigned int ItemsPerThread,
121  unsigned int WarpSize = ::rocprim::device_warp_size(),
123 >
125 {
126  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
127  "Logical warp size must be a power of two.");
128  static_assert(WarpSize <= ::rocprim::device_warp_size(),
129  "Logical warp size cannot be larger than physical warp size.");
130 
131 private:
132  using storage_type_ = typename ::rocprim::detail::empty_storage_type;
133 
134 public:
143  #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen
144  using storage_type = typename ::rocprim::detail::empty_storage_type;
145  #else
146  using storage_type = storage_type_; // only for Doxygen
147  #endif
148 
165  template<class OutputIterator>
166  ROCPRIM_DEVICE ROCPRIM_INLINE
167  void store(OutputIterator output,
168  T (&items)[ItemsPerThread],
169  storage_type& /*storage*/)
170  {
171  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
172  static_assert(std::is_convertible<T, value_type>::value,
173  "The type T must be such that an object of type OutputIterator "
174  "can be dereferenced and then implicitly assigned from T.");
175  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
176  block_store_direct_blocked(flat_id, output, items);
177  }
178 
197  template<class OutputIterator>
198  ROCPRIM_DEVICE ROCPRIM_INLINE
199  void store(OutputIterator output,
200  T (&items)[ItemsPerThread],
201  unsigned int valid,
202  storage_type& /*storage*/)
203  {
204  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
205  static_assert(std::is_convertible<T, value_type>::value,
206  "The type T must be such that an object of type OutputIterator "
207  "can be dereferenced and then implicitly assigned from T.");
208  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
209  block_store_direct_blocked(flat_id, output, items, valid);
210  }
211 };
212 
213 #ifndef DOXYGEN_SHOULD_SKIP_THIS
214 
215 template<
216  class T,
217  unsigned int ItemsPerThread,
218  unsigned int WarpSize
219 >
220 class warp_store<T, ItemsPerThread, WarpSize, warp_store_method::warp_store_striped>
221 {
222  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
223  "Logical warp size must be a power of two.");
224  static_assert(WarpSize <= ::rocprim::device_warp_size(),
225  "Logical warp size cannot be larger than physical warp size.");
226 
227 public:
228  using storage_type = typename ::rocprim::detail::empty_storage_type;
229 
230  template<class OutputIterator>
231  ROCPRIM_DEVICE ROCPRIM_INLINE
232  void store(OutputIterator output,
233  T (&items)[ItemsPerThread],
234  storage_type& /*storage*/)
235  {
236  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
237  static_assert(std::is_convertible<T, value_type>::value,
238  "The type T must be such that an object of type OutputIterator "
239  "can be dereferenced and then implicitly assigned from T.");
240  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
241  block_store_direct_warp_striped<WarpSize>(flat_id, output, items);
242  }
243 
244  template<class OutputIterator>
245  ROCPRIM_DEVICE ROCPRIM_INLINE
246  void store(OutputIterator output,
247  T (&items)[ItemsPerThread],
248  unsigned int valid,
249  storage_type& /*storage*/)
250  {
251  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
252  static_assert(std::is_convertible<T, value_type>::value,
253  "The type T must be such that an object of type OutputIterator "
254  "can be dereferenced and then implicitly assigned from T.");
255  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
256  block_store_direct_warp_striped<WarpSize>(flat_id, output, items, valid);
257  }
258 };
259 
260 template<
261  class T,
262  unsigned int ItemsPerThread,
263  unsigned int WarpSize
264 >
265 class warp_store<T, ItemsPerThread, WarpSize, warp_store_method::warp_store_vectorize>
266 {
267  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
268  "Logical warp size must be a power of two.");
269  static_assert(WarpSize <= ::rocprim::device_warp_size(),
270  "Logical warp size cannot be larger than physical warp size.");
271 
272 public:
273  using storage_type = typename ::rocprim::detail::empty_storage_type;
274 
275  ROCPRIM_DEVICE ROCPRIM_INLINE
276  void store(T* output,
277  T (&items)[ItemsPerThread],
278  storage_type& /*storage*/)
279  {
280  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
281  block_store_direct_blocked_vectorized(flat_id, output, items);
282  }
283 
284  template<class OutputIterator>
285  ROCPRIM_DEVICE ROCPRIM_INLINE
286  void store(OutputIterator output,
287  T (&items)[ItemsPerThread],
288  storage_type& /*storage*/)
289  {
290  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
291  static_assert(std::is_convertible<T, value_type>::value,
292  "The type T must be such that an object of type OutputIterator "
293  "can be dereferenced and then implicitly assigned from T.");
294  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
295  block_store_direct_blocked(flat_id, output, items);
296  }
297 
298  template<class OutputIterator>
299  ROCPRIM_DEVICE ROCPRIM_INLINE
300  void store(OutputIterator output,
301  T (&items)[ItemsPerThread],
302  unsigned int valid,
303  storage_type& /*storage*/)
304  {
305  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
306  static_assert(std::is_convertible<T, value_type>::value,
307  "The type T must be such that an object of type OutputIterator "
308  "can be dereferenced and then implicitly assigned from T.");
309  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
310  block_store_direct_blocked(flat_id, output, items, valid);
311  }
312 };
313 
314 template<
315  class T,
316  unsigned int ItemsPerThread,
317  unsigned int WarpSize
318 >
319 class warp_store<T, ItemsPerThread, WarpSize, warp_store_method::warp_store_transpose>
320 {
321  static_assert(::rocprim::detail::is_power_of_two(WarpSize),
322  "Logical warp size must be a power of two.");
323  static_assert(WarpSize <= ::rocprim::device_warp_size(),
324  "Logical warp size cannot be larger than physical warp size.");
325 
326 private:
327  using exchange_type = ::rocprim::warp_exchange<T, ItemsPerThread, WarpSize>;
328 
329 public:
330  using storage_type = typename exchange_type::storage_type;
331 
332  template<class OutputIterator>
333  ROCPRIM_DEVICE ROCPRIM_INLINE
334  void store(OutputIterator output,
335  T (&items)[ItemsPerThread],
336  storage_type& storage)
337  {
338  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
339  static_assert(std::is_convertible<T, value_type>::value,
340  "The type T must be such that an object of type OutputIterator "
341  "can be dereferenced and then implicitly assigned from T.");
342  exchange_type().blocked_to_striped(items, items, storage);
343  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
344  block_store_direct_warp_striped<WarpSize>(flat_id, output, items);
345  }
346 
347  template<class OutputIterator>
348  ROCPRIM_DEVICE ROCPRIM_INLINE
349  void store(OutputIterator output,
350  T (&items)[ItemsPerThread],
351  unsigned int valid,
352  storage_type& storage)
353  {
354  using value_type = typename std::iterator_traits<OutputIterator>::value_type;
355  static_assert(std::is_convertible<T, value_type>::value,
356  "The type T must be such that an object of type OutputIterator "
357  "can be dereferenced and then implicitly assigned from T.");
358  exchange_type().blocked_to_striped(items, items, storage);
359  const unsigned int flat_id = ::rocprim::detail::logical_lane_id<WarpSize>();
360  block_store_direct_warp_striped<WarpSize>(flat_id, output, items, valid);
361  }
362 };
363 
364 #endif // DOXYGEN_SHOULD_SKIP_THIS
365 
366 END_ROCPRIM_NAMESPACE
367 
369 // end of group warpmodule
370 
371 #endif // ROCPRIM_WARP_WARP_STORE_HPP_
typename ::rocprim::detail::empty_storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: warp_store.hpp:144
ROCPRIM_DEVICE ROCPRIM_INLINE auto block_store_direct_blocked_vectorized(unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) -> typename std::enable_if< detail::is_vectorizable< T, ItemsPerThread >::value >::type
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on cont...
Definition: block_store_func.hpp:151
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
warp_store_method
warp_store_method enumerates the methods available to store a blocked/striped arrangement of items in...
Definition: warp_store.hpp:38
Defaults to warp_store_direct.
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator output, T(&items)[ItemsPerThread], storage_type &)
Stores an arrangement of items from across the warp into an arrangement on continuous memory...
Definition: warp_store.hpp:167
A blocked arrangement of items is locally transposed and stored as a striped arrangement of data on c...
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator output, T(&items)[ItemsPerThread], unsigned int valid, storage_type &)
Stores an arrangement of items from across the warp into an arrangement on continuous memory...
Definition: warp_store.hpp:199
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread])
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on cont...
Definition: block_store_func.hpp:58
A blocked arrangement of items is stored into a blocked arrangement on continuous memory using vector...
A blocked arrangement of items is stored into a blocked arrangement on continuous memory...
The warp_store class is a warp level parallel primitive which provides methods for storing an arrange...
Definition: warp_store.hpp:124
A striped arrangement of items is stored into a blocked arrangement on continuous memory...