21 #ifndef ROCPRIM_BLOCK_BLOCK_STORE_HPP_ 22 #define ROCPRIM_BLOCK_BLOCK_STORE_HPP_ 24 #include "../config.hpp" 25 #include "../detail/various.hpp" 27 #include "../intrinsics.hpp" 28 #include "../functional.hpp" 29 #include "../types.hpp" 31 #include "block_store_func.hpp" 32 #include "block_exchange.hpp" 37 BEGIN_ROCPRIM_NAMESPACE
128 unsigned int BlockSizeX,
129 unsigned int ItemsPerThread,
131 unsigned int BlockSizeY = 1,
132 unsigned int BlockSizeZ = 1
137 using storage_type_ = typename ::rocprim::detail::empty_storage_type;
148 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 166 template<
class OutputIterator>
167 ROCPRIM_DEVICE ROCPRIM_INLINE
168 void store(OutputIterator block_output,
169 T (&items)[ItemsPerThread])
171 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
188 template<
class OutputIterator>
189 ROCPRIM_DEVICE ROCPRIM_INLINE
190 void store(OutputIterator block_output,
191 T (&items)[ItemsPerThread],
194 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
228 template<
class OutputIterator>
229 ROCPRIM_DEVICE ROCPRIM_INLINE
230 void store(OutputIterator block_output,
231 T (&items)[ItemsPerThread],
235 store(block_output, items);
270 template<
class OutputIterator>
271 ROCPRIM_DEVICE ROCPRIM_INLINE
272 void store(OutputIterator block_output,
273 T (&items)[ItemsPerThread],
278 store(block_output, items, valid);
282 #ifndef DOXYGEN_SHOULD_SKIP_THIS 286 unsigned int BlockSizeX,
287 unsigned int ItemsPerThread,
288 unsigned int BlockSizeY,
289 unsigned int BlockSizeZ
293 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
295 using storage_type_ = typename ::rocprim::detail::empty_storage_type;
298 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 299 using storage_type = typename ::rocprim::detail::empty_storage_type;
301 using storage_type = storage_type_;
304 template<
class OutputIterator>
305 ROCPRIM_DEVICE
inline 306 void store(OutputIterator block_output,
307 T (&items)[ItemsPerThread])
309 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
310 block_store_direct_striped<BlockSize>(flat_id, block_output, items);
313 template<
class OutputIterator>
314 ROCPRIM_DEVICE
inline 315 void store(OutputIterator block_output,
316 T (&items)[ItemsPerThread],
319 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
320 block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
323 template<
class OutputIterator>
324 ROCPRIM_DEVICE
inline 325 void store(OutputIterator block_output,
326 T (&items)[ItemsPerThread],
327 storage_type& storage)
330 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
331 block_store_direct_striped<BlockSize>(flat_id, block_output, items);
334 template<
class OutputIterator>
335 ROCPRIM_DEVICE
inline 336 void store(OutputIterator block_output,
337 T (&items)[ItemsPerThread],
339 storage_type& storage)
342 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
343 block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
349 unsigned int BlockSizeX,
350 unsigned int ItemsPerThread,
351 unsigned int BlockSizeY,
352 unsigned int BlockSizeZ
357 using storage_type_ = typename ::rocprim::detail::empty_storage_type;
360 #ifndef DOXYGEN_SHOULD_SKIP_THIS // hides storage_type implementation for Doxygen 361 using storage_type = typename ::rocprim::detail::empty_storage_type;
363 using storage_type = storage_type_;
366 ROCPRIM_DEVICE ROCPRIM_INLINE
367 void store(T* block_output,
368 T (&_items)[ItemsPerThread])
370 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
374 template<
class OutputIterator,
class U>
375 ROCPRIM_DEVICE ROCPRIM_INLINE
376 void store(OutputIterator block_output,
377 U (&items)[ItemsPerThread])
379 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
383 template<
class OutputIterator>
384 ROCPRIM_DEVICE ROCPRIM_INLINE
385 void store(OutputIterator block_output,
386 T (&items)[ItemsPerThread],
389 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
393 ROCPRIM_DEVICE ROCPRIM_INLINE
394 void store(T* block_output,
395 T (&items)[ItemsPerThread],
396 storage_type& storage)
399 store(block_output, items);
402 template<
class OutputIterator,
class U>
403 ROCPRIM_DEVICE ROCPRIM_INLINE
404 void store(OutputIterator block_output,
405 U (&items)[ItemsPerThread],
406 storage_type& storage)
409 store(block_output, items);
412 template<
class OutputIterator>
413 ROCPRIM_DEVICE ROCPRIM_INLINE
414 void store(OutputIterator block_output,
415 T (&items)[ItemsPerThread],
417 storage_type& storage)
420 store(block_output, items, valid);
426 unsigned int BlockSizeX,
427 unsigned int ItemsPerThread,
428 unsigned int BlockSizeY,
429 unsigned int BlockSizeZ
433 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
440 template<
class OutputIterator>
441 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
442 void store(OutputIterator block_output,
443 T (&items)[ItemsPerThread])
445 ROCPRIM_SHARED_MEMORY storage_type storage;
446 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
448 block_store_direct_striped<BlockSize>(flat_id, block_output, items);
451 template<
class OutputIterator>
452 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
453 void store(OutputIterator block_output,
454 T (&items)[ItemsPerThread],
457 ROCPRIM_SHARED_MEMORY storage_type storage;
458 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
460 block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
463 template<
class OutputIterator>
464 ROCPRIM_DEVICE ROCPRIM_INLINE
465 void store(OutputIterator block_output,
466 T (&items)[ItemsPerThread],
467 storage_type& storage)
469 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
471 block_store_direct_striped<BlockSize>(flat_id, block_output, items);
474 template<
class OutputIterator>
475 ROCPRIM_DEVICE ROCPRIM_INLINE
476 void store(OutputIterator block_output,
477 T (&items)[ItemsPerThread],
479 storage_type& storage)
481 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
483 block_store_direct_striped<BlockSize>(flat_id, block_output, items, valid);
489 unsigned int BlockSizeX,
490 unsigned int ItemsPerThread,
491 unsigned int BlockSizeY,
492 unsigned int BlockSizeZ
496 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
502 "BlockSize must be a multiple of hardware warpsize");
506 template<
class OutputIterator>
507 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
508 void store(OutputIterator block_output,
509 T (&items)[ItemsPerThread])
511 ROCPRIM_SHARED_MEMORY storage_type storage;
512 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
517 template<
class OutputIterator>
518 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
519 void store(OutputIterator block_output,
520 T (&items)[ItemsPerThread],
523 ROCPRIM_SHARED_MEMORY storage_type storage;
524 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
529 template<
class OutputIterator>
530 ROCPRIM_DEVICE ROCPRIM_INLINE
531 void store(OutputIterator block_output,
532 T (&items)[ItemsPerThread],
533 storage_type& storage)
535 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
540 template<
class OutputIterator>
541 ROCPRIM_DEVICE ROCPRIM_INLINE
542 void store(OutputIterator block_output,
543 T (&items)[ItemsPerThread],
545 storage_type& storage)
547 const unsigned int flat_id = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
553 #endif // DOXYGEN_SHOULD_SKIP_THIS 555 END_ROCPRIM_NAMESPACE
560 #endif // ROCPRIM_BLOCK_BLOCK_STORE_HPP_ ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_warp_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block...
Definition: block_exchange.hpp:262
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread])
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:168
A blocked arrangement of items is locally transposed and stored as a warp-striped arrangement of data on continuous memory.
block_store_method
block_store_method enumerates the methods available to store a striped arrangement of items into a blocked/striped arrangement on continuous memory.
Definition: block_store.hpp:41
A blocked arrangement of items is stored into a blocked arrangement on continuous memory using vectorization.
The block_store class is a block level parallel primitive which provides methods for storing an arrangement of items into a blocked/striped arrangement on continuous memory.
Definition: block_store.hpp:134
ROCPRIM_DEVICE ROCPRIM_INLINE auto block_store_direct_blocked_vectorized(unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) -> typename std::enable_if< detail::is_vectorizable< T, ItemsPerThread >::value >::type
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory.
Definition: block_store_func.hpp:151
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid, storage_type &storage)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:272
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70
The block_exchange class is a block level parallel primitive which provides methods for rearranging i...
Definition: block_exchange.hpp:81
Defaults to block_store_direct.
A striped arrangement of items is stored into a blocked arrangement on continuous memory...
A blocked arrangement of items is stored into a blocked arrangement on continuous memory...
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:190
ROCPRIM_DEVICE ROCPRIM_INLINE void store(OutputIterator block_output, T(&items)[ItemsPerThread], storage_type &storage)
Stores an arrangement of items from across the thread block into an arrangement on continuous memory...
Definition: block_store.hpp:230
typename ::rocprim::detail::empty_storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: block_store.hpp:149
BEGIN_ROCPRIM_NAMESPACE ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_blocked(unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread])
Stores a blocked arrangement of items from across the thread block into a blocked arrangement on continuous memory.
Definition: block_store_func.hpp:58
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_striped(const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
Transposes a blocked arrangement of items to a striped arrangement across the thread block...
Definition: block_exchange.hpp:130
ROCPRIM_DEVICE ROCPRIM_INLINE void block_store_direct_warp_striped(unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread])
Stores a warp-striped arrangement of items from across the thread block into a blocked arrangement on continuous memory.
Definition: block_store_func.hpp:306
A blocked arrangement of items is locally transposed and stored as a striped arrangement of data on continuous memory.