rocPRIM
Classes | Public Types | Public Member Functions | List of all members
block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ > Class Template Reference

The block_exchange class is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block. More...

#include <block_exchange.hpp>

Public Types

using storage_type = detail::raw_storage< storage_type_ >
 Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive. More...
 

Public Member Functions

template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
 Transposes a blocked arrangement of items to a striped arrangement across the thread block. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
 Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
 Transposes a striped arrangement of items to a blocked arrangement across the thread block. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
 Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void blocked_to_warp_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
 Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void blocked_to_warp_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
 Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void warp_striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread])
 Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block. More...
 
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void warp_striped_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], storage_type &storage)
 Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
 Scatters items to a blocked arrangement based on their ranks across the thread block. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void gather_from_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
 Gathers items from a striped arrangement based on their ranks across the thread block. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_blocked (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
 Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void gather_from_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
 Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
 Scatters items to a striped arrangement based on their ranks across the thread block. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
 Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_guarded (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread])
 Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank. More...
 
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_guarded (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], storage_type &storage)
 Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage. More...
 
template<class U , class Offset , class ValidFlag >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void scatter_to_striped_flagged (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread])
 Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity. More...
 
template<class U , class Offset , class ValidFlag >
ROCPRIM_DEVICE ROCPRIM_INLINE void scatter_to_striped_flagged (const T(&input)[ItemsPerThread], U(&output)[ItemsPerThread], const Offset(&ranks)[ItemsPerThread], const ValidFlag(&is_valid)[ItemsPerThread], storage_type &storage)
 Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage. More...
 

Detailed Description

template<class T, unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
class block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >

The block_exchange class is a block level parallel primitive which provides methods for rearranging items partitioned across threads in a block.

Template Parameters
T- the input type.
BlockSize- the number of threads in a block.
ItemsPerThread- the number of items contributed by each thread.
Overview
  • The block_exchange class supports the following rearrangement methods:
    • Transposing a blocked arrangement to a striped arrangement.
    • Transposing a striped arrangement to a blocked arrangement.
    • Transposing a blocked arrangement to a warp-striped arrangement.
    • Transposing a warp-striped arrangement to a blocked arrangement.
    • Scattering items to a blocked arrangement.
    • Scattering items to a striped arrangement.
  • Data is automatically be padded to ensure zero bank conflicts.
Examples

In the examples exchange operation is performed on block of 128 threads, using type int with 8 items per thread.

__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
...
block_exchange_int b_exchange;
b_exchange.blocked_to_striped(items, items, storage);
...
}

Member Typedef Documentation

◆ storage_type

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
using block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::storage_type = detail::raw_storage<storage_type_>

Struct used to allocate a temporary memory that is required for thread communication during operations provided by related parallel primitive.

Depending on the implemention the operations exposed by parallel primitive may require a temporary storage for thread communication. The storage should be allocated using keywords __shared__. It can be aliased to an externally allocated memory, or be a part of a union type with other storage types to increase shared memory reusability.

Member Function Documentation

◆ blocked_to_striped() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::blocked_to_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread] 
)
inline

Transposes a blocked arrangement of items to a striped arrangement across the thread block.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.

◆ blocked_to_striped() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::blocked_to_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
storage_type storage 
)
inline

Transposes a blocked arrangement of items to a striped arrangement across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
...
block_exchange_int b_exchange;
b_exchange.blocked_to_striped(items, items, storage);
...
}

◆ blocked_to_warp_striped() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::blocked_to_warp_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread] 
)
inline

Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.

◆ blocked_to_warp_striped() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::blocked_to_warp_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
storage_type storage 
)
inline

Transposes a blocked arrangement of items to a warp-striped arrangement across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
...
block_exchange_int b_exchange;
b_exchange.blocked_to_warp_striped(items, items, storage);
...
}

◆ gather_from_striped() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::gather_from_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread] 
)
inline

Gathers items from a striped arrangement based on their ranks across the thread block.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.

◆ gather_from_striped() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::gather_from_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
storage_type storage 
)
inline

Gathers items from a striped arrangement based on their ranks across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.
[in]storage- reference to a temporary storage object of type storage_type.

◆ scatter_to_blocked() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread] 
)
inline

Scatters items to a blocked arrangement based on their ranks across the thread block.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.

◆ scatter_to_blocked() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
storage_type storage 
)
inline

Scatters items to a blocked arrangement based on their ranks across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
int ranks[8];
...
block_exchange_int b_exchange;
b_exchange.scatter_to_blocked(items, items, ranks, storage);
...
}

◆ scatter_to_striped() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread] 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.

◆ scatter_to_striped() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
storage_type storage 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[out]ranks- array that has rank of data.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
int ranks[8];
...
block_exchange_int b_exchange;
b_exchange.scatter_to_striped(items, items, ranks, storage);
...
}

◆ scatter_to_striped_flagged() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset , class ValidFlag >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped_flagged ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
const ValidFlag(&)  is_valid[ItemsPerThread] 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
ValidFlag- [inferred] the validity flag type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]ranks- array that has rank of data.
[in]is_valid- array that has flags to denote validity.

◆ scatter_to_striped_flagged() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset , class ValidFlag >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped_flagged ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
const ValidFlag(&)  is_valid[ItemsPerThread],
storage_type storage 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block, with a flag to denote validity, using temporary storage.

Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
ValidFlag- [inferred] the validity flag type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]ranks- array that has rank of data.
[in]is_valid- array that has flags to denote validity.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
int ranks[8];
int flags[8];
...
block_exchange_int b_exchange;
b_exchange.scatter_to_striped_flagged(items, items, ranks, flags, storage);
...
}

◆ scatter_to_striped_guarded() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped_guarded ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread] 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank.

Overview
  • Items with rank -1 are not scattered.
Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]ranks- array that has rank of data.

◆ scatter_to_striped_guarded() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U , class Offset >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::scatter_to_striped_guarded ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
const Offset(&)  ranks[ItemsPerThread],
storage_type storage 
)
inline

Scatters items to a striped arrangement based on their ranks across the thread block, guarded by rank, using temporary storage.

Overview
  • Items with rank -1 are not scattered.
Template Parameters
U- [inferred] the output type.
Offset- [inferred] the rank type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]ranks- array that has rank of data.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
int ranks[8];
...
block_exchange_int b_exchange;
b_exchange.scatter_to_striped_guarded(items, items, ranks, storage);
...
}

◆ striped_to_blocked() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::striped_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread] 
)
inline

Transposes a striped arrangement of items to a blocked arrangement across the thread block.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.

◆ striped_to_blocked() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::striped_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
storage_type storage 
)
inline

Transposes a striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
...
block_exchange_int b_exchange;
b_exchange.striped_to_blocked(items, items, storage);
...
}

◆ warp_striped_to_blocked() [1/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::warp_striped_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread] 
)
inline

Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.

◆ warp_striped_to_blocked() [2/2]

template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class U >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_exchange< T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ >::warp_striped_to_blocked ( const T(&)  input[ItemsPerThread],
U(&)  output[ItemsPerThread],
storage_type storage 
)
inline

Transposes a warp-striped arrangement of items to a blocked arrangement across the thread block, using temporary storage.

Template Parameters
U- [inferred] the output type.
Parameters
[in]input- array that data is loaded from.
[out]output- array that data is loaded to.
[in]storage- reference to a temporary storage object of type storage_type.
Storage reusage
Synchronization barrier should be placed before storage is reused or repurposed: __syncthreads() or rocprim::syncthreads().
Example.
__global__ void example_kernel(...)
{
// specialize block_exchange for int, block of 128 threads and 8 items per thread
using block_exchange_int = rocprim::block_exchange<int, 128, 8>;
// allocate storage in shared memory
__shared__ block_exchange_int::storage_type storage;
int items[8];
...
block_exchange_int b_exchange;
b_exchange.warp_striped_to_blocked(items, items, storage);
...
}

The documentation for this class was generated from the following file: