|
template<class InputIterator > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread]) |
| Loads data from continuous memory into an arrangement of items across the thread block. More...
|
|
template<class InputIterator > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid) |
| Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid. More...
|
|
template<class InputIterator , class Default > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) |
| Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements. More...
|
|
template<class InputIterator > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread], storage_type &storage) |
| Loads data from continuous memory into an arrangement of items across the thread block, using temporary storage. More...
|
|
template<class InputIterator > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, storage_type &storage) |
| Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid, using temporary storage. More...
|
|
template<class InputIterator , class Default > |
ROCPRIM_DEVICE ROCPRIM_INLINE void | load (InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds, storage_type &storage) |
| Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements, using temporary storage. More...
|
|
template<class T, unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
class block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >
The block_load
class is a block-level parallel primitive which provides methods for loading data from continuous memory into a blocked arrangement of items across the thread block.
- Template Parameters
-
T | - the input/output type. |
BlockSizeX | - the number of threads in the block's x dimension (BlockSizeY and BlockSizeZ default to 1). |
ItemsPerThread | - the number of items to be processed by each thread. |
Method | - the method to load data. |
- Overview
- The
block_load
class has a number of different methods to load data:
- Example:
In the example, the load operation is performed on a block of 128 threads, using type int
and 8 items per thread.
__global__ void example_kernel(int * input, ...)
{
const int offset = blockIdx.x * 128 * 8;
int items[8];
rocprim::block_load<int, 128, 8, load_method> blockload;
blockload.load(input + offset, items);
...
}
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
using block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::storage_type = typename ::rocprim::detail::empty_storage_type |
Struct used to allocate temporary memory that is required for thread communication during operations provided by the related parallel primitive.
Depending on the implementation, the operations exposed by the parallel primitive may require temporary storage for thread communication. The storage should be allocated using the keyword __shared__
. It can be aliased to an externally allocated memory, or be a part of a union with other storage types to increase shared memory reusability.
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread] |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread], |
|
|
unsigned int |
valid |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid
.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
[in] | valid | - maximum range of valid numbers to load. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator , class Default >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread], |
|
|
unsigned int |
valid, |
|
|
Default |
out_of_bounds |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
Default | - [inferred] The data type of the default value. |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
[in] | valid | - maximum range of valid numbers to load. |
[in] | out_of_bounds | - default value assigned to out-of-bound items. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread], |
|
|
storage_type & |
storage |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block, using temporary storage.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
[in] | storage | - temporary storage for inputs. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
- Storage reuse
- A synchronization barrier should be placed before
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.
- Example.
__global__ void example_kernel(...)
{
int items[8];
using block_load_int = rocprim::block_load<int, 128, 8>;
block_load_int bload;
__shared__ typename block_load_int::storage_type storage;
bload.load(..., items, storage);
...
}
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread], |
|
|
unsigned int |
valid, |
|
|
storage_type & |
storage |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range valid
, using temporary storage.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
[in] | valid | - maximum range of valid numbers to load. |
[in] | storage | - temporary storage for inputs. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
- Storage reuse
- A synchronization barrier should be placed before
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.
- Example.
__global__ void example_kernel(...)
{
int items[8];
using block_load_int = rocprim::block_load<int, 128, 8>;
block_load_int bload;
__shared__ typename block_load_int::storage_type storage;
bload.load(..., items, valid, storage);
...
}
template<class T , unsigned int BlockSizeX, unsigned int ItemsPerThread, block_load_method Method = block_load_method::block_load_direct, unsigned int BlockSizeY = 1, unsigned int BlockSizeZ = 1>
template<class InputIterator , class Default >
ROCPRIM_DEVICE ROCPRIM_INLINE void block_load< T, BlockSizeX, ItemsPerThread, Method, BlockSizeY, BlockSizeZ >::load |
( |
InputIterator |
block_input, |
|
|
T(&) |
items[ItemsPerThread], |
|
|
unsigned int |
valid, |
|
|
Default |
out_of_bounds, |
|
|
storage_type & |
storage |
|
) |
| |
|
inline |
Loads data from continuous memory into an arrangement of items across the thread block, which is guarded by range with a fall-back value for out-of-bound elements, using temporary storage.
- Template Parameters
-
InputIterator | - [inferred] an iterator type for input (can be a simple pointer). |
Default | - [inferred] The data type of the default value. |
- Parameters
-
[in] | block_input | - the input iterator from the thread block to load from. |
[out] | items | - array that data is loaded to. |
[in] | valid | - maximum range of valid numbers to load. |
[in] | out_of_bounds | - default value assigned to out-of-bound items. |
[in] | storage | - temporary storage for inputs. |
- Overview
- The type
T
must be such that an object of type InputIterator
can be dereferenced and then implicitly converted to T
.
- Storage reuse
- A synchronization barrier should be placed before
storage
is reused or repurposed: __syncthreads()
or rocprim::syncthreads()
.
- Example.
__global__ void example_kernel(...)
{
int items[8];
using block_load_int = rocprim::block_load<int, 128, 8>;
block_load_int bload;
__shared__ typename block_load_int::storage_type storage;
bload.load(..., items, valid, out_of_bounds, storage);
...
}