23 #ifndef HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP 24 #define HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP 26 #include "test_utils.hpp" 29 #include "hipcub/block/block_load.hpp" 30 #include "hipcub/block/block_store.hpp" 33 hipcub::BlockLoadAlgorithm Load,
34 hipcub::BlockStoreAlgorithm Store,
35 unsigned int BlockSize,
36 unsigned int ItemsPerThread>
40 static constexpr hipcub::BlockLoadAlgorithm load_method = Load;
41 static constexpr hipcub::BlockStoreAlgorithm store_method = Store;
42 static constexpr
unsigned int block_size = BlockSize;
43 static constexpr
unsigned int items_per_thread = ItemsPerThread;
46 #define class_param_items(load_algo, store_algo, type, block_size) \ 47 class_params<type, load_algo, store_algo, block_size, 1>, \ 48 class_params<type, load_algo, store_algo, block_size, 4> 50 #define class_param_block_size(load_algo, store_algo, type) \ 51 class_param_items(load_algo, store_algo, type, 64U), \ 52 class_param_items(load_algo, store_algo, type, 256U) 54 #define class_param_block_size_512(load_algo, store_algo, type) \ 55 class_param_block_size(load_algo, store_algo, type), \ 56 class_param_items(load_algo, store_algo, type, 512U) 58 #define class_param_type(load_algo, store_algo) \ 59 class_param_block_size_512(load_algo, store_algo, int), \ 60 class_param_block_size_512(load_algo, store_algo, double), \ 61 class_param_block_size_512(load_algo, store_algo, test_utils::half), \ 62 class_param_block_size_512(load_algo, store_algo, test_utils::bfloat16), \ 63 class_param_block_size(load_algo, store_algo, test_utils::custom_test_type<int>), \ 64 class_param_block_size(load_algo, store_algo, test_utils::custom_test_type<double>) 66 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
67 hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT)>
68 LoadStoreParamsDirect;
70 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
71 hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED)>
72 LoadStoreParamsStriped;
74 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
75 hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE)>
76 LoadStoreParamsVectorize;
78 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE,
79 hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE)>
80 LoadStoreParamsTranspose;
83 hipcub::BlockLoadAlgorithm LoadMethod,
84 hipcub::BlockStoreAlgorithm StoreMethod,
85 unsigned int BlockSize,
86 unsigned int ItemsPerThread>
87 __global__ __launch_bounds__(BlockSize)
void load_store_kernel(Type * device_input,
90 Type items[ItemsPerThread];
91 unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
92 hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
93 hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
94 load.Load(device_input + offset, items);
95 store.Store(device_output + offset, items);
99 hipcub::BlockLoadAlgorithm LoadMethod,
100 hipcub::BlockStoreAlgorithm StoreMethod,
101 unsigned int BlockSize,
102 unsigned int ItemsPerThread>
103 __global__ __launch_bounds__(BlockSize)
void load_store_valid_kernel(Type * device_input,
104 Type * device_output,
107 Type items[ItemsPerThread];
108 unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
109 hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
110 hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
111 load.Load(device_input + offset, items, valid);
112 store.Store(device_output + offset, items, valid);
116 hipcub::BlockLoadAlgorithm LoadMethod,
117 hipcub::BlockStoreAlgorithm StoreMethod,
118 unsigned int BlockSize,
119 unsigned int ItemsPerThread>
120 __global__ __launch_bounds__(BlockSize)
void load_store_valid_default_kernel(Type* device_input,
125 Type items[ItemsPerThread];
126 unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
127 hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
128 hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
129 load.Load(device_input + offset, items, valid, _default);
130 store.Store(device_output + offset, items);
133 template<
typename InputIteratorT,
134 typename OutputIteratorT,
135 hipcub::BlockLoadAlgorithm LoadMethod,
136 hipcub::BlockStoreAlgorithm StoreMethod,
137 unsigned int BlockSize,
138 unsigned int ItemsPerThread>
139 __launch_bounds__(BlockSize) __global__
140 void load_store_guarded_kernel(InputIteratorT d_in,
141 OutputIteratorT d_out_unguarded,
142 OutputIteratorT d_out_guarded,
147 TileSize = BlockSize * ItemsPerThread
151 typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
154 typedef typename std::conditional<
155 (std::is_same<typename std::iterator_traits<OutputIteratorT>::value_type,
157 typename std::iterator_traits<InputIteratorT>::value_type,
159 typename std::iterator_traits<OutputIteratorT>::value_type>::type
163 typedef hipcub::BlockLoad<InputT, BlockSize, ItemsPerThread, LoadMethod>
BlockLoad;
164 typedef hipcub::BlockStore<OutputT, BlockSize, ItemsPerThread, StoreMethod>
BlockStore;
169 typename BlockLoad::TempStorage load;
170 typename BlockStore::TempStorage store;
177 int block_offset = blockIdx.x * TileSize;
178 int guarded_elements = max(num_items - block_offset, 0);
181 OutputT data[ItemsPerThread];
184 BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
189 BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
195 for(
unsigned int item = 0; item < ItemsPerThread; ++item)
196 data[item] = OutputT();
201 BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
206 BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
209 #endif // HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP Definition: test_hipcub_block_load_store.kernels.hpp:167
Definition: test_hipcub_block_load_store.kernels.hpp:37
Definition: block_load.hpp:79
Definition: block_store.hpp:79
Definition: benchmark_block_shuffle.cpp:41