hipCUB
test_hipcub_block_load_store.kernels.hpp
1 // MIT License
2 //
3 // Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved.
4 //
5 // Permission is hereby granted, free of charge, to any person obtaining a copy
6 // of this software and associated documentation files (the "Software"), to deal
7 // in the Software without restriction, including without limitation the rights
8 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 // copies of the Software, and to permit persons to whom the Software is
10 // furnished to do so, subject to the following conditions:
11 //
12 // The above copyright notice and this permission notice shall be included in
13 // all copies or substantial portions of the Software.
14 //
15 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 // THE SOFTWARE.
22 
23 #ifndef HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP
24 #define HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP
25 
26 #include "test_utils.hpp"
27 
28 // hipcub API
29 #include "hipcub/block/block_load.hpp"
30 #include "hipcub/block/block_store.hpp"
31 
32 template<class Type,
33  hipcub::BlockLoadAlgorithm Load,
34  hipcub::BlockStoreAlgorithm Store,
35  unsigned int BlockSize,
36  unsigned int ItemsPerThread>
38 {
39  using type = Type;
40  static constexpr hipcub::BlockLoadAlgorithm load_method = Load;
41  static constexpr hipcub::BlockStoreAlgorithm store_method = Store;
42  static constexpr unsigned int block_size = BlockSize;
43  static constexpr unsigned int items_per_thread = ItemsPerThread;
44 };
45 
46 #define class_param_items(load_algo, store_algo, type, block_size) \
47  class_params<type, load_algo, store_algo, block_size, 1>, \
48  class_params<type, load_algo, store_algo, block_size, 4>
49 
50 #define class_param_block_size(load_algo, store_algo, type) \
51  class_param_items(load_algo, store_algo, type, 64U), \
52  class_param_items(load_algo, store_algo, type, 256U)
53 
54 #define class_param_block_size_512(load_algo, store_algo, type) \
55  class_param_block_size(load_algo, store_algo, type), \
56  class_param_items(load_algo, store_algo, type, 512U)
57 
58 #define class_param_type(load_algo, store_algo) \
59  class_param_block_size_512(load_algo, store_algo, int), \
60  class_param_block_size_512(load_algo, store_algo, double), \
61  class_param_block_size_512(load_algo, store_algo, test_utils::half), \
62  class_param_block_size_512(load_algo, store_algo, test_utils::bfloat16), \
63  class_param_block_size(load_algo, store_algo, test_utils::custom_test_type<int>), \
64  class_param_block_size(load_algo, store_algo, test_utils::custom_test_type<double>)
65 
66 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_DIRECT,
67  hipcub::BlockStoreAlgorithm::BLOCK_STORE_DIRECT)>
68  LoadStoreParamsDirect;
69 
70 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_STRIPED,
71  hipcub::BlockStoreAlgorithm::BLOCK_STORE_STRIPED)>
72  LoadStoreParamsStriped;
73 
74 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_VECTORIZE,
75  hipcub::BlockStoreAlgorithm::BLOCK_STORE_VECTORIZE)>
76  LoadStoreParamsVectorize;
77 
78 typedef ::testing::Types<class_param_type(hipcub::BlockLoadAlgorithm::BLOCK_LOAD_TRANSPOSE,
79  hipcub::BlockStoreAlgorithm::BLOCK_STORE_TRANSPOSE)>
80  LoadStoreParamsTranspose;
81 
82 template<class Type,
83  hipcub::BlockLoadAlgorithm LoadMethod,
84  hipcub::BlockStoreAlgorithm StoreMethod,
85  unsigned int BlockSize,
86  unsigned int ItemsPerThread>
87 __global__ __launch_bounds__(BlockSize) void load_store_kernel(Type * device_input,
88  Type * device_output)
89 {
90  Type items[ItemsPerThread];
91  unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
92  hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
93  hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
94  load.Load(device_input + offset, items);
95  store.Store(device_output + offset, items);
96 }
97 
98 template<class Type,
99  hipcub::BlockLoadAlgorithm LoadMethod,
100  hipcub::BlockStoreAlgorithm StoreMethod,
101  unsigned int BlockSize,
102  unsigned int ItemsPerThread>
103 __global__ __launch_bounds__(BlockSize) void load_store_valid_kernel(Type * device_input,
104  Type * device_output,
105  size_t valid)
106 {
107  Type items[ItemsPerThread];
108  unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
109  hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
110  hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
111  load.Load(device_input + offset, items, valid);
112  store.Store(device_output + offset, items, valid);
113 }
114 
115 template<class Type,
116  hipcub::BlockLoadAlgorithm LoadMethod,
117  hipcub::BlockStoreAlgorithm StoreMethod,
118  unsigned int BlockSize,
119  unsigned int ItemsPerThread>
120 __global__ __launch_bounds__(BlockSize) void load_store_valid_default_kernel(Type* device_input,
121  Type* device_output,
122  size_t valid,
123  Type _default)
124 {
125  Type items[ItemsPerThread];
126  unsigned int offset = hipBlockIdx_x * BlockSize * ItemsPerThread;
127  hipcub::BlockLoad<Type, BlockSize, ItemsPerThread, LoadMethod> load;
128  hipcub::BlockStore<Type, BlockSize, ItemsPerThread, StoreMethod> store;
129  load.Load(device_input + offset, items, valid, _default);
130  store.Store(device_output + offset, items);
131 }
132 
133 template<typename InputIteratorT,
134  typename OutputIteratorT,
135  hipcub::BlockLoadAlgorithm LoadMethod,
136  hipcub::BlockStoreAlgorithm StoreMethod,
137  unsigned int BlockSize,
138  unsigned int ItemsPerThread>
139 __launch_bounds__(BlockSize) __global__
140  void load_store_guarded_kernel(InputIteratorT d_in,
141  OutputIteratorT d_out_unguarded,
142  OutputIteratorT d_out_guarded,
143  int num_items)
144 {
145  enum
146  {
147  TileSize = BlockSize * ItemsPerThread
148  };
149 
150  // The input value type
151  typedef typename std::iterator_traits<InputIteratorT>::value_type InputT;
152 
153  // The output value type
154  typedef typename std::conditional<
155  (std::is_same<typename std::iterator_traits<OutputIteratorT>::value_type,
156  void>::value), // OutputT = (if output iterator's value type is void) ?
157  typename std::iterator_traits<InputIteratorT>::value_type, // ... then the input iterator's
158  // value type,
159  typename std::iterator_traits<OutputIteratorT>::value_type>::type
160  OutputT; // ... else the output iterator's value type
161 
162  // Threadblock load/store abstraction types
163  typedef hipcub::BlockLoad<InputT, BlockSize, ItemsPerThread, LoadMethod> BlockLoad;
164  typedef hipcub::BlockStore<OutputT, BlockSize, ItemsPerThread, StoreMethod> BlockStore;
165 
166  // Shared memory type for this thread block
168  {
169  typename BlockLoad::TempStorage load;
170  typename BlockStore::TempStorage store;
171  };
172 
173  // Allocate temp storage in shared memory
174  __shared__ TempStorage temp_storage;
175 
176  // Threadblock work bounds
177  int block_offset = blockIdx.x * TileSize;
178  int guarded_elements = max(num_items - block_offset, 0);
179 
180  // Tile of items
181  OutputT data[ItemsPerThread];
182 
183  // Load data
184  BlockLoad(temp_storage.load).Load(d_in + block_offset, data);
185 
186  __syncthreads();
187 
188  // Store data
189  BlockStore(temp_storage.store).Store(d_out_unguarded + block_offset, data);
190 
191  __syncthreads();
192 
193  // reset data
194 #pragma unroll
195  for(unsigned int item = 0; item < ItemsPerThread; ++item)
196  data[item] = OutputT();
197 
198  __syncthreads();
199 
200  // Load data
201  BlockLoad(temp_storage.load).Load(d_in + block_offset, data, guarded_elements);
202 
203  __syncthreads();
204 
205  // Store data
206  BlockStore(temp_storage.store).Store(d_out_guarded + block_offset, data, guarded_elements);
207 }
208 
209 #endif // HIPCUB_TEST_HIPCUB_BLOCK_LOAD_STORE_KERNELS_HPP
Definition: test_hipcub_block_load_store.kernels.hpp:167
Definition: test_hipcub_block_load_store.kernels.hpp:37
Definition: block_load.hpp:79
Definition: block_store.hpp:79
Definition: benchmark_block_shuffle.cpp:41