21 #ifndef ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_ 22 #define ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_ 24 #include "../../config.hpp" 25 #include "../../detail/various.hpp" 26 #include "../../intrinsics/thread.hpp" 28 #include <type_traits> 32 BEGIN_ROCPRIM_NAMESPACE
43 template <
class T,
class BinaryFunction>
44 ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
49 bool_constant<false> ) -> decltype(op(b, a, index))
51 return op(a, b, index);
54 template <
class T,
class BinaryFunction>
55 ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
61 -> decltype(op(b, a, index))
63 return op(b, a, index);
66 template <
typename T,
typename BinaryFunction,
bool AsFlags>
67 ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
71 bool_constant<AsFlags> ,
72 bool_constant<false> ) -> decltype(op(b, a))
77 template <
typename T,
typename BinaryFunction,
bool AsFlags>
78 ROCPRIM_DEVICE ROCPRIM_INLINE
auto apply(BinaryFunction op,
82 bool_constant<AsFlags> ,
83 bool_constant<true> ) -> decltype(op(b, a))
89 unsigned int BlockSizeX,
90 unsigned int BlockSizeY = 1,
91 unsigned int BlockSizeZ = 1>
95 static constexpr
unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
101 template <
bool AsFlags,
103 bool WithTilePredecessor,
104 unsigned int ItemsPerThread,
106 typename BinaryFunction>
107 ROCPRIM_DEVICE
void apply_left(
const T (&input)[ItemsPerThread],
108 Output (&output)[ItemsPerThread],
110 const T tile_predecessor_item,
113 static constexpr
auto as_flags = bool_constant<AsFlags> {};
114 static constexpr
auto reversed = bool_constant<Reversed> {};
116 const unsigned int flat_id
117 = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
120 storage.items[flat_id] = input[ItemsPerThread - 1];
123 for(
unsigned int i = ItemsPerThread - 1; i > 0; --i)
125 output[i] = detail::apply(
126 op, input[i - 1], input[i], flat_id * ItemsPerThread + i, as_flags, reversed);
130 if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
132 T predecessor_item = tile_predecessor_item;
134 predecessor_item = storage.items[flat_id - 1];
137 output[0] = detail::apply(
138 op, predecessor_item, input[0], flat_id * ItemsPerThread, as_flags, reversed);
142 output[0] = get_default_item(input, 0, as_flags);
144 output[0] = detail::apply(op,
145 storage.items[flat_id - 1],
147 flat_id * ItemsPerThread,
154 template <
bool AsFlags,
156 bool WithTilePredecessor,
157 unsigned int ItemsPerThread,
159 typename BinaryFunction>
160 ROCPRIM_DEVICE
void apply_left_partial(
const T (&input)[ItemsPerThread],
161 Output (&output)[ItemsPerThread],
163 const T tile_predecessor_item,
164 const unsigned int valid_items,
167 static constexpr
auto as_flags = bool_constant<AsFlags> {};
168 static constexpr
auto reversed = bool_constant<Reversed> {};
170 const unsigned int flat_id
171 = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
174 storage.items[flat_id] = input[ItemsPerThread - 1];
177 for(
unsigned int i = ItemsPerThread - 1; i > 0; --i)
179 const unsigned int index = flat_id * ItemsPerThread + i;
180 output[i] = get_default_item(input, i, as_flags);
181 if(index < valid_items) {
182 output[i] = detail::apply(op, input[i - 1], input[i], index, as_flags, reversed);
187 const unsigned int index = flat_id * ItemsPerThread;
189 if ROCPRIM_IF_CONSTEXPR (WithTilePredecessor)
191 T predecessor_item = tile_predecessor_item;
193 predecessor_item = storage.items[flat_id - 1];
196 output[0] = get_default_item(input, 0, as_flags);
197 if(index < valid_items)
200 = detail::apply(op, predecessor_item, input[0], index, as_flags, reversed);
205 output[0] = get_default_item(input, 0, as_flags);
206 if(flat_id != 0 && index < valid_items)
208 output[0] = detail::apply(op,
209 storage.items[flat_id - 1],
211 flat_id * ItemsPerThread,
218 template <
bool AsFlags,
220 bool WithTileSuccessor,
221 unsigned int ItemsPerThread,
223 typename BinaryFunction>
224 ROCPRIM_DEVICE
void apply_right(
const T (&input)[ItemsPerThread],
225 Output (&output)[ItemsPerThread],
227 const T tile_successor_item,
230 static constexpr
auto as_flags = bool_constant<AsFlags> {};
231 static constexpr
auto reversed = bool_constant<Reversed> {};
233 const unsigned int flat_id
234 = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
237 storage.items[flat_id] = input[0];
240 for(
unsigned int i = 0; i < ItemsPerThread - 1; ++i)
242 output[i] = detail::apply(
243 op, input[i], input[i + 1], flat_id * ItemsPerThread + i + 1, as_flags, reversed);
247 if ROCPRIM_IF_CONSTEXPR (WithTileSuccessor)
249 T successor_item = tile_successor_item;
250 if(flat_id != BlockSize - 1) {
251 successor_item = storage.items[flat_id + 1];
254 output[ItemsPerThread - 1] = detail::apply(op,
255 input[ItemsPerThread - 1],
257 flat_id * ItemsPerThread + ItemsPerThread,
263 output[ItemsPerThread - 1] = get_default_item(input, ItemsPerThread - 1, as_flags);
264 if(flat_id != BlockSize - 1) {
265 output[ItemsPerThread - 1]
267 input[ItemsPerThread - 1],
268 storage.items[flat_id + 1],
269 flat_id * ItemsPerThread + ItemsPerThread,
275 template <
bool AsFlags,
277 unsigned int ItemsPerThread,
279 typename BinaryFunction>
280 ROCPRIM_DEVICE
void apply_right_partial(
const T (&input)[ItemsPerThread],
281 Output (&output)[ItemsPerThread],
283 const unsigned int valid_items,
286 static constexpr
auto as_flags = bool_constant<AsFlags> {};
287 static constexpr
auto reversed = bool_constant<Reversed> {};
289 const unsigned int flat_id
290 = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
293 storage.items[flat_id] = input[0];
296 for(
unsigned int i = 0; i < ItemsPerThread - 1; ++i)
298 const unsigned int index = flat_id * ItemsPerThread + i + 1;
299 output[i] = get_default_item(input, i, as_flags);
300 if(index < valid_items)
302 output[i] = detail::apply(op, input[i], input[i + 1], index, as_flags, reversed);
307 output[ItemsPerThread - 1] = get_default_item(input, ItemsPerThread - 1, as_flags);
309 const unsigned int next_thread_index = flat_id * ItemsPerThread + ItemsPerThread;
310 if(next_thread_index < valid_items)
312 output[ItemsPerThread - 1] = detail::apply(op,
313 input[ItemsPerThread - 1],
314 storage.items[flat_id + 1],
322 template <
unsigned int ItemsPerThread>
323 ROCPRIM_DEVICE
int get_default_item(
const T (&)[ItemsPerThread],
325 bool_constant<true> )
330 template <
unsigned int ItemsPerThread>
331 ROCPRIM_DEVICE T get_default_item(
const T (&input)[ItemsPerThread],
332 const unsigned int index,
333 bool_constant<false> )
341 END_ROCPRIM_NAMESPACE
343 #endif // ROCPRIM_BLOCK_DETAIL_BLOCK_ADJACENT_DIFFERENCE_IMPL_HPP_ Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
Definition: block_adjacent_difference_impl.hpp:96
Definition: block_adjacent_difference_impl.hpp:92