21 #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_ 22 #define ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_ 24 #include "../../block/block_adjacent_difference.hpp" 25 #include "../../block/block_load.hpp" 26 #include "../../block/block_store.hpp" 28 #include "../../detail/various.hpp" 30 #include "../../config.hpp" 32 #include <hip/hip_runtime.h> 34 #include <type_traits> 38 BEGIN_ROCPRIM_NAMESPACE
43 template <
typename T,
unsigned int BlockSize>
46 using adjacent_diff_type = ::rocprim::block_adjacent_difference<T, BlockSize>;
47 using storage_type =
typename adjacent_diff_type::storage_type;
49 template <
unsigned int ItemsPerThread,
51 typename BinaryFunction,
54 ROCPRIM_DEVICE
void dispatch(
const T (&input)[ItemsPerThread],
55 Output (&output)[ItemsPerThread],
56 const BinaryFunction op,
57 const InputIt previous_values,
59 const std::size_t starting_block,
60 const std::size_t num_blocks,
61 const std::size_t size,
62 storage_type& storage,
63 bool_constant<InPlace> ,
66 static constexpr
unsigned int items_per_block = BlockSize * ItemsPerThread;
69 if(starting_block + block_id != 0)
73 const unsigned int block_offset = InPlace ?
block_id : block_id * items_per_block;
74 const InputIt block_previous_values = previous_values + block_offset;
76 const T tile_predecessor = block_previous_values[-1];
78 if(starting_block + block_id != num_blocks - 1)
80 adjacent_diff_type {}.subtract_left(input, output, op, tile_predecessor, storage);
84 const unsigned int valid_items
85 =
static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
86 adjacent_diff_type {}.subtract_left_partial(
87 input, output, op, tile_predecessor, valid_items, storage);
93 if(starting_block + block_id != num_blocks - 1)
95 adjacent_diff_type {}.subtract_left(input, output, op, storage);
99 const unsigned int valid_items
100 =
static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
101 adjacent_diff_type {}.subtract_left_partial(
102 input, output, op, valid_items, storage);
107 template <
unsigned int ItemsPerThread,
109 typename BinaryFunction,
112 ROCPRIM_DEVICE
void dispatch(
const T (&input)[ItemsPerThread],
113 Output (&output)[ItemsPerThread],
114 const BinaryFunction op,
115 const InputIt previous_values,
116 const unsigned int block_id,
117 const std::size_t starting_block,
118 const std::size_t num_blocks,
119 const std::size_t size,
120 storage_type& storage,
121 bool_constant<InPlace> ,
124 static constexpr
unsigned int items_per_block = BlockSize * ItemsPerThread;
127 if(starting_block + block_id != num_blocks - 1)
134 const unsigned int block_offset = InPlace ?
block_id : (block_id + 1) * items_per_block;
136 const InputIt next_block_values = previous_values + block_offset;
137 const T tile_successor = *next_block_values;
139 adjacent_diff_type {}.subtract_right(input, output, op, tile_successor, storage);
143 const unsigned int valid_items
144 =
static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
145 adjacent_diff_type {}.subtract_right_partial(input, output, op, valid_items, storage);
150 template <
typename T,
typename InputIterator>
151 ROCPRIM_DEVICE ROCPRIM_INLINE
auto select_previous_values_iterator(T* previous_values,
155 return previous_values;
158 template <
typename T,
typename InputIterator>
159 ROCPRIM_DEVICE ROCPRIM_INLINE
auto select_previous_values_iterator(T* ,
166 template <
typename Config,
171 typename BinaryFunction>
172 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void adjacent_difference_kernel_impl(
174 const OutputIt output,
175 const std::size_t size,
176 const BinaryFunction op,
177 const typename std::iterator_traits<InputIt>::value_type* previous_values,
178 const std::size_t starting_block)
180 using input_type =
typename std::iterator_traits<InputIt>::value_type;
181 using output_type =
typename std::iterator_traits<OutputIt>::value_type;
183 static constexpr
unsigned int block_size = Config::block_size;
184 static constexpr
unsigned int items_per_thread = Config::items_per_thread;
185 static constexpr
unsigned int items_per_block = block_size * items_per_thread;
187 using block_load_type
188 = ::rocprim::block_load<input_type, block_size, items_per_thread, Config::load_method>;
189 using block_store_type
190 = ::rocprim::block_store<output_type, block_size, items_per_thread, Config::store_method>;
194 ROCPRIM_SHARED_MEMORY
struct 196 typename block_load_type::storage_type load;
197 typename adjacent_helper::storage_type adjacent_diff;
198 typename block_store_type::storage_type store;
201 const unsigned int block_id = blockIdx.x;
202 const unsigned int block_offset = block_id * items_per_block;
204 const std::size_t num_blocks = ceiling_div(size, items_per_block);
206 input_type thread_input[items_per_thread];
207 if(starting_block + block_id < num_blocks - 1)
209 block_load_type {}.load(input + block_offset, thread_input, storage.load);
213 const unsigned int valid_items
214 =
static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
215 block_load_type {}.load(input + block_offset, thread_input, valid_items, storage.load);
220 static constexpr
auto in_place = bool_constant<InPlace> {};
221 static constexpr
auto right = bool_constant<Right> {};
229 const auto previous_values_it
230 = select_previous_values_iterator(previous_values, input, in_place);
232 output_type thread_output[items_per_thread];
235 adjacent_helper {}.dispatch(thread_input,
243 storage.adjacent_diff,
248 if(starting_block + block_id < num_blocks - 1)
250 block_store_type {}.store(output + block_offset, thread_output, storage.store);
254 const unsigned int valid_items
255 =
static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
256 block_store_type {}.store(output + block_offset, thread_output, valid_items, storage.store);
262 END_ROCPRIM_NAMESPACE
264 #endif // ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
Definition: device_adjacent_difference.hpp:44
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268