21 #ifndef ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_ 22 #define ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_ 24 #include "detail/device_adjacent_difference.hpp" 26 #include "device_adjacent_difference_config.hpp" 28 #include "config_types.hpp" 29 #include "device_transform.hpp" 31 #include "../config.hpp" 32 #include "../functional.hpp" 34 #include "../detail/temp_storage.hpp" 35 #include "../detail/various.hpp" 36 #include "../iterator/counting_iterator.hpp" 37 #include "../iterator/transform_iterator.hpp" 39 #include <hip/hip_runtime.h> 51 BEGIN_ROCPRIM_NAMESPACE
53 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document 55 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \ 57 auto _error = hipGetLastError(); \ 58 if(_error != hipSuccess) \ 60 if(debug_synchronous) \ 62 std::cout << name << "(" << size << ")"; \ 63 auto __error = hipStreamSynchronize(stream); \ 64 if(__error != hipSuccess) \ 66 auto _end = std::chrono::high_resolution_clock::now(); \ 67 auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \ 68 std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \ 74 template <
typename Config,
79 typename BinaryFunction>
80 void ROCPRIM_KERNEL __launch_bounds__(Config::block_size) adjacent_difference_kernel(
82 const OutputIt output,
83 const std::size_t size,
84 const BinaryFunction op,
85 const typename std::iterator_traits<InputIt>::value_type* previous_values,
86 const std::size_t starting_block)
88 adjacent_difference_kernel_impl<Config, InPlace, Right>(
89 input, output, size, op, previous_values, starting_block);
92 template <
typename Config,
97 typename BinaryFunction>
98 hipError_t adjacent_difference_impl(
void*
const temporary_storage,
99 std::size_t& storage_size,
101 const OutputIt output,
102 const std::size_t size,
103 const BinaryFunction op,
104 const hipStream_t stream,
105 const bool debug_synchronous)
107 using value_type =
typename std::iterator_traits<InputIt>::value_type;
109 using config = detail::default_or_custom_config<
113 static constexpr
unsigned int block_size = config::block_size;
114 static constexpr
unsigned int items_per_thread = config::items_per_thread;
115 static constexpr
unsigned int items_per_block = block_size * items_per_thread;
117 const std::size_t num_blocks = ceiling_div(size, items_per_block);
118 const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0;
120 value_type* previous_values;
125 detail::temp_storage::ptr_aligned_array(&previous_values, num_previous_values));
126 if(partition_result != hipSuccess || temporary_storage ==
nullptr)
128 return partition_result;
138 if ROCPRIM_IF_CONSTEXPR(InPlace)
142 static constexpr
auto offset = items_per_block - (Right ? 0 : 1);
146 [base = input + offset](std::size_t i) {
return base[i * items_per_block]; });
151 rocprim::identity<> {},
154 if(error != hipSuccess)
160 static constexpr
unsigned int size_limit = config::size_limit;
161 static constexpr
auto number_of_blocks_limit =
std::max(size_limit / items_per_block, 1u);
162 static constexpr
auto aligned_size_limit = number_of_blocks_limit * items_per_block;
166 const auto number_of_launch = ceiling_div(size, aligned_size_limit);
168 if(debug_synchronous)
170 std::cout <<
"----------------------------------\n";
171 std::cout <<
"size: " << size <<
'\n';
172 std::cout <<
"aligned_size_limit: " << aligned_size_limit <<
'\n';
173 std::cout <<
"number_of_launch: " << number_of_launch <<
'\n';
174 std::cout <<
"block_size: " << block_size <<
'\n';
175 std::cout <<
"items_per_block: " << items_per_block <<
'\n';
176 std::cout <<
"----------------------------------\n";
179 for(std::size_t i = 0, offset = 0; i < number_of_launch; ++i, offset += aligned_size_limit)
181 const auto current_size
182 =
static_cast<unsigned int>(std::min<std::size_t>(size - offset, aligned_size_limit));
183 const auto current_blocks = ceiling_div(current_size, items_per_block);
184 const auto starting_block = i * number_of_blocks_limit;
186 std::chrono::time_point<std::chrono::high_resolution_clock> start;
187 if(debug_synchronous)
189 std::cout <<
"index: " << i <<
'\n';
190 std::cout <<
"current_size: " << current_size <<
'\n';
191 std::cout <<
"number of blocks: " << current_blocks <<
'\n';
193 start = std::chrono::high_resolution_clock::now();
195 hipLaunchKernelGGL(HIP_KERNEL_NAME(adjacent_difference_kernel<config, InPlace, Right>),
196 dim3(current_blocks),
204 previous_values + starting_block,
206 ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
207 "adjacent_difference_kernel", current_size, start);
213 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR 215 #endif // DOXYGEN_SHOULD_SKIP_THIS 301 typename BinaryFunction = ::rocprim::minus<>>
303 std::size_t& storage_size,
305 const OutputIt output,
306 const std::size_t size,
307 const BinaryFunction op = BinaryFunction {},
308 const hipStream_t stream = 0,
309 const bool debug_synchronous =
false)
311 static constexpr
bool in_place =
false;
312 static constexpr
bool right =
false;
313 return detail::adjacent_difference_impl<Config, in_place, right>(
314 temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
354 typename BinaryFunction = ::rocprim::minus<>>
356 std::size_t& storage_size,
357 const InputIt values,
358 const std::size_t size,
359 const BinaryFunction op = BinaryFunction {},
360 const hipStream_t stream = 0,
361 const bool debug_synchronous =
false)
363 static constexpr
bool in_place =
true;
364 static constexpr
bool right =
false;
365 return detail::adjacent_difference_impl<Config, in_place, right>(
366 temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
450 typename BinaryFunction = ::rocprim::minus<>>
452 std::size_t& storage_size,
454 const OutputIt output,
455 const std::size_t size,
456 const BinaryFunction op = BinaryFunction {},
457 const hipStream_t stream = 0,
458 const bool debug_synchronous =
false)
460 static constexpr
bool in_place =
false;
461 static constexpr
bool right =
true;
462 return detail::adjacent_difference_impl<Config, in_place, right>(
463 temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
503 typename BinaryFunction = ::rocprim::minus<>>
505 std::size_t& storage_size,
506 const InputIt values,
507 const std::size_t size,
508 const BinaryFunction op = BinaryFunction {},
509 const hipStream_t stream = 0,
510 const bool debug_synchronous =
false)
512 static constexpr
bool in_place =
true;
513 static constexpr
bool right =
true;
514 return detail::adjacent_difference_impl<Config, in_place, right>(
515 temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
521 END_ROCPRIM_NAMESPACE
523 #endif // ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_ ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
ROCPRIM_HOST_DEVICE counting_iterator< Incrementable, Difference > make_counting_iterator(Incrementable value)
make_counting_iterator creates a counting_iterator with its initial value set to value.
Definition: counting_iterator.hpp:261
hipError_t adjacent_difference(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:302
hipError_t adjacent_difference_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:355
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags.
Definition: device_partition.hpp:721
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
hipError_t adjacent_difference_right(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:451
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_HOST_DEVICE transform_iterator< InputIterator, UnaryFunction > make_transform_iterator(InputIterator iterator, UnaryFunction transform)
make_transform_iterator creates a transform_iterator using iterator as the underlying iterator and tr...
Definition: transform_iterator.hpp:254
hipError_t transform(InputIterator input, OutputIterator output, const size_t size, UnaryFunction transform_op, const hipStream_t stream=0, bool debug_synchronous=false)
Parallel transform primitive for device level.
Definition: device_transform.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268
hipError_t adjacent_difference_right_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:504
Definition: device_adjacent_difference_config.hpp:72