21 #ifndef ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_    22 #define ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_    24 #include "detail/device_adjacent_difference.hpp"    26 #include "device_adjacent_difference_config.hpp"    28 #include "config_types.hpp"    29 #include "device_transform.hpp"    31 #include "../config.hpp"    32 #include "../functional.hpp"    34 #include "../detail/temp_storage.hpp"    35 #include "../detail/various.hpp"    36 #include "../iterator/counting_iterator.hpp"    37 #include "../iterator/transform_iterator.hpp"    39 #include <hip/hip_runtime.h>    51 BEGIN_ROCPRIM_NAMESPACE
    53 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document    55 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start)                           \    57         auto _error = hipGetLastError();                                                         \    58         if(_error != hipSuccess)                                                                 \    60         if(debug_synchronous)                                                                    \    62             std::cout << name << "(" << size << ")";                                             \    63             auto __error = hipStreamSynchronize(stream);                                         \    64             if(__error != hipSuccess)                                                            \    66             auto _end = std::chrono::high_resolution_clock::now();                               \    67             auto _d   = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \    68             std::cout << " " << _d.count() * 1000 << " ms" << '\n';                              \    74 template <
typename Config,
    79           typename BinaryFunction>
    80 void ROCPRIM_KERNEL __launch_bounds__(Config::block_size) adjacent_difference_kernel(
    82     const OutputIt                                            output,
    83     const std::size_t                                         size,
    84     const BinaryFunction                                      op,
    85     const typename std::iterator_traits<InputIt>::value_type* previous_values,
    86     const std::size_t                                         starting_block)
    88     adjacent_difference_kernel_impl<Config, InPlace, Right>(
    89         input, output, size, op, previous_values, starting_block);
    92 template <
typename Config,
    97           typename BinaryFunction>
    98 hipError_t adjacent_difference_impl(
void* 
const          temporary_storage,
    99                                     std::size_t&         storage_size,
   101                                     const OutputIt       output,
   102                                     const std::size_t    size,
   103                                     const BinaryFunction op,
   104                                     const hipStream_t    stream,
   105                                     const bool           debug_synchronous)
   107     using value_type = 
typename std::iterator_traits<InputIt>::value_type;
   109     using config = detail::default_or_custom_config<
   113     static constexpr 
unsigned int block_size       = config::block_size;
   114     static constexpr 
unsigned int items_per_thread = config::items_per_thread;
   115     static constexpr 
unsigned int items_per_block  = block_size * items_per_thread;
   117     const std::size_t num_blocks = ceiling_div(size, items_per_block);
   118     const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0;
   120     value_type* previous_values;
   125         detail::temp_storage::ptr_aligned_array(&previous_values, num_previous_values));
   126     if(partition_result != hipSuccess || temporary_storage == 
nullptr)
   128         return partition_result;
   138     if ROCPRIM_IF_CONSTEXPR(InPlace)
   142         static constexpr 
auto offset = items_per_block - (Right ? 0 : 1);
   146             [base = input + offset](std::size_t i) { 
return base[i * items_per_block]; });
   151                                                       rocprim::identity<> {},
   154         if(error != hipSuccess)
   160     static constexpr 
unsigned int size_limit     = config::size_limit;
   161     static constexpr 
auto number_of_blocks_limit = 
std::max(size_limit / items_per_block, 1u);
   162     static constexpr 
auto aligned_size_limit     = number_of_blocks_limit * items_per_block;
   166     const auto number_of_launch = ceiling_div(size, aligned_size_limit);
   168     if(debug_synchronous)
   170         std::cout << 
"----------------------------------\n";
   171         std::cout << 
"size:               " << size << 
'\n';
   172         std::cout << 
"aligned_size_limit: " << aligned_size_limit << 
'\n';
   173         std::cout << 
"number_of_launch:   " << number_of_launch << 
'\n';
   174         std::cout << 
"block_size:         " << block_size << 
'\n';
   175         std::cout << 
"items_per_block:    " << items_per_block << 
'\n';
   176         std::cout << 
"----------------------------------\n";
   179     for(std::size_t i = 0, offset = 0; i < number_of_launch; ++i, offset += aligned_size_limit)
   181         const auto current_size
   182             = 
static_cast<unsigned int>(std::min<std::size_t>(size - offset, aligned_size_limit));
   183         const auto current_blocks = ceiling_div(current_size, items_per_block);
   184         const auto starting_block = i * number_of_blocks_limit;
   186         std::chrono::time_point<std::chrono::high_resolution_clock> start;
   187         if(debug_synchronous)
   189             std::cout << 
"index:            " << i << 
'\n';
   190             std::cout << 
"current_size:     " << current_size << 
'\n';
   191             std::cout << 
"number of blocks: " << current_blocks << 
'\n';
   193             start = std::chrono::high_resolution_clock::now();
   195         hipLaunchKernelGGL(HIP_KERNEL_NAME(adjacent_difference_kernel<config, InPlace, Right>),
   196                            dim3(current_blocks),
   204                            previous_values + starting_block,
   206         ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
   207             "adjacent_difference_kernel", current_size, start);
   213 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR   215 #endif // DOXYGEN_SHOULD_SKIP_THIS   301           typename BinaryFunction = ::rocprim::minus<>>
   303                                std::size_t&         storage_size,
   305                                const OutputIt       output,
   306                                const std::size_t    size,
   307                                const BinaryFunction op                = BinaryFunction {},
   308                                const hipStream_t    stream            = 0,
   309                                const bool           debug_synchronous = 
false)
   311     static constexpr 
bool in_place = 
false;
   312     static constexpr 
bool right    = 
false;
   313     return detail::adjacent_difference_impl<Config, in_place, right>(
   314         temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
   354           typename BinaryFunction = ::rocprim::minus<>>
   356                                        std::size_t&         storage_size,
   357                                        const InputIt        values,
   358                                        const std::size_t    size,
   359                                        const BinaryFunction op                = BinaryFunction {},
   360                                        const hipStream_t    stream            = 0,
   361                                        const bool           debug_synchronous = 
false)
   363     static constexpr 
bool in_place = 
true;
   364     static constexpr 
bool right    = 
false;
   365     return detail::adjacent_difference_impl<Config, in_place, right>(
   366         temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
   450           typename BinaryFunction = ::rocprim::minus<>>
   452                                      std::size_t&         storage_size,
   454                                      const OutputIt       output,
   455                                      const std::size_t    size,
   456                                      const BinaryFunction op                = BinaryFunction {},
   457                                      const hipStream_t    stream            = 0,
   458                                      const bool           debug_synchronous = 
false)
   460     static constexpr 
bool in_place = 
false;
   461     static constexpr 
bool right    = 
true;
   462     return detail::adjacent_difference_impl<Config, in_place, right>(
   463         temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
   503           typename BinaryFunction = ::rocprim::minus<>>
   505                                              std::size_t&         storage_size,
   506                                              const InputIt        values,
   507                                              const std::size_t    size,
   508                                              const BinaryFunction op     = BinaryFunction {},
   509                                              const hipStream_t    stream = 0,
   510                                              const bool           debug_synchronous = 
false)
   512     static constexpr 
bool in_place = 
true;
   513     static constexpr 
bool right    = 
true;
   514     return detail::adjacent_difference_impl<Config, in_place, right>(
   515         temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
   521 END_ROCPRIM_NAMESPACE
   523 #endif // ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_ ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments. 
Definition: functional.hpp:55
ROCPRIM_HOST_DEVICE counting_iterator< Incrementable, Difference > make_counting_iterator(Incrementable value)
make_counting_iterator creates a counting_iterator with its initial value set to value. 
Definition: counting_iterator.hpp:261
hipError_t adjacent_difference(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:302
hipError_t adjacent_difference_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:355
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags. 
Definition: device_partition.hpp:721
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
hipError_t adjacent_difference_right(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:451
Deprecated: Configuration of device-level scan primitives. 
Definition: block_histogram.hpp:62
ROCPRIM_HOST_DEVICE transform_iterator< InputIterator, UnaryFunction > make_transform_iterator(InputIterator iterator, UnaryFunction transform)
make_transform_iterator creates a transform_iterator using iterator as the underlying iterator and tr...
Definition: transform_iterator.hpp:254
hipError_t transform(InputIterator input, OutputIterator output, const size_t size, UnaryFunction transform_op, const hipStream_t stream=0, bool debug_synchronous=false)
Parallel transform primitive for device level. 
Definition: device_transform.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension. 
Definition: thread.hpp:268
hipError_t adjacent_difference_right_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:504
Definition: device_adjacent_difference_config.hpp:72