rocPRIM
device_adjacent_difference.hpp
Go to the documentation of this file.
1 // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
22 #define ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
23 
24 #include "detail/device_adjacent_difference.hpp"
25 
26 #include "device_adjacent_difference_config.hpp"
27 
28 #include "config_types.hpp"
29 #include "device_transform.hpp"
30 
31 #include "../config.hpp"
32 #include "../functional.hpp"
33 
34 #include "../detail/temp_storage.hpp"
35 #include "../detail/various.hpp"
36 #include "../iterator/counting_iterator.hpp"
37 #include "../iterator/transform_iterator.hpp"
38 
39 #include <hip/hip_runtime.h>
40 
41 #include <chrono>
42 #include <iostream>
43 #include <iterator>
44 
45 #include <cstddef>
46 
50 
51 BEGIN_ROCPRIM_NAMESPACE
52 
53 #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document
54 
55 #define ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(name, size, start) \
56  { \
57  auto _error = hipGetLastError(); \
58  if(_error != hipSuccess) \
59  return _error; \
60  if(debug_synchronous) \
61  { \
62  std::cout << name << "(" << size << ")"; \
63  auto __error = hipStreamSynchronize(stream); \
64  if(__error != hipSuccess) \
65  return __error; \
66  auto _end = std::chrono::high_resolution_clock::now(); \
67  auto _d = std::chrono::duration_cast<std::chrono::duration<double>>(_end - start); \
68  std::cout << " " << _d.count() * 1000 << " ms" << '\n'; \
69  } \
70  }
71 
72 namespace detail
73 {
74 template <typename Config,
75  bool InPlace,
76  bool Right,
77  typename InputIt,
78  typename OutputIt,
79  typename BinaryFunction>
80 void ROCPRIM_KERNEL __launch_bounds__(Config::block_size) adjacent_difference_kernel(
81  const InputIt input,
82  const OutputIt output,
83  const std::size_t size,
84  const BinaryFunction op,
85  const typename std::iterator_traits<InputIt>::value_type* previous_values,
86  const std::size_t starting_block)
87 {
88  adjacent_difference_kernel_impl<Config, InPlace, Right>(
89  input, output, size, op, previous_values, starting_block);
90 }
91 
92 template <typename Config,
93  bool InPlace,
94  bool Right,
95  typename InputIt,
96  typename OutputIt,
97  typename BinaryFunction>
98 hipError_t adjacent_difference_impl(void* const temporary_storage,
99  std::size_t& storage_size,
100  const InputIt input,
101  const OutputIt output,
102  const std::size_t size,
103  const BinaryFunction op,
104  const hipStream_t stream,
105  const bool debug_synchronous)
106 {
107  using value_type = typename std::iterator_traits<InputIt>::value_type;
108 
109  using config = detail::default_or_custom_config<
110  Config,
112 
113  static constexpr unsigned int block_size = config::block_size;
114  static constexpr unsigned int items_per_thread = config::items_per_thread;
115  static constexpr unsigned int items_per_block = block_size * items_per_thread;
116 
117  const std::size_t num_blocks = ceiling_div(size, items_per_block);
118  const std::size_t num_previous_values = InPlace && num_blocks >= 2 ? num_blocks - 1 : 0;
119 
120  value_type* previous_values;
121 
122  const hipError_t partition_result = detail::temp_storage::partition(
123  temporary_storage,
124  storage_size,
125  detail::temp_storage::ptr_aligned_array(&previous_values, num_previous_values));
126  if(partition_result != hipSuccess || temporary_storage == nullptr)
127  {
128  return partition_result;
129  }
130 
131  if(num_blocks == 0)
132  {
133  return hipSuccess;
134  }
135 
136  // Copy values before they are overwritten to use as tile predecessors/successors
137  // previous_values is not dereferenced when the operation is not in place
138  if ROCPRIM_IF_CONSTEXPR(InPlace)
139  {
140  // If doing left adjacent diff then the last item of each block is needed for the
141  // next block, otherwise the first item is needed for the previous block
142  static constexpr auto offset = items_per_block - (Right ? 0 : 1);
143 
144  const auto block_starts_iter = make_transform_iterator(
145  rocprim::make_counting_iterator(std::size_t {0}),
146  [base = input + offset](std::size_t i) { return base[i * items_per_block]; });
147 
148  const hipError_t error = ::rocprim::transform(block_starts_iter,
149  previous_values,
150  num_blocks - 1,
151  rocprim::identity<> {},
152  stream,
153  debug_synchronous);
154  if(error != hipSuccess)
155  {
156  return error;
157  }
158  }
159 
160  static constexpr unsigned int size_limit = config::size_limit;
161  static constexpr auto number_of_blocks_limit = std::max(size_limit / items_per_block, 1u);
162  static constexpr auto aligned_size_limit = number_of_blocks_limit * items_per_block;
163 
164  // Launch number_of_blocks_limit blocks while there is still at least as many blocks
165  // left as the limit
166  const auto number_of_launch = ceiling_div(size, aligned_size_limit);
167 
168  if(debug_synchronous)
169  {
170  std::cout << "----------------------------------\n";
171  std::cout << "size: " << size << '\n';
172  std::cout << "aligned_size_limit: " << aligned_size_limit << '\n';
173  std::cout << "number_of_launch: " << number_of_launch << '\n';
174  std::cout << "block_size: " << block_size << '\n';
175  std::cout << "items_per_block: " << items_per_block << '\n';
176  std::cout << "----------------------------------\n";
177  }
178 
179  for(std::size_t i = 0, offset = 0; i < number_of_launch; ++i, offset += aligned_size_limit)
180  {
181  const auto current_size
182  = static_cast<unsigned int>(std::min<std::size_t>(size - offset, aligned_size_limit));
183  const auto current_blocks = ceiling_div(current_size, items_per_block);
184  const auto starting_block = i * number_of_blocks_limit;
185 
186  std::chrono::time_point<std::chrono::high_resolution_clock> start;
187  if(debug_synchronous)
188  {
189  std::cout << "index: " << i << '\n';
190  std::cout << "current_size: " << current_size << '\n';
191  std::cout << "number of blocks: " << current_blocks << '\n';
192 
193  start = std::chrono::high_resolution_clock::now();
194  }
195  hipLaunchKernelGGL(HIP_KERNEL_NAME(adjacent_difference_kernel<config, InPlace, Right>),
196  dim3(current_blocks),
197  dim3(block_size),
198  0,
199  stream,
200  input + offset,
201  output + offset,
202  size,
203  op,
204  previous_values + starting_block,
205  starting_block);
206  ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR(
207  "adjacent_difference_kernel", current_size, start);
208  }
209  return hipSuccess;
210 }
211 } // namespace detail
212 
213 #undef ROCPRIM_DETAIL_HIP_SYNC_AND_RETURN_ON_ERROR
214 
215 #endif // DOXYGEN_SHOULD_SKIP_THIS
216 
219 
298 template <typename Config = default_config,
299  typename InputIt,
300  typename OutputIt,
301  typename BinaryFunction = ::rocprim::minus<>>
302 hipError_t adjacent_difference(void* const temporary_storage,
303  std::size_t& storage_size,
304  const InputIt input,
305  const OutputIt output,
306  const std::size_t size,
307  const BinaryFunction op = BinaryFunction {},
308  const hipStream_t stream = 0,
309  const bool debug_synchronous = false)
310 {
311  static constexpr bool in_place = false;
312  static constexpr bool right = false;
313  return detail::adjacent_difference_impl<Config, in_place, right>(
314  temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
315 }
316 
352 template <typename Config = default_config,
353  typename InputIt,
354  typename BinaryFunction = ::rocprim::minus<>>
355 hipError_t adjacent_difference_inplace(void* const temporary_storage,
356  std::size_t& storage_size,
357  const InputIt values,
358  const std::size_t size,
359  const BinaryFunction op = BinaryFunction {},
360  const hipStream_t stream = 0,
361  const bool debug_synchronous = false)
362 {
363  static constexpr bool in_place = true;
364  static constexpr bool right = false;
365  return detail::adjacent_difference_impl<Config, in_place, right>(
366  temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
367 }
368 
447 template <typename Config = default_config,
448  typename InputIt,
449  typename OutputIt,
450  typename BinaryFunction = ::rocprim::minus<>>
451 hipError_t adjacent_difference_right(void* const temporary_storage,
452  std::size_t& storage_size,
453  const InputIt input,
454  const OutputIt output,
455  const std::size_t size,
456  const BinaryFunction op = BinaryFunction {},
457  const hipStream_t stream = 0,
458  const bool debug_synchronous = false)
459 {
460  static constexpr bool in_place = false;
461  static constexpr bool right = true;
462  return detail::adjacent_difference_impl<Config, in_place, right>(
463  temporary_storage, storage_size, input, output, size, op, stream, debug_synchronous);
464 }
465 
501 template <typename Config = default_config,
502  typename InputIt,
503  typename BinaryFunction = ::rocprim::minus<>>
504 hipError_t adjacent_difference_right_inplace(void* const temporary_storage,
505  std::size_t& storage_size,
506  const InputIt values,
507  const std::size_t size,
508  const BinaryFunction op = BinaryFunction {},
509  const hipStream_t stream = 0,
510  const bool debug_synchronous = false)
511 {
512  static constexpr bool in_place = true;
513  static constexpr bool right = true;
514  return detail::adjacent_difference_impl<Config, in_place, right>(
515  temporary_storage, storage_size, values, values, size, op, stream, debug_synchronous);
516 }
517 
519 // end of group devicemodule
520 
521 END_ROCPRIM_NAMESPACE
522 
523 #endif // ROCPRIM_DEVICE_DEVICE_ADJACENT_DIFFERENCE_HPP_
ROCPRIM_HOST_DEVICE constexpr T max(const T &a, const T &b)
Returns the maximum of its arguments.
Definition: functional.hpp:55
ROCPRIM_HOST_DEVICE counting_iterator< Incrementable, Difference > make_counting_iterator(Incrementable value)
make_counting_iterator creates a counting_iterator with its initial value set to value.
Definition: counting_iterator.hpp:261
hipError_t adjacent_difference(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:302
hipError_t adjacent_difference_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:355
hipError_t partition(void *temporary_storage, size_t &storage_size, InputIterator input, FlagIterator flags, OutputIterator output, SelectedCountOutputIterator selected_count_output, const size_t size, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel select primitive for device level using range of flags.
Definition: device_partition.hpp:721
Special type used to show that the given device-level operation will be executed with optimal configu...
Definition: config_types.hpp:45
hipError_t adjacent_difference_right(void *const temporary_storage, std::size_t &storage_size, const InputIt input, const OutputIt output, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:451
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_HOST_DEVICE transform_iterator< InputIterator, UnaryFunction > make_transform_iterator(InputIterator iterator, UnaryFunction transform)
make_transform_iterator creates a transform_iterator using iterator as the underlying iterator and tr...
Definition: transform_iterator.hpp:254
hipError_t transform(InputIterator input, OutputIterator output, const size_t size, UnaryFunction transform_op, const hipStream_t stream=0, bool debug_synchronous=false)
Parallel transform primitive for device level.
Definition: device_transform.hpp:135
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268
hipError_t adjacent_difference_right_inplace(void *const temporary_storage, std::size_t &storage_size, const InputIt values, const std::size_t size, const BinaryFunction op=BinaryFunction {}, const hipStream_t stream=0, const bool debug_synchronous=false)
Parallel primitive for applying a binary operation across pairs of consecutive elements in device acc...
Definition: device_adjacent_difference.hpp:504
Definition: device_adjacent_difference_config.hpp:72