rocPRIM
device_adjacent_difference.hpp
1 // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_
22 #define ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_
23 
24 #include "../../block/block_adjacent_difference.hpp"
25 #include "../../block/block_load.hpp"
26 #include "../../block/block_store.hpp"
27 
28 #include "../../detail/various.hpp"
29 
30 #include "../../config.hpp"
31 
32 #include <hip/hip_runtime.h>
33 
34 #include <type_traits>
35 
36 #include <cstdint>
37 
38 BEGIN_ROCPRIM_NAMESPACE
39 
40 namespace detail
41 {
42 
43 template <typename T, unsigned int BlockSize>
45 {
46  using adjacent_diff_type = ::rocprim::block_adjacent_difference<T, BlockSize>;
47  using storage_type = typename adjacent_diff_type::storage_type;
48 
49  template <unsigned int ItemsPerThread,
50  typename Output,
51  typename BinaryFunction,
52  typename InputIt,
53  bool InPlace>
54  ROCPRIM_DEVICE void dispatch(const T (&input)[ItemsPerThread],
55  Output (&output)[ItemsPerThread],
56  const BinaryFunction op,
57  const InputIt previous_values,
58  const unsigned int block_id,
59  const std::size_t starting_block,
60  const std::size_t num_blocks,
61  const std::size_t size,
62  storage_type& storage,
63  bool_constant<InPlace> /*in_place*/,
64  std::false_type /*right*/)
65  {
66  static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
67 
68  // Not the first block, i.e. has a predecessor
69  if(starting_block + block_id != 0)
70  {
71  // `previous_values` needs to be accessed with a stride of `items_per_block` if the
72  // operation is out-of-place
73  const unsigned int block_offset = InPlace ? block_id : block_id * items_per_block;
74  const InputIt block_previous_values = previous_values + block_offset;
75 
76  const T tile_predecessor = block_previous_values[-1];
77  // Not the last (i.e. full block)
78  if(starting_block + block_id != num_blocks - 1)
79  {
80  adjacent_diff_type {}.subtract_left(input, output, op, tile_predecessor, storage);
81  }
82  else
83  {
84  const unsigned int valid_items
85  = static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
86  adjacent_diff_type {}.subtract_left_partial(
87  input, output, op, tile_predecessor, valid_items, storage);
88  }
89  }
90  else
91  {
92  // Not the last (i.e. full block)
93  if(starting_block + block_id != num_blocks - 1)
94  {
95  adjacent_diff_type {}.subtract_left(input, output, op, storage);
96  }
97  else
98  {
99  const unsigned int valid_items
100  = static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
101  adjacent_diff_type {}.subtract_left_partial(
102  input, output, op, valid_items, storage);
103  }
104  }
105  }
106 
107  template <unsigned int ItemsPerThread,
108  typename Output,
109  typename BinaryFunction,
110  typename InputIt,
111  bool InPlace>
112  ROCPRIM_DEVICE void dispatch(const T (&input)[ItemsPerThread],
113  Output (&output)[ItemsPerThread],
114  const BinaryFunction op,
115  const InputIt previous_values,
116  const unsigned int block_id,
117  const std::size_t starting_block,
118  const std::size_t num_blocks,
119  const std::size_t size,
120  storage_type& storage,
121  bool_constant<InPlace> /*in_place*/,
122  std::true_type /*right*/)
123  {
124  static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread;
125 
126  // Not the last (i.e. full) block and has a successor
127  if(starting_block + block_id != num_blocks - 1)
128  {
129  // `previous_values` needs to be accessed with a stride of `items_per_block` if the
130  // operation is out-of-place
131  // When in-place, the first block does not save its value (since it won't be used)
132  // so the block values are shifted right one. This means that next block's first value
133  // is in the position `block_id`
134  const unsigned int block_offset = InPlace ? block_id : (block_id + 1) * items_per_block;
135 
136  const InputIt next_block_values = previous_values + block_offset;
137  const T tile_successor = *next_block_values;
138 
139  adjacent_diff_type {}.subtract_right(input, output, op, tile_successor, storage);
140  }
141  else
142  {
143  const unsigned int valid_items
144  = static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
145  adjacent_diff_type {}.subtract_right_partial(input, output, op, valid_items, storage);
146  }
147  }
148 };
149 
150 template <typename T, typename InputIterator>
151 ROCPRIM_DEVICE ROCPRIM_INLINE auto select_previous_values_iterator(T* previous_values,
152  InputIterator /*input*/,
153  std::true_type /*in_place*/)
154 {
155  return previous_values;
156 }
157 
158 template <typename T, typename InputIterator>
159 ROCPRIM_DEVICE ROCPRIM_INLINE auto select_previous_values_iterator(T* /*previous_values*/,
160  InputIterator input,
161  std::false_type /*in_place*/)
162 {
163  return input;
164 }
165 
166 template <typename Config,
167  bool InPlace,
168  bool Right,
169  typename InputIt,
170  typename OutputIt,
171  typename BinaryFunction>
172 ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void adjacent_difference_kernel_impl(
173  const InputIt input,
174  const OutputIt output,
175  const std::size_t size,
176  const BinaryFunction op,
177  const typename std::iterator_traits<InputIt>::value_type* previous_values,
178  const std::size_t starting_block)
179 {
180  using input_type = typename std::iterator_traits<InputIt>::value_type;
181  using output_type = typename std::iterator_traits<OutputIt>::value_type;
182 
183  static constexpr unsigned int block_size = Config::block_size;
184  static constexpr unsigned int items_per_thread = Config::items_per_thread;
185  static constexpr unsigned int items_per_block = block_size * items_per_thread;
186 
187  using block_load_type
188  = ::rocprim::block_load<input_type, block_size, items_per_thread, Config::load_method>;
189  using block_store_type
190  = ::rocprim::block_store<output_type, block_size, items_per_thread, Config::store_method>;
191 
192  using adjacent_helper = adjacent_diff_helper<input_type, block_size>;
193 
194  ROCPRIM_SHARED_MEMORY struct
195  {
196  typename block_load_type::storage_type load;
197  typename adjacent_helper::storage_type adjacent_diff;
198  typename block_store_type::storage_type store;
199  } storage;
200 
201  const unsigned int block_id = blockIdx.x;
202  const unsigned int block_offset = block_id * items_per_block;
203 
204  const std::size_t num_blocks = ceiling_div(size, items_per_block);
205 
206  input_type thread_input[items_per_thread];
207  if(starting_block + block_id < num_blocks - 1)
208  {
209  block_load_type {}.load(input + block_offset, thread_input, storage.load);
210  }
211  else
212  {
213  const unsigned int valid_items
214  = static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
215  block_load_type {}.load(input + block_offset, thread_input, valid_items, storage.load);
216  }
218 
219  // Type tags for tag dispatch.
220  static constexpr auto in_place = bool_constant<InPlace> {};
221  static constexpr auto right = bool_constant<Right> {};
222 
223  // When doing the operation in-place the last/first items of each block have been copied out
224  // in advance and written to the contiguos locations, since accessing them would be a data race
225  // with the writing of their new values. In this case `select_previous_values_iterator` returns
226  // a pointer to the copied values, and it should be addressed by block_id.
227  // Otherwise (when the transform is out-of-place) it just returns the input iterator, and the
228  // first/last values of the blocks can be accessed with a stride of `items_per_block`
229  const auto previous_values_it
230  = select_previous_values_iterator(previous_values, input, in_place);
231 
232  output_type thread_output[items_per_thread];
233  // Do tag dispatch on `right` to select either `subtract_right` or `subtract_left`.
234  // Note that the function is overloaded on its last parameter.
235  adjacent_helper {}.dispatch(thread_input,
236  thread_output,
237  op,
238  previous_values_it,
239  block_id,
240  starting_block,
241  num_blocks,
242  size,
243  storage.adjacent_diff,
244  in_place,
245  right);
247 
248  if(starting_block + block_id < num_blocks - 1)
249  {
250  block_store_type {}.store(output + block_offset, thread_output, storage.store);
251  }
252  else
253  {
254  const unsigned int valid_items
255  = static_cast<unsigned int>(size - (num_blocks - 1) * items_per_block);
256  block_store_type {}.store(output + block_offset, thread_output, valid_items, storage.store);
257  }
258 }
259 
260 } // namespace detail
261 
262 END_ROCPRIM_NAMESPACE
263 
264 #endif // ROCPRIM_DEVICE_DETAIL_DEVICE_ADJACENT_DIFFERENCE_HPP_
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_id()
Returns block identifier in a multidimensional grid by dimension.
Definition: thread.hpp:258
Definition: device_adjacent_difference.hpp:44
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
ROCPRIM_DEVICE ROCPRIM_INLINE void syncthreads()
Synchronize all threads in a block (tile)
Definition: thread.hpp:216
ROCPRIM_DEVICE ROCPRIM_INLINE unsigned int block_size()
Returns block size in a multidimensional grid by dimension.
Definition: thread.hpp:268