rocPRIM
block_reduce.hpp
1 // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
2 //
3 // Permission is hereby granted, free of charge, to any person obtaining a copy
4 // of this software and associated documentation files (the "Software"), to deal
5 // in the Software without restriction, including without limitation the rights
6 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 // copies of the Software, and to permit persons to whom the Software is
8 // furnished to do so, subject to the following conditions:
9 //
10 // The above copyright notice and this permission notice shall be included in
11 // all copies or substantial portions of the Software.
12 //
13 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 // THE SOFTWARE.
20 
21 #ifndef ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
22 #define ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
23 
24 #include <type_traits>
25 
26 #include "../config.hpp"
27 #include "../detail/various.hpp"
28 
29 #include "../intrinsics.hpp"
30 #include "../functional.hpp"
31 
32 #include "detail/block_reduce_warp_reduce.hpp"
33 #include "detail/block_reduce_raking_reduce.hpp"
34 
35 
38 
39 BEGIN_ROCPRIM_NAMESPACE
40 
43 {
52 };
53 
54 namespace detail
55 {
56 
57 // Selector for block_reduce algorithm which gives block reduce implementation
58 // type based on passed block_reduce_algorithm enum
59 template<block_reduce_algorithm Algorithm>
61 
62 template<>
64 {
65  template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
67 };
68 
69 template<>
71 {
72  template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
74 };
75 
76 template<>
78 {
79  template<class T, unsigned int BlockSizeX, unsigned int BlockSizeY, unsigned int BlockSizeZ>
81 };
82 
83 
84 } // end namespace detail
85 
132 template<
133  class T,
134  unsigned int BlockSizeX,
136  unsigned int BlockSizeY = 1,
137  unsigned int BlockSizeZ = 1
138 >
140 #ifndef DOXYGEN_SHOULD_SKIP_THIS
141  : private detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>
142 #endif
143 {
// Implementation type chosen by detail::select_block_reduce_impl for the
// given Algorithm, instantiated with T and the three block-size parameters.
// It is the same type that appears in the private base-class list, and every
// reduce() overload below forwards to it.
144  using base_type = typename detail::select_block_reduce_impl<Algorithm>::template type<T, BlockSizeX, BlockSizeY, BlockSizeZ>;
145 public:
/// \brief Temporary-storage type required for thread communication during
/// the reduction.
///
/// This is an alias of the storage_type exposed by the selected
/// implementation (base_type), so its layout depends on the chosen algorithm.
/// NOTE(review): presumably intended to be placed in shared memory by the
/// caller - confirm against the implementation headers.
154  using storage_type = typename base_type::storage_type;
155 
/// \brief Performs reduction across threads in a block, using storage
/// supplied by the caller.
///
/// Thin wrapper: all arguments are forwarded unchanged to base_type::reduce.
///
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input value contributed to the reduction (presumably one value per
/// calling thread - TODO confirm against base_type).
/// \param output reference passed to the base implementation to receive the
/// result. NOTE(review): which threads observe a valid result is decided by
/// base_type and is not visible here.
/// \param storage reference to a storage_type object, passed through to the
/// base implementation.
/// \param reduce_op reduction functor; default-constructed when omitted.
202  template<class BinaryFunction = ::rocprim::plus<T>>
203  ROCPRIM_DEVICE ROCPRIM_INLINE
204  void reduce(T input,
205  T& output,
206  storage_type& storage,
207  BinaryFunction reduce_op = BinaryFunction())
208  {
209  base_type::reduce(input, output, storage, reduce_op);
210  }
211 
/// \brief Convenience overload of the single-value reduction that takes no
/// storage argument.
///
/// Forwards to the three-argument base_type::reduce(input, output, reduce_op).
/// How temporary storage is obtained is left to that base overload and is
/// not visible here.
///
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input value contributed to the reduction (presumably one value per
/// calling thread - TODO confirm against base_type).
/// \param output reference passed to the base implementation to receive the
/// result.
/// \param reduce_op reduction functor; default-constructed when omitted.
///
/// NOTE(review): this overload is marked ROCPRIM_FORCE_INLINE while the
/// storage-taking overload uses ROCPRIM_INLINE - confirm this is intentional.
227  template<class BinaryFunction = ::rocprim::plus<T>>
228  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
229  void reduce(T input,
230  T& output,
231  BinaryFunction reduce_op = BinaryFunction())
232  {
233  base_type::reduce(input, output, reduce_op);
234  }
235 
/// \brief Performs reduction across threads in a block where the input is an
/// array of ItemsPerThread values, using storage supplied by the caller.
///
/// Thin wrapper: all arguments are forwarded unchanged to base_type::reduce.
///
/// \tparam ItemsPerThread number of elements in the \p input array; deduced
/// from the array reference.
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input reference to an array of ItemsPerThread values of type T.
/// \param output reference passed to the base implementation to receive the
/// result. NOTE(review): which threads observe a valid result is decided by
/// base_type and is not visible here.
/// \param storage reference to a storage_type object, passed through to the
/// base implementation.
/// \param reduce_op reduction functor; default-constructed when omitted.
283  template<
284  unsigned int ItemsPerThread,
285  class BinaryFunction = ::rocprim::plus<T>
286  >
287  ROCPRIM_DEVICE ROCPRIM_INLINE
288  void reduce(T (&input)[ItemsPerThread],
289  T& output,
290  storage_type& storage,
291  BinaryFunction reduce_op = BinaryFunction())
292  {
293  base_type::reduce(input, output, storage, reduce_op);
294  }
295 
/// \brief Convenience overload of the array-input reduction that takes no
/// storage argument.
///
/// Forwards to the three-argument base_type::reduce(input, output, reduce_op).
/// How temporary storage is obtained is left to that base overload and is
/// not visible here.
///
/// \tparam ItemsPerThread number of elements in the \p input array; deduced
/// from the array reference.
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input reference to an array of ItemsPerThread values of type T.
/// \param output reference passed to the base implementation to receive the
/// result.
/// \param reduce_op reduction functor; default-constructed when omitted.
312  template<
313  unsigned int ItemsPerThread,
314  class BinaryFunction = ::rocprim::plus<T>
315  >
316  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
317  void reduce(T (&input)[ItemsPerThread],
318  T& output,
319  BinaryFunction reduce_op = BinaryFunction())
320  {
321  base_type::reduce(input, output, reduce_op);
322  }
323 
/// \brief Performs reduction across threads in a block with an explicit
/// count of valid items, using storage supplied by the caller.
///
/// Thin wrapper: all arguments are forwarded unchanged to base_type::reduce.
///
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input value contributed to the reduction (presumably one value per
/// calling thread - TODO confirm against base_type).
/// \param output reference passed to the base implementation to receive the
/// result.
/// \param valid_items count forwarded to the base implementation
/// (presumably the number of leading threads whose \p input participates -
/// TODO confirm against base_type).
/// \param storage reference to a storage_type object, passed through to the
/// base implementation.
/// \param reduce_op reduction functor; default-constructed when omitted.
370  template<class BinaryFunction = ::rocprim::plus<T>>
371  ROCPRIM_DEVICE ROCPRIM_INLINE
372  void reduce(T input,
373  T& output,
374  unsigned int valid_items,
375  storage_type& storage,
376  BinaryFunction reduce_op = BinaryFunction())
377  {
378  base_type::reduce(input, output, valid_items, storage, reduce_op);
379  }
380 
/// \brief Convenience overload of the valid_items reduction that takes no
/// storage argument.
///
/// Forwards to base_type::reduce(input, output, valid_items, reduce_op).
/// How temporary storage is obtained is left to that base overload and is
/// not visible here.
///
/// \tparam BinaryFunction type of the reduction functor; defaults to
/// ::rocprim::plus<T>.
///
/// \param input value contributed to the reduction (presumably one value per
/// calling thread - TODO confirm against base_type).
/// \param output reference passed to the base implementation to receive the
/// result.
/// \param valid_items count forwarded to the base implementation
/// (presumably the number of leading threads whose \p input participates -
/// TODO confirm against base_type).
/// \param reduce_op reduction functor; default-constructed when omitted.
398  template<class BinaryFunction = ::rocprim::plus<T>>
399  ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
400  void reduce(T input,
401  T& output,
402  unsigned int valid_items,
403  BinaryFunction reduce_op = BinaryFunction())
404  {
405  base_type::reduce(input, output, valid_items, reduce_op);
406  }
407 };
408 
409 END_ROCPRIM_NAMESPACE
410 
412 // end of group blockmodule
413 
414 #endif // ROCPRIM_BLOCK_BLOCK_REDUCE_HPP_
Definition: block_reduce_raking_reduce.hpp:118
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction())
Performs reduction across threads in a block.
Definition: block_reduce.hpp:204
Default block_reduce algorithm.
A warp_reduce based algorithm.
Definition: block_reduce_warp_reduce.hpp:45
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, BinaryFunction reduce_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_reduce.hpp:229
block_reduce_algorithm
Available algorithms for block_reduce primitive.
Definition: block_reduce.hpp:42
hipError_t reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduction primitive for device level.
Definition: device_reduce.hpp:374
An algorithm which limits calculations to a single hardware warp.
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T(&input)[ItemsPerThread], T &output, BinaryFunction reduce_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_reduce.hpp:317
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T(&input)[ItemsPerThread], T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction())
Performs reduction across threads in a block.
Definition: block_reduce.hpp:288
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
The block_reduce class is a block level parallel primitive which provides methods for performing reduction operations on items partitioned across threads in a block.
Definition: block_reduce.hpp:139
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void reduce(T input, T &output, unsigned int valid_items, BinaryFunction reduce_op=BinaryFunction())
This is an overloaded member function, provided for convenience. It differs from the above function only in what argument(s) it accepts.
Definition: block_reduce.hpp:400
raking reduce that supports only commutative operators
Definition: block_reduce.hpp:60
Default block_histogram algorithm.
typename base_type::storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operations provided by the related parallel primitive.
Definition: block_reduce.hpp:154
ROCPRIM_DEVICE ROCPRIM_INLINE void reduce(T input, T &output, unsigned int valid_items, storage_type &storage, BinaryFunction reduce_op=BinaryFunction())
Performs reduction across threads in a block.
Definition: block_reduce.hpp:372