ROCmSoftwarePlatform/rocPRIM/warp__reduce_8hpp_source.html

 // Copyright (c) 2017-2021 Advanced Micro Devices, Inc. All rights reserved.
 //
 // Permission is hereby granted, free of charge, to any person obtaining a copy
 // of this software and associated documentation files (the "Software"), to deal
 // in the Software without restriction, including without limitation the rights
 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 // copies of the Software, and to permit persons to whom the Software is
 // furnished to do so, subject to the following conditions:
 //
 // The above copyright notice and this permission notice shall be included in
 // all copies or substantial portions of the Software.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 // THE SOFTWARE.

 #ifndef ROCPRIM_WARP_WARP_REDUCE_HPP_
 #define ROCPRIM_WARP_WARP_REDUCE_HPP_

 #include <type_traits>

 #include "../config.hpp"
 #include "../detail/various.hpp"

 #include "../intrinsics.hpp"
 #include "../functional.hpp"
 #include "../types.hpp"

 #include "detail/warp_reduce_crosslane.hpp"
 #include "detail/warp_reduce_shared_mem.hpp"


 BEGIN_ROCPRIM_NAMESPACE

 namespace detail
 {

 // Selects the warp_reduce implementation based on WarpSize.
 //
 // The nested `type` names the cross-lane implementation when
 // is_warpsize_shuffleable<WarpSize>::value is true, and the
 // shared-memory implementation otherwise. T, WarpSize and UseAllReduce
 // are forwarded unchanged to whichever implementation is chosen.
 template<class T, unsigned int WarpSize, bool UseAllReduce>
 struct select_warp_reduce_impl
 {
     using type = typename std::conditional<
         detail::is_warpsize_shuffleable<WarpSize>::value,          // crosslane (DPP or shuffle-based) usable?
         detail::warp_reduce_crosslane<T, WarpSize, UseAllReduce>,  // yes: cross-lane implementation
         detail::warp_reduce_shared_mem<T, WarpSize, UseAllReduce>  // no: shared-memory implementation
     >::type;
 };

 } // end namespace detail

 /// \brief Warp-level parallel primitive providing reduction operations
 /// across the threads of a logical warp.
 ///
 /// The actual work is delegated to the implementation chosen by
 /// detail::select_warp_reduce_impl (cross-lane or shared-memory based).
 ///
 /// Every public method comes as a pair of SFINAE-selected overloads:
 /// * when the logical warp size does not exceed \p __AMDGCN_WAVEFRONT_SIZE
 ///   the call is forwarded to the selected implementation;
 /// * otherwise the call prints an error message once and returns without
 ///   touching \p output.
 ///
 /// \tparam T - type of the values being reduced.
 /// \tparam WarpSize - number of threads in the logical warp; must not exceed
 /// \p ROCPRIM_MAX_WARP_SIZE. Defaults to device_warp_size().
 /// \tparam UseAllReduce - forwarded to the implementation. NOTE(review):
 /// presumably controls whether every thread receives the reduced value -
 /// confirm against the implementation headers.
 template<
     class T,
     unsigned int WarpSize = device_warp_size(),
     bool UseAllReduce = false
 >
 class warp_reduce
 #ifndef DOXYGEN_SHOULD_SKIP_THIS
     : private detail::select_warp_reduce_impl<T, WarpSize, UseAllReduce>::type
 #endif
 {
     using base_type = typename detail::select_warp_reduce_impl<T, WarpSize, UseAllReduce>::type;

     // Check if WarpSize is valid for the targets
     static_assert(WarpSize <= ROCPRIM_MAX_WARP_SIZE, "WarpSize can't be greater than hardware warp size.");

 public:
     /// \brief Struct used to allocate the temporary memory required for
     /// thread communication during the operations of this class. It is the
     /// storage type of the selected implementation.
     using storage_type = typename base_type::storage_type;

     /// \brief Performs reduction across threads in a logical warp.
     ///
     /// \tparam BinaryFunction - type of the reduction functor; defaults to
     /// ::rocprim::plus<T>.
     /// \tparam FunctionWarpSize - SFINAE helper, do not specify explicitly.
     ///
     /// \param [in] input - calling thread's input value.
     /// \param [out] output - reference that receives the reduction result.
     /// \param [in] storage - temporary storage handed to the implementation.
     /// \param [in] reduce_op - binary functor used to combine values.
     template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto reduce(T input,
                 T& output,
                 storage_type& storage,
                 BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         base_type::reduce(input, output, storage, reduce_op);
     }

     /// \brief Fallback overload of reduce(), selected when the logical warp
     /// size exceeds \p __AMDGCN_WAVEFRONT_SIZE. Prints an error once and
     /// performs no reduction; \p output is left unmodified.
     template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto reduce(T ,
                 T& ,
                 storage_type& ,
                 BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         (void) reduce_op; // unused: nothing is reduced on this path
         ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp reduce.");
     }

     /// \brief Performs reduction across threads in a logical warp, taking an
     /// additional item count.
     ///
     /// \tparam BinaryFunction - type of the reduction functor; defaults to
     /// ::rocprim::plus<T>.
     /// \tparam FunctionWarpSize - SFINAE helper, do not specify explicitly.
     ///
     /// \param [in] input - calling thread's input value.
     /// \param [out] output - reference that receives the reduction result.
     /// \param [in] valid_items - forwarded to the implementation.
     /// NOTE(review): presumably the number of threads holding valid input -
     /// confirm against the implementation headers.
     /// \param [in] storage - temporary storage handed to the implementation.
     /// \param [in] reduce_op - binary functor used to combine values.
     template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto reduce(T input,
                 T& output,
                 int valid_items,
                 storage_type& storage,
                 BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         base_type::reduce(input, output, valid_items, storage, reduce_op);
     }

     /// \brief Fallback overload of reduce() with an item count, selected
     /// when the logical warp size exceeds \p __AMDGCN_WAVEFRONT_SIZE. Prints
     /// an error once and performs no reduction; \p output is left unmodified.
     template<class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto reduce(T ,
                 T& ,
                 int ,
                 storage_type& ,
                 BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         (void) reduce_op; // unused: nothing is reduced on this path
         ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp reduce.");
     }

     /// \brief Performs head-segmented reduction across threads in a logical warp.
     ///
     /// \tparam Flag - type of the segment flag.
     /// \tparam BinaryFunction - type of the reduction functor; defaults to
     /// ::rocprim::plus<T>.
     /// \tparam FunctionWarpSize - SFINAE helper, do not specify explicitly.
     ///
     /// \param [in] input - calling thread's input value.
     /// \param [out] output - reference that receives the reduction result.
     /// \param [in] flag - calling thread's segment flag, forwarded to the
     /// implementation. NOTE(review): presumably marks the first item of a
     /// segment - confirm against the implementation headers.
     /// \param [in] storage - temporary storage handed to the implementation.
     /// \param [in] reduce_op - binary functor used to combine values.
     template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto head_segmented_reduce(T input,
                                T& output,
                                Flag flag,
                                storage_type& storage,
                                BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         base_type::head_segmented_reduce(input, output, flag, storage, reduce_op);
     }

     /// \brief Fallback overload of head_segmented_reduce(), selected when the
     /// logical warp size exceeds \p __AMDGCN_WAVEFRONT_SIZE. Prints an error
     /// once and performs no reduction; \p output is left unmodified.
     template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto head_segmented_reduce(T ,
                                T& ,
                                Flag ,
                                storage_type& ,
                                BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         (void) reduce_op; // unused: nothing is reduced on this path
         ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp reduce.");
     }

     /// \brief Performs tail-segmented reduction across threads in a logical warp.
     ///
     /// \tparam Flag - type of the segment flag.
     /// \tparam BinaryFunction - type of the reduction functor; defaults to
     /// ::rocprim::plus<T>.
     /// \tparam FunctionWarpSize - SFINAE helper, do not specify explicitly.
     ///
     /// \param [in] input - calling thread's input value.
     /// \param [out] output - reference that receives the reduction result.
     /// \param [in] flag - calling thread's segment flag, forwarded to the
     /// implementation. NOTE(review): presumably marks the last item of a
     /// segment - confirm against the implementation headers.
     /// \param [in] storage - temporary storage handed to the implementation.
     /// \param [in] reduce_op - binary functor used to combine values.
     template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto tail_segmented_reduce(T input,
                                T& output,
                                Flag flag,
                                storage_type& storage,
                                BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize <= __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         base_type::tail_segmented_reduce(input, output, flag, storage, reduce_op);
     }

     /// \brief Fallback overload of tail_segmented_reduce(), selected when the
     /// logical warp size exceeds \p __AMDGCN_WAVEFRONT_SIZE. Prints an error
     /// once and performs no reduction; \p output is left unmodified.
     template<class Flag, class BinaryFunction = ::rocprim::plus<T>, unsigned int FunctionWarpSize = WarpSize>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     auto tail_segmented_reduce(T ,
                                T& ,
                                Flag ,
                                storage_type& ,
                                BinaryFunction reduce_op = BinaryFunction())
         -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void>::type
     {
         (void) reduce_op; // unused: nothing is reduced on this path
         ROCPRIM_PRINT_ERROR_ONCE("Specified warp size exceeds current hardware supported warp size. Aborting warp reduce.");
     }
 };

 END_ROCPRIM_NAMESPACE

 // end of group warpmodule

 #endif // ROCPRIM_WARP_WARP_REDUCE_HPP_
warp_reduce::head_segmented_reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs head-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:313

warp_reduce::reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:194

device_warp_size
ROCPRIM_DEVICE ROCPRIM_INLINE constexpr unsigned int device_warp_size()
Returns a number of threads in a hardware warp for the actual target.
Definition: thread.hpp:70

warp_reduce::storage_type
typename base_type::storage_type storage_type
Struct used to allocate a temporary memory that is required for thread communication during operation...
Definition: warp_reduce.hpp:133

reduce
hipError_t reduce(void *temporary_storage, size_t &storage_size, InputIterator input, OutputIterator output, const InitValueType initial_value, const size_t size, BinaryFunction reduce_op=BinaryFunction(), const hipStream_t stream=0, bool debug_synchronous=false)
Parallel reduction primitive for device level.
Definition: device_reduce.hpp:374

detail
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62

detail::select_warp_reduce_impl
Definition: warp_reduce.hpp:46

warp_reduce::reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, int valid_items, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:253

warp_reduce::tail_segmented_reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T, T &, Flag, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs tail-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:359

ROCPRIM_PRINT_ERROR_ONCE
#define ROCPRIM_PRINT_ERROR_ONCE(message)
Prints the supplied error message only once (using only one of the active threads).
Definition: functional.hpp:42

warp_reduce
The warp_reduce class is a warp level parallel primitive which provides methods for performing reduct...
Definition: warp_reduce.hpp:114

warp_reduce::reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T, T &, int, storage_type &, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize > __AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:267

detail::warp_reduce_shared_mem
Definition: warp_reduce_shared_mem.hpp:43

detail::is_warpsize_shuffleable
Definition: various.hpp:108

warp_reduce::tail_segmented_reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto tail_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs tail-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:345

warp_reduce::head_segmented_reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto head_segmented_reduce(T input, T &output, Flag flag, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs head-segmented reduction across threads in a logical warp.
Definition: warp_reduce.hpp:299

warp_reduce::reduce
ROCPRIM_DEVICE ROCPRIM_INLINE auto reduce(T input, T &output, storage_type &storage, BinaryFunction reduce_op=BinaryFunction()) -> typename std::enable_if<(FunctionWarpSize<=__AMDGCN_WAVEFRONT_SIZE), void >::type
Performs reduction across threads in a logical warp.
Definition: warp_reduce.hpp:181