21 #ifndef ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHUFFLE_HPP_ 22 #define ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHUFFLE_HPP_ 24 #include <type_traits> 26 #include "../../config.hpp" 27 #include "../../intrinsics.hpp" 28 #include "../../types.hpp" 29 #include "../../detail/various.hpp" 31 #include "warp_segment_bounds.hpp" 33 BEGIN_ROCPRIM_NAMESPACE
40 unsigned int WarpSize,
46 static_assert(detail::is_power_of_two(WarpSize),
"WarpSize must be power of 2");
// Warp-level reduction: repeatedly folds a partner value into `output`
// using `reduce_op`.
// NOTE(review): this excerpt omits several original lines (braces, the
// initialisation of `output`, and the declaration/assignment of `value`).
// Presumably `output` starts as `input` and `value` is fetched from the lane
// `offset` positions ahead via warp_shuffle_down -- TODO confirm against the
// full file.
50 template<
class BinaryFunction>
51 ROCPRIM_DEVICE ROCPRIM_INLINE
52 void reduce(T input, T& output, BinaryFunction reduce_op)
// `offset` takes the values 1, 2, 4, ... for as long as it is below WarpSize.
58 for(
unsigned int offset = 1; offset < WarpSize; offset *= 2)
// Combine the running result with `value` (assigned on a line not shown here).
61 output = reduce_op(output, value);
// Post-processing step chosen at compile time by the class-level
// UseAllReduce flag (one of the two set_output overloads is enabled).
63 set_output<UseAllReduce>(output);
// Forwarding overload of reduce: it simply calls
// reduce(input, output, reduce_op) on this object.
// NOTE(review): the signature line(s) are missing from this excerpt;
// presumably this overload accepts an extra, unused storage argument for
// interface parity with other implementations -- TODO confirm.
66 template<
class BinaryFunction>
67 ROCPRIM_DEVICE ROCPRIM_INLINE
71 this->
reduce(input, output, reduce_op);
// Reduction variant that takes a `valid_items` bound: a partner value is
// folded into `output` only when the partner's logical index
// (id + offset) is below `valid_items`.
// UseAllReduceDummy defaults to the class-level UseAllReduce flag but can be
// overridden by the caller (segmented_reduce passes `false`).
// NOTE(review): braces and the declaration/assignment of `value` are not
// visible in this excerpt; presumably `value` comes from warp_shuffle_down
// -- TODO confirm.
74 template<
bool UseAllReduceDummy = UseAllReduce,
class BinaryFunction>
75 ROCPRIM_DEVICE ROCPRIM_INLINE
76 void reduce(T input, T& output,
unsigned int valid_items, BinaryFunction reduce_op)
// `offset` takes the values 1, 2, 4, ... for as long as it is below WarpSize.
82 for(
unsigned int offset = 1; offset < WarpSize; offset *= 2)
// Logical lane index within a WarpSize-wide group.
// NOTE(review): this expression does not depend on `offset`, so it looks
// loop-invariant and could be computed once before the loop.
85 unsigned int id = detail::logical_lane_id<WarpSize>();
// Skip the combine step when the partner lies at or beyond valid_items.
86 if (
id + offset < valid_items) output = reduce_op(output, value);
// Post-processing step selected by UseAllReduceDummy.
88 set_output<UseAllReduceDummy>(output);
// Forwarding overload of the valid_items reduction: it calls
// reduce(input, output, valid_items, reduce_op) on this object.
// NOTE(review): the tail of the parameter list is cut off in this excerpt;
// presumably it continues with an unused storage argument followed by
// `reduce_op` -- TODO confirm.
91 template<
class BinaryFunction>
92 ROCPRIM_DEVICE ROCPRIM_INLINE
93 void reduce(T input, T& output,
unsigned int valid_items,
97 this->
reduce(input, output, valid_items, reduce_op);
// Segmented reduction entry point for head flags.
// Delegates to segmented_reduce with HeadSegmented = true, passing
// `input`, `output`, `flag` and `reduce_op` through unchanged.
100 template<
class Flag,
class BinaryFunction>
101 ROCPRIM_DEVICE ROCPRIM_INLINE
102 void head_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op)
104 this->segmented_reduce<true>(input, output, flag, reduce_op);
// Segmented reduction entry point for tail flags.
// Delegates to segmented_reduce with HeadSegmented = false, passing
// `input`, `output`, `flag` and `reduce_op` through unchanged.
107 template<
class Flag,
class BinaryFunction>
108 ROCPRIM_DEVICE ROCPRIM_INLINE
109 void tail_segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op)
111 this->segmented_reduce<false>(input, output, flag, reduce_op);
// Forwarding overload of head_segmented_reduce: it calls
// segmented_reduce<true>(input, output, flag, reduce_op).
// NOTE(review): the tail of the parameter list is cut off in this excerpt;
// presumably it continues with an unused storage argument followed by
// `reduce_op` -- TODO confirm.
114 template<
class Flag,
class BinaryFunction>
115 ROCPRIM_DEVICE ROCPRIM_INLINE
116 void head_segmented_reduce(T input, T& output, Flag flag,
120 this->segmented_reduce<true>(input, output, flag, reduce_op);
// Forwarding overload of tail_segmented_reduce: it calls
// segmented_reduce<false>(input, output, flag, reduce_op).
// NOTE(review): the tail of the parameter list is cut off in this excerpt;
// presumably it continues with an unused storage argument followed by
// `reduce_op` -- TODO confirm.
123 template<
class Flag,
class BinaryFunction>
124 ROCPRIM_DEVICE ROCPRIM_INLINE
125 void tail_segmented_reduce(T input, T& output, Flag flag,
129 this->segmented_reduce<false>(input, output, flag, reduce_op);
// Shared implementation behind head_segmented_reduce and
// tail_segmented_reduce; HeadSegmented tells last_in_warp_segment which
// flag convention is in use.
// NOTE(review): braces (and possibly other lines) are missing from this
// excerpt.
133 template<
bool HeadSegmented,
class Flag,
class BinaryFunction>
134 ROCPRIM_DEVICE ROCPRIM_INLINE
135 void segmented_reduce(T input, T& output, Flag flag, BinaryFunction reduce_op)
// Upper bound handed to reduce() as `valid_items`.
// Presumably last_in_warp_segment returns the index of the last lane in the
// caller's segment, so adding 1 yields an exclusive bound -- TODO confirm.
139 auto valid_items_in_segment = last_in_warp_segment<HeadSegmented, WarpSize>(flag) + 1U;
// The explicit <false> overrides UseAllReduceDummy, so the
// set_output<false> overload is used regardless of the class-level flag.
140 this->
reduce<false>(input, output, valid_items_in_segment, reduce_op);
// set_output overload that takes part in overload resolution only when
// Switch is false (selected through std::enable_if on the return type).
// NOTE(review): the body is not visible in this excerpt; presumably it
// leaves `output` untouched -- TODO confirm.
143 template<
bool Switch>
144 ROCPRIM_DEVICE ROCPRIM_INLINE
145 typename std::enable_if<(Switch == false)>::type
146 set_output(T& output)
// set_output overload that takes part in overload resolution only when
// Switch is true (selected through std::enable_if on the return type).
// NOTE(review): the body is not visible in this excerpt; presumably it
// broadcasts the final result to every lane via warp_shuffle so that all
// lanes hold it -- TODO confirm.
152 template<
bool Switch>
153 ROCPRIM_DEVICE ROCPRIM_INLINE
154 typename std::enable_if<(Switch == true)>::type
155 set_output(T& output)
163 END_ROCPRIM_NAMESPACE
165 #endif // ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHUFFLE_HPP_ ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle(const T &input, const int src_lane, const int width=device_warp_size())
Shuffle for any data type.
Definition: warp_shuffle.hpp:172
Definition: benchmark_block_reduce.cpp:63
ROCPRIM_DEVICE ROCPRIM_INLINE T warp_shuffle_down(const T &input, const unsigned int delta, const int width=device_warp_size())
Shuffle down for any data type.
Definition: warp_shuffle.hpp:222
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: warp_reduce_shuffle.hpp:43
Definition: various.hpp:52