21 #ifndef ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHARED_MEM_HPP_ 22 #define ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHARED_MEM_HPP_ 24 #include <type_traits> 26 #include "../../config.hpp" 27 #include "../../intrinsics.hpp" 28 #include "../../types.hpp" 29 #include "../../detail/various.hpp" 31 #include "warp_segment_bounds.hpp" 33 BEGIN_ROCPRIM_NAMESPACE
40 unsigned int WarpSize,
53 template<
class BinaryFunction>
54 ROCPRIM_DEVICE ROCPRIM_INLINE
57 constexpr
unsigned int ceiling = next_power_of_two(WarpSize);
58 const unsigned int lid = detail::logical_lane_id<WarpSize>();
59 storage_type_& storage_ = storage.get();
62 storage_.values[lid] = output;
65 for(
unsigned int i = ceiling >> 1; i > 0; i >>= 1)
67 const bool do_op = lid + i < WarpSize && lid < i;
70 output = storage_.values[lid];
71 T other = storage_.values[lid + i];
72 output = reduce_op(output, other);
77 storage_.values[lid] = output;
81 set_output<UseAllReduce>(output, storage);
84 template<
class BinaryFunction>
85 ROCPRIM_DEVICE ROCPRIM_INLINE
86 void reduce(T input, T& output,
unsigned int valid_items,
89 constexpr
unsigned int ceiling = next_power_of_two(WarpSize);
90 const unsigned int lid = detail::logical_lane_id<WarpSize>();
91 storage_type_& storage_ = storage.get();
94 storage_.values[lid] = output;
97 for(
unsigned int i = ceiling >> 1; i > 0; i >>= 1)
99 const bool do_op = (lid + i) < WarpSize && lid < i && (lid + i) < valid_items;
102 output = storage_.values[lid];
103 T other = storage_.values[lid + i];
104 output = reduce_op(output, other);
109 storage_.values[lid] = output;
113 set_output<UseAllReduce>(output, storage);
116 template<
class Flag,
class BinaryFunction>
117 ROCPRIM_DEVICE ROCPRIM_INLINE
118 void head_segmented_reduce(T input, T& output, Flag flag,
121 this->segmented_reduce<true>(input, output, flag, storage, reduce_op);
124 template<
class Flag,
class BinaryFunction>
125 ROCPRIM_DEVICE ROCPRIM_INLINE
126 void tail_segmented_reduce(T input, T& output, Flag flag,
129 this->segmented_reduce<false>(input, output, flag, storage, reduce_op);
133 template<
bool HeadSegmented,
class Flag,
class BinaryFunction>
134 ROCPRIM_DEVICE ROCPRIM_INLINE
135 void segmented_reduce(T input, T& output, Flag flag,
138 const unsigned int lid = detail::logical_lane_id<WarpSize>();
139 constexpr
unsigned int ceiling = next_power_of_two(WarpSize);
140 storage_type_& storage_ = storage.get();
142 auto last = last_in_warp_segment<HeadSegmented, WarpSize>(flag);
146 for(
unsigned int i = 1; i < ceiling; i *= 2)
148 storage_.values[lid] = output;
150 if((lid + i) <= last)
152 T other = storage_.values[lid + i];
153 output = reduce_op(output, other);
159 template<
bool Switch>
160 ROCPRIM_DEVICE ROCPRIM_INLINE
161 typename std::enable_if<(Switch == false)>::type
169 template<
bool Switch>
170 ROCPRIM_DEVICE ROCPRIM_INLINE
171 typename std::enable_if<(Switch == true)>::type
174 storage_type_& storage_ = storage.get();
175 output = storage_.values[0];
181 END_ROCPRIM_NAMESPACE
183 #endif // ROCPRIM_WARP_DETAIL_WARP_REDUCE_SHARED_MEM_HPP_ Definition: benchmark_block_reduce.cpp:63
ROCPRIM_DEVICE ROCPRIM_INLINE void wave_barrier()
Synchronize all threads in the wavefront.
Definition: thread.hpp:235
Deprecated: Configuration of device-level scan primitives.
Definition: block_histogram.hpp:62
Definition: warp_reduce_shared_mem.hpp:43