Expression Templates Library (ETL)
sum.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 namespace etl::impl::vec {
16 
23 template <typename V, typename L>
24 value_t<L> sum_impl(const L& lhs) {
25  //Note: Padding cannot be taken into account we don't start at zero
26 
27  using vec_type = V;
28  using T = value_t<L>;
29 
30  const size_t n = etl::size(lhs);
31 
32  static constexpr size_t vec_size = vec_type::template traits<T>::size;
33 
35 
36  size_t i = 0;
37 
38  auto r1 = vec_type::template zero<T>();
39  auto r2 = vec_type::template zero<T>();
40  auto r3 = vec_type::template zero<T>();
41  auto r4 = vec_type::template zero<T>();
42 
43  for (; i + (vec_size * 4) - 1 < n; i += 4 * vec_size) {
44  r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
45  r2 = vec_type::add(lhs.template load<vec_type>(i + 1 * vec_size), r2);
46  r3 = vec_type::add(lhs.template load<vec_type>(i + 2 * vec_size), r3);
47  r4 = vec_type::add(lhs.template load<vec_type>(i + 3 * vec_size), r4);
48  }
49 
50  for (; i + (vec_size * 2) - 1 < n; i += 2 * vec_size) {
51  r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
52  r2 = vec_type::add(lhs.template load<vec_type>(i + 1 * vec_size), r2);
53  }
54 
55  for (; i + vec_size - 1 < n; i += vec_size) {
56  r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
57  }
58 
59  auto p1 = vec_type::hadd(r1) + vec_type::hadd(r2) + vec_type::hadd(r3) + vec_type::hadd(r4);
60  auto p2 = T();
61 
62  for (; i + 1 < n; i += 2) {
63  p1 += lhs[i];
64  p2 += lhs[i + 1];
65  }
66 
67  if (i < n) {
68  p1 += lhs[i];
69  }
70 
71  return p1 + p2;
72 }
73 
80 template <typename V, typename L>
81 value_t<L> asum_impl(const L& lhs) {
82  //Note: Padding cannot be taken into account we don't start at zero
83 
84  using vec_type = V;
85  using T = value_t<L>;
86  using std::abs;
87 
88  const size_t n = etl::size(lhs);
89 
90  static constexpr size_t vec_size = vec_type::template traits<T>::size;
91 
93 
94  size_t i = 0;
95 
96  auto r1 = vec_type::template zero<T>();
97  auto r2 = vec_type::template zero<T>();
98  auto r3 = vec_type::template zero<T>();
99  auto r4 = vec_type::template zero<T>();
100 
101  for (; i + (vec_size * 4) - 1 < n; i += 4 * vec_size) {
102  auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
103  auto v2 = lhs.template load<vec_type>(i + 1 * vec_size);
104  auto v3 = lhs.template load<vec_type>(i + 2 * vec_size);
105  auto v4 = lhs.template load<vec_type>(i + 3 * vec_size);
106 
107  auto x1 = vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
108  auto x2 = vec_type::max(v2, vec_type::sub(vec_type::template zero<T>(), v2));
109  auto x3 = vec_type::max(v3, vec_type::sub(vec_type::template zero<T>(), v3));
110  auto x4 = vec_type::max(v4, vec_type::sub(vec_type::template zero<T>(), v4));
111 
112  r1 = vec_type::add(x1, r1);
113  r2 = vec_type::add(x2, r2);
114  r3 = vec_type::add(x3, r3);
115  r4 = vec_type::add(x4, r4);
116  }
117 
118  for (; i + (vec_size * 2) - 1 < n; i += 2 * vec_size) {
119  auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
120  auto v2 = lhs.template load<vec_type>(i + 1 * vec_size);
121 
122  auto x1 = vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
123  auto x2 = vec_type::max(v2, vec_type::sub(vec_type::template zero<T>(), v2));
124 
125  r1 = vec_type::add(x1, r1);
126  r2 = vec_type::add(x2, r2);
127  }
128 
129  for (; i + vec_size - 1 < n; i += vec_size) {
130  auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
131  auto x1 = vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
132  r1 = vec_type::add(x1, r1);
133  }
134 
135  auto p1 = vec_type::hadd(r1) + vec_type::hadd(r2) + vec_type::hadd(r3) + vec_type::hadd(r4);
136  auto p2 = T();
137 
138  for (; i + 1 < n; i += 2) {
139  p1 += abs(lhs[i]);
140  p2 += abs(lhs[i + 1]);
141  }
142 
143  if (i < n) {
144  p1 += abs(lhs[i]);
145  }
146 
147  return p1 + p2;
148 }
149 
155 template <typename L>
156 value_t<L> sum([[maybe_unused]] const L& lhs) {
157  if constexpr (vec_enabled && all_vectorizable<vector_mode, L>) {
158  using T = value_t<L>;
159 
160  T acc(0);
161 
162  auto acc_functor = [&acc](T value) { acc += value; };
163 
164  auto batch_fun = [](auto& sub) {
165  // The default vectorization scheme should be sufficient
166  return sum_impl<default_vec>(sub);
167  };
168 
169  if (etl::size(lhs) < sum_parallel_threshold) {
170  return sum_impl<default_vec>(lhs);
171  } else {
172  engine_dispatch_1d_acc_slice(lhs, batch_fun, acc_functor, vec_sum_parallel_threshold);
173  }
174 
175  return acc;
176  } else {
177  cpp_unreachable("vec::sum called with invalid parameters");
178  }
179 }
180 
186 template <typename L>
187 value_t<L> asum([[maybe_unused]] const L& lhs) {
188  if constexpr (vec_enabled && all_vectorizable<vector_mode, L>) {
189  using T = value_t<L>;
190 
191  T acc(0);
192 
193  auto acc_functor = [&acc](T value) { acc += value; };
194 
195  auto batch_fun = [](auto& sub) {
196  // The default vectorization scheme should be sufficient
197  return asum_impl<default_vec>(sub);
198  };
199 
200  engine_dispatch_1d_acc_slice(lhs, batch_fun, acc_functor, vec_sum_parallel_threshold);
201 
202  return acc;
203  } else {
204  cpp_unreachable("vec::sum called with invalid parameters");
205  }
206 }
207 
208 } //end of namespace etl::impl::vec
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
void engine_dispatch_1d_acc_slice(E &&expr, Functor &&functor, AccFunctor &&acc_functor, [[maybe_unused]] size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of an ETL container in a parallel manner and use an accumulator functor to accumulate the results.
Definition: parallel_support.hpp:890
Definition: bias_add.hpp:15
constexpr size_t vec_sum_parallel_threshold
The minimum number of elements before considering parallel acc implementation.
Definition: threshold.hpp:69
constexpr bool vec_enabled
Indicates if vectorization is available in any format.
Definition: config.hpp:220
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
sum_impl
Enumeration describing the different implementations of sum.
Definition: sum_impl.hpp:20
auto abs(E &&value)
Apply absolute on each value of the given expression.
Definition: expression_builder.hpp:54
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637
value_t< L > asum_impl(const L &lhs)
Vectorized absolute sum computation.
Definition: sum.hpp:81
constexpr size_t sum_parallel_threshold
The minimum number of elements before considering parallel acc implementation.
Definition: threshold.hpp:68
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
void safe_ensure_cpu_up_to_date(E &&expr)
Ensure that the CPU is up to date.
Definition: helpers.hpp:278
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81