23 template <
typename V,
typename L>
32 static constexpr
size_t vec_size = vec_type::template traits<T>::size;
38 auto r1 = vec_type::template zero<T>();
39 auto r2 = vec_type::template zero<T>();
40 auto r3 = vec_type::template zero<T>();
41 auto r4 = vec_type::template zero<T>();
43 for (; i + (vec_size * 4) - 1 < n; i += 4 * vec_size) {
44 r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
45 r2 = vec_type::add(lhs.template load<vec_type>(i + 1 * vec_size), r2);
46 r3 = vec_type::add(lhs.template load<vec_type>(i + 2 * vec_size), r3);
47 r4 = vec_type::add(lhs.template load<vec_type>(i + 3 * vec_size), r4);
50 for (; i + (vec_size * 2) - 1 < n; i += 2 * vec_size) {
51 r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
52 r2 = vec_type::add(lhs.template load<vec_type>(i + 1 * vec_size), r2);
55 for (; i + vec_size - 1 < n; i += vec_size) {
56 r1 = vec_type::add(lhs.template load<vec_type>(i + 0 * vec_size), r1);
59 auto p1 = vec_type::hadd(r1) + vec_type::hadd(r2) + vec_type::hadd(r3) + vec_type::hadd(r4);
62 for (; i + 1 < n; i += 2) {
80 template <
typename V,
typename L>
90 static constexpr
size_t vec_size = vec_type::template traits<T>::size;
96 auto r1 = vec_type::template zero<T>();
97 auto r2 = vec_type::template zero<T>();
98 auto r3 = vec_type::template zero<T>();
99 auto r4 = vec_type::template zero<T>();
101 for (; i + (vec_size * 4) - 1 < n; i += 4 * vec_size) {
102 auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
103 auto v2 = lhs.template load<vec_type>(i + 1 * vec_size);
104 auto v3 = lhs.template load<vec_type>(i + 2 * vec_size);
105 auto v4 = lhs.template load<vec_type>(i + 3 * vec_size);
107 auto x1 =
vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
108 auto x2 =
vec_type::max(v2, vec_type::sub(vec_type::template zero<T>(), v2));
109 auto x3 =
vec_type::max(v3, vec_type::sub(vec_type::template zero<T>(), v3));
110 auto x4 =
vec_type::max(v4, vec_type::sub(vec_type::template zero<T>(), v4));
112 r1 = vec_type::add(x1, r1);
113 r2 = vec_type::add(x2, r2);
114 r3 = vec_type::add(x3, r3);
115 r4 = vec_type::add(x4, r4);
118 for (; i + (vec_size * 2) - 1 < n; i += 2 * vec_size) {
119 auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
120 auto v2 = lhs.template load<vec_type>(i + 1 * vec_size);
122 auto x1 =
vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
123 auto x2 =
vec_type::max(v2, vec_type::sub(vec_type::template zero<T>(), v2));
125 r1 = vec_type::add(x1, r1);
126 r2 = vec_type::add(x2, r2);
129 for (; i + vec_size - 1 < n; i += vec_size) {
130 auto v1 = lhs.template load<vec_type>(i + 0 * vec_size);
131 auto x1 =
vec_type::max(v1, vec_type::sub(vec_type::template zero<T>(), v1));
132 r1 = vec_type::add(x1, r1);
135 auto p1 = vec_type::hadd(r1) + vec_type::hadd(r2) + vec_type::hadd(r3) + vec_type::hadd(r4);
138 for (; i + 1 < n; i += 2) {
140 p2 +=
abs(lhs[i + 1]);
155 template <
typename L>
157 if constexpr (
vec_enabled && all_vectorizable<vector_mode, L>) {
162 auto acc_functor = [&acc](T value) { acc += value; };
164 auto batch_fun = [](
auto& sub) {
177 cpp_unreachable(
"vec::sum called with invalid parameters");
186 template <
typename L>
188 if constexpr (
vec_enabled && all_vectorizable<vector_mode, L>) {
193 auto acc_functor = [&acc](T value) { acc += value; };
195 auto batch_fun = [](
auto& sub) {
197 return asum_impl<default_vec>(sub);
204 cpp_unreachable(
"vec::sum called with invalid parameters");
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
void engine_dispatch_1d_acc_slice(E &&expr, Functor &&functor, AccFunctor &&acc_functor, [[maybe_unused]] size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of an ETL container in a parallel manner and use an accumulator functor to accumulate the results.
Definition: parallel_support.hpp:890
Definition: bias_add.hpp:15
constexpr size_t vec_sum_parallel_threshold
The minimum number of elements before considering the parallel accumulation implementation.
Definition: threshold.hpp:69
constexpr bool vec_enabled
Indicates if vectorization is available in any format.
Definition: config.hpp:220
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
sum_impl
Enumeration describing the different implementations of sum.
Definition: sum_impl.hpp:20
auto abs(E &&value)
Apply the absolute value function to each value of the given expression.
Definition: expression_builder.hpp:54
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637
value_t< L > asum_impl(const L &lhs)
Vectorized absolute sum computation.
Definition: sum.hpp:81
constexpr size_t sum_parallel_threshold
The minimum number of elements before considering the parallel accumulation implementation.
Definition: threshold.hpp:68
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
void safe_ensure_cpu_up_to_date(E &&expr)
Ensure that the CPU is up to date.
Definition: helpers.hpp:278
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81