wichtounet/etl/vec_2sum_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 #pragma once

 namespace etl::impl::vec {

 /*!
  * \brief Vectorized computation of the sum of all the elements of an expression.
  *
  * The bulk of the data is consumed with vector loads (four vectors per
  * iteration, then two, then one), each stream feeding its own vector
  * accumulator. The accumulators are reduced horizontally and the elements
  * that do not fill a complete vector are added with scalar code.
  *
  * \tparam V The vectorization backend providing zero / load / add / hadd
  * \tparam L The type of the expression to reduce
  * \param lhs The expression to reduce
  * \return The sum of all the elements of lhs
  */
 template <typename V, typename L>
 value_t<L> sum_impl(const L& lhs) {
     // Note: Padding cannot be taken into account here because we don't start at zero

     using T = value_t<L>;

     // Number of elements of type T held by one vector of the backend
     static constexpr size_t lanes = V::template traits<T>::size;

     const size_t count = etl::size(lhs);

     safe_ensure_cpu_up_to_date(lhs);

     // One vector accumulator per unrolled stream, all starting from zero
     auto acc_a = V::template zero<T>();
     auto acc_b = V::template zero<T>();
     auto acc_c = V::template zero<T>();
     auto acc_d = V::template zero<T>();

     size_t pos = 0;

     // Main loop: four full vectors per iteration
     while (pos + 4 * lanes <= count) {
         acc_a = V::add(lhs.template load<V>(pos), acc_a);
         acc_b = V::add(lhs.template load<V>(pos + lanes), acc_b);
         acc_c = V::add(lhs.template load<V>(pos + 2 * lanes), acc_c);
         acc_d = V::add(lhs.template load<V>(pos + 3 * lanes), acc_d);

         pos += 4 * lanes;
     }

     // Remainder: two full vectors per iteration
     while (pos + 2 * lanes <= count) {
         acc_a = V::add(lhs.template load<V>(pos), acc_a);
         acc_b = V::add(lhs.template load<V>(pos + lanes), acc_b);

         pos += 2 * lanes;
     }

     // Remainder: one full vector per iteration
     while (pos + lanes <= count) {
         acc_a = V::add(lhs.template load<V>(pos), acc_a);

         pos += lanes;
     }

     // Collapse the four vector accumulators into a single scalar
     auto even_sum = V::hadd(acc_a) + V::hadd(acc_b) + V::hadd(acc_c) + V::hadd(acc_d);
     auto odd_sum  = T();

     // Scalar tail, two elements at a time on two distinct partial sums
     while (pos + 2 <= count) {
         even_sum += lhs[pos];
         odd_sum += lhs[pos + 1];

         pos += 2;
     }

     // At most one element is left at this point
     if (pos < count) {
         even_sum += lhs[pos];
     }

     return even_sum + odd_sum;
 }

 /*!
  * \brief Vectorized absolute sum computation.
  *
  * The absolute value of a vector v is obtained as max(v, 0 - v). The bulk of
  * the data is consumed with vector loads (four vectors per iteration, then
  * two, then one), each stream feeding its own vector accumulator. The
  * accumulators are reduced horizontally and the elements that do not fill a
  * complete vector are handled with scalar code.
  *
  * \tparam V The vectorization backend providing zero / load / sub / max / add / hadd
  * \tparam L The type of the expression to reduce
  * \param lhs The expression to reduce
  * \return The sum of the absolute values of all the elements of lhs
  */
 template <typename V, typename L>
 value_t<L> asum_impl(const L& lhs) {
     // Note: Padding cannot be taken into account here because we don't start at zero

     using T = value_t<L>;
     using std::abs;

     // Number of elements of type T held by one vector of the backend
     static constexpr size_t lanes = V::template traits<T>::size;

     const size_t count = etl::size(lhs);

     safe_ensure_cpu_up_to_date(lhs);

     // Absolute value of a whole vector: max(v, 0 - v)
     auto vec_abs = [](const auto& v) { return V::max(v, V::sub(V::template zero<T>(), v)); };

     // One vector accumulator per unrolled stream, all starting from zero
     auto acc_a = V::template zero<T>();
     auto acc_b = V::template zero<T>();
     auto acc_c = V::template zero<T>();
     auto acc_d = V::template zero<T>();

     size_t pos = 0;

     // Main loop: four full vectors per iteration
     while (pos + 4 * lanes <= count) {
         auto in_a = lhs.template load<V>(pos);
         auto in_b = lhs.template load<V>(pos + lanes);
         auto in_c = lhs.template load<V>(pos + 2 * lanes);
         auto in_d = lhs.template load<V>(pos + 3 * lanes);

         acc_a = V::add(vec_abs(in_a), acc_a);
         acc_b = V::add(vec_abs(in_b), acc_b);
         acc_c = V::add(vec_abs(in_c), acc_c);
         acc_d = V::add(vec_abs(in_d), acc_d);

         pos += 4 * lanes;
     }

     // Remainder: two full vectors per iteration
     while (pos + 2 * lanes <= count) {
         auto in_a = lhs.template load<V>(pos);
         auto in_b = lhs.template load<V>(pos + lanes);

         acc_a = V::add(vec_abs(in_a), acc_a);
         acc_b = V::add(vec_abs(in_b), acc_b);

         pos += 2 * lanes;
     }

     // Remainder: one full vector per iteration
     while (pos + lanes <= count) {
         auto in_a = lhs.template load<V>(pos);

         acc_a = V::add(vec_abs(in_a), acc_a);

         pos += lanes;
     }

     // Collapse the four vector accumulators into a single scalar
     auto even_sum = V::hadd(acc_a) + V::hadd(acc_b) + V::hadd(acc_c) + V::hadd(acc_d);
     auto odd_sum  = T();

     // Scalar tail, two elements at a time on two distinct partial sums
     while (pos + 2 <= count) {
         even_sum += abs(lhs[pos]);
         odd_sum += abs(lhs[pos + 1]);

         pos += 2;
     }

     // At most one element is left at this point
     if (pos < count) {
         even_sum += abs(lhs[pos]);
     }

     return even_sum + odd_sum;
 }

 /*!
  * \brief Compute the sum of all the elements of lhs with the vectorized kernel.
  *
  * Expressions smaller than sum_parallel_threshold are reduced directly with
  * sum_impl. Otherwise, the work is handed to engine_dispatch_1d_acc_slice,
  * which applies sum_impl to slices of lhs and feeds every partial result to
  * an accumulator functor.
  *
  * When vectorization is not enabled or L is not vectorizable for the current
  * vector mode, the call is flagged as unreachable.
  *
  * \tparam L The type of the expression to reduce
  * \param lhs The expression to reduce
  * \return The sum of all the elements of lhs
  */
 template <typename L>
 value_t<L> sum([[maybe_unused]] const L& lhs) {
     if constexpr (vec_enabled && all_vectorizable<vector_mode, L>) {
         // Small expressions: reduce in one go, without going through the dispatcher
         // NOTE(review): this shortcut is gated by sum_parallel_threshold whereas the
         // dispatcher below receives vec_sum_parallel_threshold -- confirm that using
         // two different constants is intended.
         if (etl::size(lhs) < sum_parallel_threshold) {
             return sum_impl<default_vec>(lhs);
         }

         using T = value_t<L>;

         T total(0);

         // Reduction of one slice; the default vectorization scheme should be sufficient
         auto reduce_slice = [](auto& slice) { return sum_impl<default_vec>(slice); };

         // Merge the partial sum of one slice into the final result
         auto merge_partial = [&total](T partial) { total += partial; };

         engine_dispatch_1d_acc_slice(lhs, reduce_slice, merge_partial, vec_sum_parallel_threshold);

         return total;
     } else {
         cpp_unreachable("vec::sum called with invalid parameters");
     }
 }

 /*!
  * \brief Compute the sum of the absolute values of all the elements of lhs
  * with the vectorized kernel.
  *
  * The work is handed to engine_dispatch_1d_acc_slice, which applies asum_impl
  * to slices of lhs and feeds every partial result to an accumulator functor.
  *
  * When vectorization is not enabled or L is not vectorizable for the current
  * vector mode, the call is flagged as unreachable.
  *
  * \tparam L The type of the expression to reduce
  * \param lhs The expression to reduce
  * \return The sum of the absolute values of all the elements of lhs
  */
 template <typename L>
 value_t<L> asum([[maybe_unused]] const L& lhs) {
     if constexpr (vec_enabled && all_vectorizable<vector_mode, L>) {
         using T = value_t<L>;

         T acc(0);

         // Merge the partial absolute sum of one slice into the final result
         auto acc_functor = [&acc](T value) { acc += value; };

         auto batch_fun = [](auto& sub) {
             // The default vectorization scheme should be sufficient
             return asum_impl<default_vec>(sub);
         };

         engine_dispatch_1d_acc_slice(lhs, batch_fun, acc_functor, vec_sum_parallel_threshold);

         return acc;
     } else {
         // The diagnostic must name this function (asum), not its sibling sum
         cpp_unreachable("vec::asum called with invalid parameters");
     }
 }

 } //end of namespace etl::impl::vec
etl::max
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65

etl::engine_dispatch_1d_acc_slice
void engine_dispatch_1d_acc_slice(E &&expr, Functor &&functor, AccFunctor &&acc_functor, [[maybe_unused]] size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of an ETL container in a parallel manner and use an accumulator functor to accu...
Definition: parallel_support.hpp:890

etl::impl::vec
Definition: bias_add.hpp:15

etl::vec_sum_parallel_threshold
constexpr size_t vec_sum_parallel_threshold
The minimum number of elements before considering parallel acc implementation.
Definition: threshold.hpp:69

etl::vec_enabled
constexpr bool vec_enabled
Indicates if vectorization is available in any format.
Definition: config.hpp:220

etl::vec_type
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43

etl::sum_impl
sum_impl
Enumeration describing the different implementations of sum.
Definition: sum_impl.hpp:20

etl::abs
auto abs(E &&value)
Apply absolute on each value of the given expression.
Definition: expression_builder.hpp:54

etl::asum
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637

etl::impl::vec::asum_impl
value_t< L > asum_impl(const L &lhs)
Vectorized absolute sum computation.
Definition: sum.hpp:81

etl::sum_parallel_threshold
constexpr size_t sum_parallel_threshold
The minimum number of elements before considering parallel acc implementation.
Definition: threshold.hpp:68

etl::sum
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624

etl::safe_ensure_cpu_up_to_date
void safe_ensure_cpu_up_to_date(E &&expr)
Ensure that the CPU is up to date.
Definition: helpers.hpp:278

etl::size
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108

etl::value_t
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81