wichtounet/etl/evaluator_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 /*
  * Possible improvements
  *  * The pre/post functions should be refactored so that they are less heavy on the code (too much usage)
  *  * Compound operations should ideally be direct evaluated
  */

 #pragma once

 #include "etl/eval_selectors.hpp"       //Method selectors
 #include "etl/linear_eval_functors.hpp" //Implementation functors
 #include "etl/vec_eval_functors.hpp"    //Implementation functors

 namespace etl {

 /*
  * \brief The evaluator is responsible for assigning one expression to another.
  *
  * The implementation is chosen at compile time, using if constexpr on the detail selector traits.
  */
 namespace standard_evaluator {
 template <typename E>
 void pre_assign_rhs(E&& expr) {
     detail::evaluator_visitor eval_visitor;
     expr.visit(eval_visitor);
 }

 /*!
  * \brief Apply the functor Fun to (result, expr), going through the 1D slice
  * dispatcher when parallel support is compiled in.
  * \tparam Fun The functor type, used through its static apply(lhs, rhs)
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Fun, typename E, typename R>
 void par_exec(E&& expr, R&& result) {
     if constexpr (!parallel_support) {
         // No parallel engine: a single call covers everything
         Fun::apply(result, expr);
     } else {
         // Forwards whatever (lhs, rhs) pair the dispatcher passes in to Fun::apply
         auto apply_on_slice = [&](auto&& lhs_part, auto&& rhs_part) { Fun::apply(lhs_part, rhs_part); };

         engine_dispatch_1d_slice_binary(result, expr, apply_on_slice, 0);
     }
 }

 // Assign functions implementations

 /*!
  * \brief Assign the expression to the result element by element, reading the
  * source with read_flat and writing through operator[].
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void standard_assign_impl(E& expr, R& result) {
     inc_counter("std:assign");

     const size_t count = etl::size(result);

     for (size_t pos = 0; pos < count; ++pos) {
         result[pos] = expr.read_flat(pos);
     }
 }

 /*!
  * \brief Assign expr to result by raw memory copy, mirroring on result the
  * CPU/GPU up-to-date status of expr.
  *
  * Three compile-time configurations are handled:
  *  - R is not a gpu_dyn_matrix and CUDA is enabled: whichever side(s) of expr
  *    are up to date are copied, and result ends with the same status flags.
  *  - R is not a gpu_dyn_matrix and CUDA is disabled: plain CPU copy.
  *  - R is a gpu_dyn_matrix: GPU copy only (unreachable without CUDA).
  *
  * \param expr The right-hand side, exposing memory_start()/memory_end() and,
  *             with CUDA, gpu_memory() and the is_*_up_to_date() queries
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void fast_assign_impl_full(E& expr, R& result) {
     if constexpr (!is_gpu_dyn_matrix<R>) {
         if constexpr (cuda_enabled) {
             // At least one side of the source must hold valid data
             cpp_assert(expr.is_cpu_up_to_date() || expr.is_gpu_up_to_date(), "expr must be in valid state");

             // CPU side valid: copy the host memory and mark the result CPU side valid
             if (expr.is_cpu_up_to_date()) {
                 inc_counter("fast:copy");

                 direct_copy(expr.memory_start(), expr.memory_end(), result.memory_start());

                 result.validate_cpu();
             }

             // GPU side valid: copy the device memory as well
             if (expr.is_gpu_up_to_date()) {
                 inc_counter("gpu:copy");

                 // Remember the source CPU status before the GPU copy touches the flags
                 bool cpu_status = expr.is_cpu_up_to_date();

                 result.ensure_gpu_allocated();
                 result.gpu_copy_from(expr.gpu_memory());

                 // Restore CPU status because gpu_copy_from will erase it
                 if (cpu_status) {
                     result.validate_cpu();
                 }
             }

             // Invalidation must be done after validation to preserve
             // valid CPU/GPU state

             if (!expr.is_cpu_up_to_date()) {
                 result.invalidate_cpu();
             }

             if (!expr.is_gpu_up_to_date()) {
                 result.invalidate_gpu();
             }

             // Post-condition: result carries exactly the status flags of expr
             cpp_assert(expr.is_cpu_up_to_date() == result.is_cpu_up_to_date(), "fast_assign must preserve CPU status");
             cpp_assert(expr.is_gpu_up_to_date() == result.is_gpu_up_to_date(), "fast_assign must preserve GPU status");
         } else {
             // No CUDA: only host memory exists, and no status flag is touched here
             direct_copy(expr.memory_start(), expr.memory_end(), result.memory_start());
         }
     } else {
         if constexpr (cuda_enabled) {
             // For a gpu_dyn_matrix result only the device copy is performed,
             // so the source must be valid on the GPU
             cpp_assert(expr.is_gpu_up_to_date(), "expr must be in valid state");

             inc_counter("gpu:copy");

             result.ensure_gpu_allocated();
             result.gpu_copy_from(expr.gpu_memory());

             // Invalidation must be done after validation to preserve
             // valid CPU/GPU state

             result.validate_gpu();
             result.invalidate_cpu();

             cpp_assert(result.is_gpu_up_to_date(), "fast_assign must preserve GPU status");
         } else {
             cpp_unreachable("gpu_dyn_matrix should never be used without GPU support");
         }
     }
 }

 /*!
  * \brief Assign expr to result with a direct CPU memory copy.
  *
  * This is the variant the selectors pick when the value types of E and R
  * differ. The result ends up valid on CPU and invalid on GPU.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side (must not be a gpu_dyn_matrix)
  */
 template <typename E, typename R>
 void fast_assign_impl(E& expr, R& result) {
     static_assert(!is_gpu_dyn_matrix<R>, "gpu_dyn_matrix should not be used here");

     inc_counter("fast:copy");

     // The copy reads host memory, so bring the CPU side of the source up to date first
     expr.ensure_cpu_up_to_date();

     auto source_first = expr.memory_start();
     auto source_last  = expr.memory_end();

     direct_copy(source_first, source_last, result.memory_start());

     // Only the CPU side of the result has been written
     result.validate_cpu();
     result.invalidate_gpu();
 }

 /*!
  * \brief Assign expr to result by computing the expression on the GPU.
  *
  * A binary expression that aliases the result is computed through
  * smart_gpu_compute_hint and then copied into the result; every other
  * expression is computed straight into the result.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void gpu_assign_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_allocated();

     // Compute the GPU representation of the expression into the result
     auto compute_in_place = [&]() { smart_gpu_compute(expr, result); };

     if constexpr (!is_binary_expr<E>) {
         compute_in_place();
     } else {
         if (!expr.alias(result)) {
             compute_in_place();
         } else {
             // Aliasing: compute the GPU representation first, then copy
             // its GPU memory into the result
             decltype(auto) computed = smart_gpu_compute_hint(expr, result);

             result.gpu_copy_from(computed.gpu_memory());
         }
     }

     // The GPU side now holds the data, the CPU side is stale
     result.validate_gpu();
     result.invalidate_cpu();
 }

 template <typename E, typename R>
 void direct_assign_impl(E& expr, R& result) {
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par:assign");
             par_exec<detail::Assign>(expr, result);
         } else {
             inc_counter("mem:assign");
             detail::Assign::apply(result, expr);
         }
     } else {
         inc_counter("mem:assign");
         detail::Assign::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void vectorized_assign_impl(E& expr, R& result) {
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     constexpr auto V = detail::select_vector_mode<E, R>();

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par_vec:assign");
             par_exec<detail::VectorizedAssign<V>>(expr, result);
         } else {
             inc_counter("vec:assign");
             detail::VectorizedAssign<V>::apply(result, expr);
         }
     } else {
         inc_counter("vec:assign");
         detail::VectorizedAssign<V>::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 // Selector versions

 /*!
  * \brief Select, at compile time, the CPU implementation used to assign expr
  * to result, using the *_no_gpu selector traits.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void assign_evaluate_impl_no_gpu(E&& expr, R&& result) {
     if constexpr (detail::standard_assign_no_gpu<E, R>) {
         standard_assign_impl(expr, result);
     } else if constexpr (detail::fast_assign_no_gpu<E, R>) {
         // Same value type: full copy with status tracking; otherwise the
         // CPU-only copy
         if constexpr (std::is_same_v<value_t<E>, value_t<R>>) {
             fast_assign_impl_full(expr, result);
         } else {
             fast_assign_impl(expr, result);
         }
     } else if constexpr (detail::direct_assign_no_gpu<E, R>) {
         direct_assign_impl(expr, result);
     } else if constexpr (detail::vectorized_assign_no_gpu<E, R>) {
         vectorized_assign_impl(expr, result);
     }
 }

 /*!
  * \brief Select the implementation used to assign expr to result.
  *
  * The strategy is picked at compile time from the detail selector traits.
  * For GPU-assignable pairs, a runtime check decides between the GPU
  * implementation and the CPU implementations.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void assign_evaluate_impl(E&& expr, R&& result) {
     if constexpr (detail::standard_assign<E, R>) {
         standard_assign_impl(expr, result);
     } else if constexpr (std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign<E, R>) {
         fast_assign_impl_full(expr, result);
     } else if constexpr (!std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign<E, R>) {
         fast_assign_impl(expr, result);
     } else if constexpr (detail::gpu_assign<E, R>) {
         if (local_context().cpu || is_something_forced()) {
             // assign_evaluate does not visit the expression when gpu_assign
             // is true, so the CPU fallback has to do it itself before the
             // CPU implementations read the expression.
             pre_assign_rhs(expr);

             assign_evaluate_impl_no_gpu(expr, result);
         } else {
             gpu_assign_impl(expr, result);
         }
     } else if constexpr (detail::direct_assign<E, R>) {
         direct_assign_impl(expr, result);
     } else if constexpr (detail::vectorized_assign<E, R>) {
         vectorized_assign_impl(expr, result);
     }
 }

 // Compound Assign Add functions implementations

 /*!
  * \brief Add the expression to the result, element by element, on the CPU.
  *
  * The result ends up valid on CPU and invalid on GPU.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void standard_compound_add_impl(E& expr, R& result) {
     inc_counter("std:assign");

     pre_assign_rhs(expr);

     // The loop reads both operands through CPU memory, so both must be up
     // to date on the CPU first (as in the sub/mul/div/mod counterparts).
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     for (size_t i = 0; i < etl::size(result); ++i) {
         result[i] += expr[i];
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void direct_compound_add_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par:assign");
             par_exec<detail::AssignAdd>(expr, result);
         } else {
             inc_counter("mem:assign");
             detail::AssignAdd::apply(result, expr);
         }
     } else {
         inc_counter("mem:assign");
         detail::AssignAdd::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void vectorized_compound_add_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     constexpr auto V = detail::select_vector_mode<E, R>();

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par_vec:assign");
             par_exec<detail::VectorizedAssignAdd<V>>(expr, result);
         } else {
             inc_counter("vec:assign");
             detail::VectorizedAssignAdd<V>::apply(result, expr);
         }
     } else {
         inc_counter("vec:assign");
         detail::VectorizedAssignAdd<V>::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 #ifdef ETL_CUBLAS_MODE

 template <typename E, typename R>
 void gpu_compound_add_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_up_to_date();

     // Compute the GPU representation of the expression
     decltype(auto) t1 = smart_gpu_compute_hint(expr, result);

     value_t<E> alpha(1);
     impl::egblas::axpy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);

     // Validate the GPU and invalidates the CPU
     result.validate_gpu();
     result.invalidate_cpu();
 }

 #endif

 #ifdef ETL_EGBLAS_MODE

 /*!
  * \brief Add a scalar expression to the result on the GPU.
  * \param expr The scalar right-hand side (its value member is used)
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void gpu_compound_add_scalar_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     // The current content of result is an input of the operation
     result.ensure_gpu_up_to_date();

     const size_t count = etl::size(result);

     // Applied directly on the GPU memory of the result, with a stride of 1
     impl::egblas::scalar_add(result.gpu_memory(), count, 1, expr.value);

     // The GPU side now holds the data, the CPU side is stale
     result.validate_gpu();
     result.invalidate_cpu();
 }

 #endif

 // Selector functions

 /*!
  * \brief Select, at compile time, the CPU implementation used to add expr to
  * result, using the *_compound_no_gpu selector traits.
  *
  * At most one branch is taken; if no trait holds, nothing is done.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void add_evaluate_no_gpu(E&& expr, R&& result) {
     if constexpr (detail::standard_compound_no_gpu<E, R>) {
         // Element-by-element loop
         standard_compound_add_impl(expr, result);
     } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
         // detail::AssignAdd functor
         direct_compound_add_impl(expr, result);
     } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
         // detail::VectorizedAssignAdd functor
         vectorized_compound_add_impl(expr, result);
     }
 }

 /*!
  * \brief Select the implementation used to add expr to result.
  *
  * The strategy is picked at compile time from the detail selector traits.
  * In the GPU branches, a runtime check falls back to add_evaluate_no_gpu
  * when the local context asks for CPU or when something is forced.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void add_evaluate(E&& expr, R&& result) {
     if constexpr (detail::standard_compound<E, R>) {
         standard_compound_add_impl(expr, result);
     } else if constexpr (detail::direct_compound<E, R>) {
         direct_compound_add_impl(expr, result);
     } else if constexpr (detail::vectorized_compound<E, R>) {
         vectorized_compound_add_impl(expr, result);
     } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
         // Non-scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_add_impl(expr, result);
         } else {
             add_evaluate_no_gpu(expr, result);
         }
     } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
         // Scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_add_scalar_impl(expr, result);
         } else {
             add_evaluate_no_gpu(expr, result);
         }
     }
 }

 // Compound assign sub implementation functions

 /*!
  * \brief Subtract the expression from the result, element by element, on
  * the CPU.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void standard_compound_sub_impl(E& expr, R& result) {
     inc_counter("std:assign");

     pre_assign_rhs(expr);

     // Both operands are accessed through CPU memory
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     const size_t count = etl::size(result);

     for (size_t pos = 0; pos < count; ++pos) {
         result[pos] -= expr[pos];
     }

     // Only the CPU side of the result has been written
     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void direct_compound_sub_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             par_exec<detail::AssignSub>(expr, result);
             inc_counter("par:assign");
         } else {
             detail::AssignSub::apply(result, expr);
             inc_counter("mem:assign");
         }
     } else {
         detail::AssignSub::apply(result, expr);
         inc_counter("mem:assign");
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void vectorized_compound_sub_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     constexpr auto V = detail::select_vector_mode<E, R>();

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par_vec:assign");
             par_exec<detail::VectorizedAssignSub<V>>(expr, result);
         } else {
             inc_counter("vec:assign");
             detail::VectorizedAssignSub<V>::apply(result, expr);
         }
     } else {
         inc_counter("vec:assign");
         detail::VectorizedAssignSub<V>::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void gpu_compound_sub_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_up_to_date();

     // Compute the GPU representation of the expression
     decltype(auto) t1 = smart_gpu_compute_hint(expr, result);

     value_t<E> alpha(-1);
     impl::egblas::axpy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);

     // Validate the GPU and invalidates the CPU
     result.validate_gpu();
     result.invalidate_cpu();
 }

 #ifdef ETL_EGBLAS_MODE

 /*!
  * \brief Subtract a scalar expression from the result on the GPU, by adding
  * the negated scalar.
  * \param expr The scalar right-hand side (its value member is used)
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void gpu_compound_sub_scalar_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     // The current content of result is an input of the operation
     result.ensure_gpu_up_to_date();

     // Subtraction is expressed as the addition of the opposite value
     auto negated = -expr.value;

     const size_t count = etl::size(result);

     impl::egblas::scalar_add(result.gpu_memory(), count, 1, negated);

     // The GPU side now holds the data, the CPU side is stale
     result.validate_gpu();
     result.invalidate_cpu();
 }

 #endif

 // Selector functions

 /*!
  * \brief Select, at compile time, the CPU implementation used to subtract
  * expr from result, using the *_compound_no_gpu selector traits.
  *
  * At most one branch is taken; if no trait holds, nothing is done.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void sub_evaluate_no_gpu(E&& expr, R&& result) {
     if constexpr (detail::standard_compound_no_gpu<E, R>) {
         // Element-by-element loop
         standard_compound_sub_impl(expr, result);
     } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
         // detail::AssignSub functor
         direct_compound_sub_impl(expr, result);
     } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
         // detail::VectorizedAssignSub functor
         vectorized_compound_sub_impl(expr, result);
     }
 }

 /*!
  * \brief Select the implementation used to subtract expr from result.
  *
  * The strategy is picked at compile time from the detail selector traits.
  * In the GPU branches, a runtime check falls back to sub_evaluate_no_gpu
  * when the local context asks for CPU or when something is forced.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void sub_evaluate(E&& expr, R&& result) {
     if constexpr (detail::standard_compound<E, R>) {
         standard_compound_sub_impl(expr, result);
     } else if constexpr (detail::direct_compound<E, R>) {
         direct_compound_sub_impl(expr, result);
     } else if constexpr (detail::vectorized_compound<E, R>) {
         vectorized_compound_sub_impl(expr, result);
     } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
         // Non-scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_sub_impl(expr, result);
         } else {
             sub_evaluate_no_gpu(expr, result);
         }
     } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
         // Scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_sub_scalar_impl(expr, result);
         } else {
             sub_evaluate_no_gpu(expr, result);
         }
     }
 }

 // Compound assign mul implementation functions

 /*!
  * \brief Multiply the result by the expression, element by element, on the
  * CPU.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void standard_compound_mul_impl(E& expr, R& result) {
     inc_counter("std:assign");

     pre_assign_rhs(expr);

     // Both operands are accessed through CPU memory
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     const size_t count = etl::size(result);

     for (size_t pos = 0; pos < count; ++pos) {
         result[pos] *= expr[pos];
     }

     // Only the CPU side of the result has been written
     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void direct_compound_mul_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par:assign");
             par_exec<detail::AssignMul>(expr, result);
         } else {
             inc_counter("mem:assign");
             detail::AssignMul::apply(result, expr);
         }
     } else {
         inc_counter("mem:assign");
         detail::AssignMul::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void vectorized_compound_mul_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     constexpr auto V = detail::select_vector_mode<E, R>();

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par_vec:assign");
             par_exec<detail::VectorizedAssignMul<V>>(expr, result);
         } else {
             inc_counter("vec:assign");
             detail::VectorizedAssignMul<V>::apply(result, expr);
         }
     } else {
         inc_counter("vec:assign");
         detail::VectorizedAssignMul<V>::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 #ifdef ETL_EGBLAS_MODE

 template <typename E, typename R>
 void gpu_compound_mul_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_up_to_date();

     // Compute the GPU representation of the expression
     decltype(auto) t1 = smart_gpu_compute_hint(expr, result);

     value_t<E> alpha(1);
     impl::egblas::axmy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);

     // Validate the GPU and invalidates the CPU
     result.validate_gpu();
     result.invalidate_cpu();
 }

 #endif

 /*!
  * \brief Multiply the result by a scalar expression on the GPU.
  * \param expr The scalar right-hand side (its value member is used)
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void gpu_compound_mul_scalar_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     // The current content of result is an input of the operation
     result.ensure_gpu_up_to_date();

     const size_t count = etl::size(result);

     // Applied directly on the GPU memory of the result, with a stride of 1
     impl::egblas::scalar_mul(result.gpu_memory(), count, 1, expr.value);

     // The GPU side now holds the data, the CPU side is stale
     result.validate_gpu();
     result.invalidate_cpu();
 }

 // Selector functions

 /*!
  * \brief Select, at compile time, the CPU implementation used to multiply
  * result by expr, using the *_compound_no_gpu selector traits.
  *
  * At most one branch is taken; if no trait holds, nothing is done.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void mul_evaluate_no_gpu(E&& expr, R&& result) {
     if constexpr (detail::standard_compound_no_gpu<E, R>) {
         // Element-by-element loop
         standard_compound_mul_impl(expr, result);
     } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
         // detail::AssignMul functor
         direct_compound_mul_impl(expr, result);
     } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
         // detail::VectorizedAssignMul functor
         vectorized_compound_mul_impl(expr, result);
     }
 }

 /*!
  * \brief Select the implementation used to multiply result by expr.
  *
  * The strategy is picked at compile time from the detail selector traits.
  * In the GPU branches, a runtime check falls back to mul_evaluate_no_gpu
  * when the local context asks for CPU or when something is forced.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void mul_evaluate(E&& expr, R&& result) {
     if constexpr (detail::standard_compound<E, R>) {
         standard_compound_mul_impl(expr, result);
     } else if constexpr (detail::direct_compound<E, R>) {
         direct_compound_mul_impl(expr, result);
     } else if constexpr (detail::vectorized_compound<E, R>) {
         vectorized_compound_mul_impl(expr, result);
     } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
         // Non-scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_mul_impl(expr, result);
         } else {
             mul_evaluate_no_gpu(expr, result);
         }
     } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
         // Scalar right-hand side.
         // NOTE(review): this branch is gated on cublas_enabled while
         // gpu_compound_mul_scalar_impl calls impl::egblas::scalar_mul --
         // confirm the gate is the intended one.
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_mul_scalar_impl(expr, result);
         } else {
             mul_evaluate_no_gpu(expr, result);
         }
     }
 }

 // Compound Assign Div implementation functions

 /*!
  * \brief Divide the result by the expression, element by element, on the
  * CPU.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void standard_compound_div_impl(E& expr, R& result) {
     inc_counter("std:assign");

     pre_assign_rhs(expr);

     // Both operands are accessed through CPU memory
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     const size_t count = etl::size(result);

     for (size_t pos = 0; pos < count; ++pos) {
         result[pos] /= expr[pos];
     }

     // Only the CPU side of the result has been written
     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void direct_compound_div_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par:assign");
             par_exec<detail::AssignDiv>(expr, result);
         } else {
             inc_counter("mem:assign");
             detail::AssignDiv::apply(result, expr);
         }
     } else {
         inc_counter("mem:assign");
         detail::AssignDiv::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void vectorized_compound_div_impl(E& expr, R& result) {
     pre_assign_rhs(expr);

     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     constexpr auto V = detail::select_vector_mode<E, R>();

     if constexpr (is_thread_safe<E>) {
         int factor = std::max(etl::complexity(expr), 1);
         if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
             inc_counter("par_vec:assign");
             par_exec<detail::VectorizedAssignDiv<V>>(expr, result);
         } else {
             inc_counter("vec:assign");
             detail::VectorizedAssignDiv<V>::apply(result, expr);
         }
     } else {
         inc_counter("vec:assign");
         detail::VectorizedAssignDiv<V>::apply(result, expr);
     }

     result.validate_cpu();
     result.invalidate_gpu();
 }

 template <typename E, typename R>
 void gpu_compound_div_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_up_to_date();

     // Compute the GPU representation of the expression
     decltype(auto) t1 = smart_gpu_compute_hint(expr, result);

     value_t<E> alpha(1);
     impl::egblas::axdy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);

     // Validate the GPU and invalidates the CPU
     result.validate_gpu();
     result.invalidate_cpu();
 }

 template <typename E, typename R>
 void gpu_compound_div_scalar_impl(E& expr, R& result) {
     inc_counter("gpu:assign");

     result.ensure_gpu_up_to_date();

     // Compute the GPU representation of the expression
     auto value = value_t<E>(1.0) / expr.value;
     impl::egblas::scalar_mul(result.gpu_memory(), etl::size(result), 1, value);

     // Validate the GPU and invalidates the CPU
     result.validate_gpu();
     result.invalidate_cpu();
 }

 // Selector functions

 /*!
  * \brief Select, at compile time, the CPU implementation used to divide
  * result by expr, using the division-specific *_compound_div_no_gpu
  * selector traits.
  *
  * At most one branch is taken; if no trait holds, nothing is done.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void div_evaluate_no_gpu(E&& expr, R&& result) {
     if constexpr (detail::standard_compound_div_no_gpu<E, R>) {
         // Element-by-element loop
         standard_compound_div_impl(expr, result);
     } else if constexpr (detail::direct_compound_div_no_gpu<E, R>) {
         // detail::AssignDiv functor
         direct_compound_div_impl(expr, result);
     } else if constexpr (detail::vectorized_compound_div_no_gpu<E, R>) {
         // detail::VectorizedAssignDiv functor
         vectorized_compound_div_impl(expr, result);
     }
 }

 /*!
  * \brief Select the implementation used to divide result by expr.
  *
  * The strategy is picked at compile time from the division-specific detail
  * selector traits. In the GPU branches, a runtime check falls back to
  * div_evaluate_no_gpu when the local context asks for CPU or when something
  * is forced.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void div_evaluate(E&& expr, R&& result) {
     if constexpr (detail::standard_compound_div<E, R>) {
         standard_compound_div_impl(expr, result);
     } else if constexpr (detail::direct_compound_div<E, R>) {
         direct_compound_div_impl(expr, result);
     } else if constexpr (detail::vectorized_compound_div<E, R>) {
         vectorized_compound_div_impl(expr, result);
     } else if constexpr (egblas_enabled && detail::gpu_compound_div<E, R> && !is_scalar<E>) {
         // Non-scalar right-hand side
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_div_impl(expr, result);
         } else {
             div_evaluate_no_gpu(expr, result);
         }
     } else if constexpr (cublas_enabled && detail::gpu_compound_div<E, R> && is_scalar<E>) {
         // Scalar right-hand side.
         // NOTE(review): this branch is gated on cublas_enabled while
         // gpu_compound_div_scalar_impl calls impl::egblas::scalar_mul --
         // confirm the gate is the intended one.
         if (!local_context().cpu && !is_something_forced()) {
             gpu_compound_div_scalar_impl(expr, result);
         } else {
             div_evaluate_no_gpu(expr, result);
         }
     }
 }

 //Standard Mod Evaluate (no optimized versions for mod)

 /*!
  * \brief Apply the modulo of the expression to the result, element by
  * element, on the CPU. This is the only implementation for modulo.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void mod_evaluate(E&& expr, R&& result) {
     inc_counter("std:assign");

     pre_assign_rhs(expr);

     // Both operands are accessed through CPU memory
     safe_ensure_cpu_up_to_date(expr);
     safe_ensure_cpu_up_to_date(result);

     const size_t count = etl::size(result);

     for (size_t pos = 0; pos < count; ++pos) {
         result[pos] %= expr[pos];
     }

     // Only the CPU side of the result has been written
     result.validate_cpu();
     result.invalidate_gpu();
 }

 /*!
  * \brief Assign expr to result: visit the right-hand side first unless the
  * pair is GPU-assignable, then dispatch to the selected implementation.
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename E, typename R>
 void assign_evaluate(E&& expr, R&& result) {
     // The visit is skipped for pairs that satisfy detail::gpu_assign
     constexpr bool visit_rhs_first = !detail::gpu_assign<E, R>;

     if constexpr (visit_rhs_first) {
         // Evaluate sub parts, if any
         pre_assign_rhs(expr);
     }

     // The actual implementation is selected at compile time
     assign_evaluate_impl(expr, result);
 }

 } // end of namespace standard_evaluator

 /*!
  * \brief Indicates if Expr can be assigned to Result as is.
  *
  * True when any of the following holds:
  *  - Expr is a generator (no dimensions, always possible to assign)
  *  - Expr and Result have the same storage order
  *  - both are 1D (vectors can be assigned regardless of the storage order)
  */
 template <typename Expr, typename Result>
 constexpr bool direct_assign_compatible =
     decay_traits<Expr>::is_generator || (decay_traits<Expr>::storage_order == decay_traits<Result>::storage_order) || all_1d<Expr, Result>;

 /*!
  * \brief Evaluate expr into result (result = expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * assigned instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_assign_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::assign_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::assign_evaluate(expr, result);
     }
 }

 /*!
  * \brief Compound add evaluation of expr into result (result += expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * added instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_add_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "+=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::add_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::add_evaluate(expr, result);
     }
 }

 /*!
  * \brief Compound subtract evaluation of expr into result (result -= expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * subtracted instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_sub_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "-=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::sub_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::sub_evaluate(expr, result);
     }
 }

 /*!
  * \brief Compound multiply evaluation of expr into result (result *= expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * used instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_mul_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "*=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::mul_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::mul_evaluate(expr, result);
     }
 }

 /*!
  * \brief Compound divide evaluation of expr into result (result /= expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * used instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_div_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "/=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::div_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::div_evaluate(expr, result);
     }
 }

 /*!
  * \brief Compound modulo evaluation of expr into result (result %= expr).
  *
  * When the pair is not directly assign-compatible, transpose(expr) is
  * used instead.
  *
  * \param expr The right-hand side expression
  * \param result The left-hand side
  */
 template <typename Expr, typename Result>
 void std_mod_evaluate(Expr&& expr, Result&& result) {
 #ifdef DEBUG_EVALUATOR
     std::cout << result << "%=" << expr << std::endl;
 #endif

     constexpr bool needs_transpose = !direct_assign_compatible<Expr, Result>;

     if constexpr (needs_transpose) {
         inc_counter("eval:transpose");
         standard_evaluator::mod_evaluate(transpose(expr), result);
     } else {
         standard_evaluator::mod_evaluate(expr, result);
     }
 }

 template <typename Expr>
 void force(Expr&& expr) {
     standard_evaluator::pre_assign_rhs(expr);
 }

 } //end of namespace etl
etl::standard_evaluator::gpu_compound_mul_scalar_impl
void gpu_compound_mul_scalar_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:858

etl::complexity
constexpr int complexity([[maybe_unused]] const E &expr) noexcept
Return the complexity of the expression.
Definition: helpers.hpp:38

etl::standard_evaluator::sub_evaluate_no_gpu
void sub_evaluate_no_gpu(E &&expr, R &&result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:685

etl::detail::AssignMul::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:125

etl::max
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65

etl::standard_evaluator::standard_compound_div_impl
void standard_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:931

etl::standard_evaluator::gpu_compound_div_scalar_impl
void gpu_compound_div_scalar_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1051

etl::std_assign_evaluate
void std_assign_evaluate(Expr &&expr, Result &&result)
Evaluation of the expr into result.
Definition: evaluator.hpp:1176

etl::detail::AssignSub::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:93

etl::standard_evaluator::fast_assign_impl
void fast_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:173

etl::standard_evaluator::gpu_compound_sub_impl
void gpu_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:631

etl::detail::VectorizedAssignAdd::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:113

etl::standard_evaluator::vectorized_compound_div_impl
void vectorized_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:991

etl::standard_evaluator::vectorized_compound_add_impl
void vectorized_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:393

etl::standard_evaluator::standard_compound_mul_impl
void standard_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:734

etl::cuda_enabled
constexpr bool cuda_enabled
Indicates if CUDA is available.
Definition: config.hpp:94

etl::direct_copy
void direct_copy(const S *first, const S *last, T *target)
Performs a direct memory copy.
Definition: memory.hpp:24

etl::force
void force(Expr &&expr)
Force the internal evaluation of an expression.
Definition: evaluator.hpp:1292

etl::standard_evaluator::mul_evaluate
void mul_evaluate(E &&expr, R &&result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:898

etl::standard_evaluator::standard_assign_impl
void standard_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:80

etl::standard_evaluator::direct_compound_sub_impl
void direct_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:562

etl::standard_evaluator::vectorized_compound_mul_impl
void vectorized_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:794

etl::standard_evaluator::direct_assign_impl
void direct_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:231

eval_selectors.hpp
Contains TMP selectors to select evaluation methods based on configuration.

etl::detail::Assign::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:29

etl::transpose
auto transpose(const E &value)
Returns the transpose of the given expression.
Definition: expression_builder.hpp:528

etl::standard_evaluator::standard_compound_sub_impl
void standard_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:536

etl::standard_evaluator::div_evaluate
void div_evaluate(E &&expr, R &&result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1092

etl::etl_traits
Traits to get information about ETL types.
Definition: tmp.hpp:68

etl
Root namespace for the ETL library.
Definition: adapter.hpp:15

etl::local_context
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50

etl::egblas_enabled
constexpr bool egblas_enabled
Indicates if the EGBLAS library is available for ETL.
Definition: config.hpp:119

etl::cublas_enabled
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99

etl::standard_evaluator::par_exec
void par_exec(E &&expr, R &&result)
Assign the result of the expression to the result with the given Functor, using the parallel implementation when parallel support is enabled.
Definition: evaluator.hpp:59

etl::detail::evaluator_visitor
Visitor to perform local evaluation when necessary.
Definition: eval_visitors.hpp:23

etl::parallel_threshold
constexpr size_t parallel_threshold
The minimum number of elements before considering parallel implementation.
Definition: threshold.hpp:66

etl::std_mod_evaluate
void std_mod_evaluate(Expr &&expr, Result &&result)
Compound modulo evaluation of the expr into result.
Definition: evaluator.hpp:1271

etl::standard_evaluator::add_evaluate
void add_evaluate(E &&expr, R &&result)
Add the result of the expression to the result.
Definition: evaluator.hpp:503

etl::detail::VectorizedAssignSub::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:156

etl::standard_evaluator::pre_assign_rhs
void pre_assign_rhs(E &&expr)
Allocate temporaries and evaluate sub expressions in RHS.
Definition: evaluator.hpp:48

etl::standard_evaluator::add_evaluate_no_gpu
void add_evaluate_no_gpu(E &&expr, R &&result)
Add the result of the expression to the result.
Definition: evaluator.hpp:487

etl::std_mul_evaluate
void std_mul_evaluate(Expr &&expr, Result &&result)
Compound multiply evaluation of the expr into result.
Definition: evaluator.hpp:1233

etl::standard_evaluator::mul_evaluate_no_gpu
void mul_evaluate_no_gpu(E &&expr, R &&result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:882

etl::standard_evaluator::sub_evaluate
void sub_evaluate(E &&expr, R &&result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:701

etl::standard_evaluator::gpu_compound_div_impl
void gpu_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1026

etl::standard_evaluator::gpu_assign_impl
void gpu_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:195

etl::safe_ensure_cpu_up_to_date
void safe_ensure_cpu_up_to_date(E &&expr)
Ensure that the CPU is up to date.
Definition: helpers.hpp:278

etl::standard_evaluator::assign_evaluate
void assign_evaluate(E &&expr, R &&result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:1144

etl::size
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108

etl::standard_evaluator::div_evaluate_no_gpu
void div_evaluate_no_gpu(E &&expr, R &&result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1076

etl::detail::VectorizedAssign::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:52

etl::detail::VectorizedAssignDiv::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:242

etl::standard_evaluator::standard_compound_add_impl
void standard_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:336

etl::standard_evaluator::direct_compound_div_impl
void direct_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:957

etl::standard_evaluator::direct_compound_add_impl
void direct_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:359

etl::standard_evaluator::direct_compound_mul_impl
void direct_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:760

etl::standard_evaluator::vectorized_assign_impl
void vectorized_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:263

etl::detail::AssignDiv::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Divide lhs by rhs (compound divide-assign).
Definition: linear_eval_functors.hpp:157

etl::is_something_forced
bool is_something_forced()
Indicates if some implementation is forced in the context.
Definition: context.hpp:60

etl::std_sub_evaluate
void std_sub_evaluate(Expr &&expr, Result &&result)
Compound subtract evaluation of the expr into result.
Definition: evaluator.hpp:1214

etl::standard_evaluator::vectorized_compound_sub_impl
void vectorized_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:596

etl::direct_assign_compatible
constexpr bool direct_assign_compatible
Traits indicating if a direct assign is possible.
Definition: evaluator.hpp:1165

etl::parallel_support
constexpr bool parallel_support
Indicates if support for parallelization is integrated into the framework.
Definition: config.hpp:51

etl::detail::AssignAdd::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Add rhs to lhs (compound add-assign).
Definition: linear_eval_functors.hpp:61

etl::detail::VectorizedAssignMul::apply
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:199

etl::engine_select_parallel
bool engine_select_parallel([[maybe_unused]] size_t n, [[maybe_unused]] size_t threshold=parallel_threshold)
Indicates if a 1D evaluation should run in parallel.
Definition: parallel_support.hpp:679

etl::smart_gpu_compute_hint
decltype(auto) smart_gpu_compute_hint(E &expr, Y &y)
Compute the expression into a representation that is GPU up to date.
Definition: helpers.hpp:368

linear_eval_functors.hpp
Contains the linear functors used by the evaluator to perform its actions.

etl::value_t
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81

etl::std_div_evaluate
void std_div_evaluate(Expr &&expr, Result &&result)
Compound divide evaluation of the expr into result.
Definition: evaluator.hpp:1252

etl::standard_evaluator::mod_evaluate
void mod_evaluate(E &&expr, R &&result)
Modulo the result by the result of the expression.
Definition: evaluator.hpp:1122

etl::inc_counter
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25

etl::smart_gpu_compute
decltype(auto) smart_gpu_compute(X &x, Y &y)
Compute the expression into a representation that is GPU up to date and store this representation in y.
Definition: helpers.hpp:397

etl::std_add_evaluate
void std_add_evaluate(Expr &&expr, Result &&result)
Compound add evaluation of the expr into result.
Definition: evaluator.hpp:1195

etl::standard_evaluator::fast_assign_impl_full
void fast_assign_impl_full(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:98