Expression Templates Library (ETL)
evaluator.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
23 /*
24  * Possible improvements
25  * * The pre/post functions should be refactored so that they are less heavy on the code (too much usage)
26  * * Compound operations should ideally be directly evaluated
27  */
28 
29 #pragma once
30 
31 #include "etl/eval_selectors.hpp" //Method selectors
32 #include "etl/linear_eval_functors.hpp" //Implementation functors
33 #include "etl/vec_eval_functors.hpp" //Implementation functors
34 
35 namespace etl {
36 
37 /*
38  * \brief The evaluator is responsible for assigning one expression to another.
39  *
40  * The implementation is selected at compile time with `if constexpr` on the selector traits.
41  */
42 namespace standard_evaluator {
47 template <typename E>
48 void pre_assign_rhs(E&& expr) {
49  detail::evaluator_visitor eval_visitor;
50  expr.visit(eval_visitor);
51 }
52 
58 template <typename Fun, typename E, typename R>
59 void par_exec(E&& expr, R&& result) {
60  if constexpr (parallel_support) {
61  auto slice_functor = [&](auto&& lhs, auto&& rhs) { Fun::apply(lhs, rhs); };
62 
63  engine_dispatch_1d_slice_binary(result, expr, slice_functor, 0);
64  } else {
65  Fun::apply(result, expr);
66  }
67 }
68 
69 // Assign functions implementations
70 
79 template <typename E, typename R>
80 void standard_assign_impl(E& expr, R& result) {
81  inc_counter("std:assign");
82 
83  for (size_t i = 0; i < etl::size(result); ++i) {
84  result[i] = expr.read_flat(i);
85  }
86 }
87 
97 template <typename E, typename R>
98 void fast_assign_impl_full(E& expr, R& result) {
99  if constexpr (!is_gpu_dyn_matrix<R>) {
100  if constexpr (cuda_enabled) {
101  cpp_assert(expr.is_cpu_up_to_date() || expr.is_gpu_up_to_date(), "expr must be in valid state");
102 
103  if (expr.is_cpu_up_to_date()) {
104  inc_counter("fast:copy");
105 
106  direct_copy(expr.memory_start(), expr.memory_end(), result.memory_start());
107 
108  result.validate_cpu();
109  }
110 
111  if (expr.is_gpu_up_to_date()) {
112  inc_counter("gpu:copy");
113 
114  bool cpu_status = expr.is_cpu_up_to_date();
115 
116  result.ensure_gpu_allocated();
117  result.gpu_copy_from(expr.gpu_memory());
118 
119  // Restore CPU status because gpu_copy_from will erase it
120  if (cpu_status) {
121  result.validate_cpu();
122  }
123  }
124 
125  // Invalidation must be done after validation to preserve
126  // valid CPU/GPU state
127 
128  if (!expr.is_cpu_up_to_date()) {
129  result.invalidate_cpu();
130  }
131 
132  if (!expr.is_gpu_up_to_date()) {
133  result.invalidate_gpu();
134  }
135 
136  cpp_assert(expr.is_cpu_up_to_date() == result.is_cpu_up_to_date(), "fast_assign must preserve CPU status");
137  cpp_assert(expr.is_gpu_up_to_date() == result.is_gpu_up_to_date(), "fast_assign must preserve GPU status");
138  } else {
139  direct_copy(expr.memory_start(), expr.memory_end(), result.memory_start());
140  }
141  } else {
142  if constexpr (cuda_enabled) {
143  cpp_assert(expr.is_gpu_up_to_date(), "expr must be in valid state");
144 
145  inc_counter("gpu:copy");
146 
147  result.ensure_gpu_allocated();
148  result.gpu_copy_from(expr.gpu_memory());
149 
150  // Invalidation must be done after validation to preserve
151  // valid CPU/GPU state
152 
153  result.validate_gpu();
154  result.invalidate_cpu();
155 
156  cpp_assert(result.is_gpu_up_to_date(), "fast_assign must preserve GPU status");
157  } else {
158  cpp_unreachable("gpu_dyn_matrix should never be used without GPU support");
159  }
160  }
161 }
162 
172 template <typename E, typename R>
173 void fast_assign_impl(E& expr, R& result) {
174  static_assert(!is_gpu_dyn_matrix<R>, "gpu_dyn_matrix should not be used here");
175 
176  inc_counter("fast:copy");
177 
178  expr.ensure_cpu_up_to_date();
179 
180  direct_copy(expr.memory_start(), expr.memory_end(), result.memory_start());
181 
182  result.validate_cpu();
183  result.invalidate_gpu();
184 }
185 
194 template <typename E, typename R>
195 void gpu_assign_impl(E& expr, R& result) {
196  inc_counter("gpu:assign");
197 
198  result.ensure_gpu_allocated();
199 
200  if constexpr (is_binary_expr<E>) {
201  if (expr.alias(result)) {
202  // Compute the GPU representation of the expression
203  decltype(auto) t2 = smart_gpu_compute_hint(expr, result);
204 
205  // Copy the GPU memory from the expression to the result
206  result.gpu_copy_from(t2.gpu_memory());
207  } else {
208  // Compute the GPU representation of the expression into the result
209  smart_gpu_compute(expr, result);
210  }
211  } else {
212  // Compute the GPU representation of the expression into the result
213  smart_gpu_compute(expr, result);
214  }
215 
216  // Validate the GPU and invalidates the CPU
217  result.validate_gpu();
218  result.invalidate_cpu();
219 }
220 
230 template <typename E, typename R>
231 void direct_assign_impl(E& expr, R& result) {
234 
235  if constexpr (is_thread_safe<E>) {
236  int factor = std::max(etl::complexity(expr), 1);
237  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
238  inc_counter("par:assign");
239  par_exec<detail::Assign>(expr, result);
240  } else {
241  inc_counter("mem:assign");
242  detail::Assign::apply(result, expr);
243  }
244  } else {
245  inc_counter("mem:assign");
246  detail::Assign::apply(result, expr);
247  }
248 
249  result.validate_cpu();
250  result.invalidate_gpu();
251 }
252 
262 template <typename E, typename R>
263 void vectorized_assign_impl(E& expr, R& result) {
266 
267  constexpr auto V = detail::select_vector_mode<E, R>();
268 
269  if constexpr (is_thread_safe<E>) {
270  int factor = std::max(etl::complexity(expr), 1);
271  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
272  inc_counter("par_vec:assign");
273  par_exec<detail::VectorizedAssign<V>>(expr, result);
274  } else {
275  inc_counter("vec:assign");
277  }
278  } else {
279  inc_counter("vec:assign");
281  }
282 
283  result.validate_cpu();
284  result.invalidate_gpu();
285 }
286 
287 // Selector versions
288 
289 template <typename E, typename R>
290 void assign_evaluate_impl_no_gpu(E&& expr, R&& result) {
291  if constexpr (detail::standard_assign_no_gpu<E, R>) {
292  standard_assign_impl(expr, result);
293  } else if constexpr (std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign_no_gpu<E, R>) {
294  fast_assign_impl_full(expr, result);
295  } else if constexpr (!std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign_no_gpu<E, R>) {
296  fast_assign_impl(expr, result);
297  } else if constexpr (detail::direct_assign_no_gpu<E, R>) {
298  direct_assign_impl(expr, result);
299  } else if constexpr (detail::vectorized_assign_no_gpu<E, R>) {
300  vectorized_assign_impl(expr, result);
301  }
302 }
303 
304 template <typename E, typename R>
305 void assign_evaluate_impl(E&& expr, R&& result) {
306  if constexpr (detail::standard_assign<E, R>) {
307  standard_assign_impl(expr, result);
308  } else if constexpr (std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign<E, R>) {
309  fast_assign_impl_full(expr, result);
310  } else if constexpr (!std::is_same_v<value_t<E>, value_t<R>> && detail::fast_assign<E, R>) {
311  fast_assign_impl(expr, result);
312  } else if constexpr (detail::gpu_assign<E, R>) {
313  if (local_context().cpu || is_something_forced()) {
314  assign_evaluate_impl_no_gpu(expr, result);
315  } else {
316  gpu_assign_impl(expr, result);
317  }
318  } else if constexpr (detail::direct_assign<E, R>) {
319  direct_assign_impl(expr, result);
320  } else if constexpr (detail::vectorized_assign<E, R>) {
321  vectorized_assign_impl(expr, result);
322  }
323 }
324 
325 // Compound Assign Add functions implementations
326 
335 template <typename E, typename R>
336 void standard_compound_add_impl(E& expr, R& result) {
337  inc_counter("std:assign");
338 
339  pre_assign_rhs(expr);
340 
341  for (size_t i = 0; i < etl::size(result); ++i) {
342  result[i] += expr[i];
343  }
344 
345  result.validate_cpu();
346  result.invalidate_gpu();
347 }
348 
358 template <typename E, typename R>
359 void direct_compound_add_impl(E& expr, R& result) {
360  pre_assign_rhs(expr);
361 
364 
365  if constexpr (is_thread_safe<E>) {
366  int factor = std::max(etl::complexity(expr), 1);
367  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
368  inc_counter("par:assign");
369  par_exec<detail::AssignAdd>(expr, result);
370  } else {
371  inc_counter("mem:assign");
372  detail::AssignAdd::apply(result, expr);
373  }
374  } else {
375  inc_counter("mem:assign");
376  detail::AssignAdd::apply(result, expr);
377  }
378 
379  result.validate_cpu();
380  result.invalidate_gpu();
381 }
382 
392 template <typename E, typename R>
393 void vectorized_compound_add_impl(E& expr, R& result) {
394  pre_assign_rhs(expr);
395 
398 
399  constexpr auto V = detail::select_vector_mode<E, R>();
400 
401  if constexpr (is_thread_safe<E>) {
402  int factor = std::max(etl::complexity(expr), 1);
403  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
404  inc_counter("par_vec:assign");
405  par_exec<detail::VectorizedAssignAdd<V>>(expr, result);
406  } else {
407  inc_counter("vec:assign");
409  }
410  } else {
411  inc_counter("vec:assign");
413  }
414 
415  result.validate_cpu();
416  result.invalidate_gpu();
417 }
418 
419 #ifdef ETL_CUBLAS_MODE
420 
430 template <typename E, typename R>
431 void gpu_compound_add_impl(E& expr, R& result) {
432  inc_counter("gpu:assign");
433 
434  result.ensure_gpu_up_to_date();
435 
436  // Compute the GPU representation of the expression
437  decltype(auto) t1 = smart_gpu_compute_hint(expr, result);
438 
439  value_t<E> alpha(1);
440  impl::egblas::axpy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);
441 
442  // Validate the GPU and invalidates the CPU
443  result.validate_gpu();
444  result.invalidate_cpu();
445 }
446 
447 #endif
448 
449 #ifdef ETL_EGBLAS_MODE
450 
460 template <typename E, typename R>
461 void gpu_compound_add_scalar_impl(E& expr, R& result) {
462  inc_counter("gpu:assign");
463 
464  result.ensure_gpu_up_to_date();
465 
466  // Compute the GPU representation of the expression
467  impl::egblas::scalar_add(result.gpu_memory(), etl::size(result), 1, expr.value);
468 
469  // Validate the GPU and invalidates the CPU
470  result.validate_gpu();
471  result.invalidate_cpu();
472 }
473 
474 #endif
475 
476 // Selector functions
477 
486 template <typename E, typename R>
487 void add_evaluate_no_gpu(E&& expr, R&& result) {
488  if constexpr (detail::standard_compound_no_gpu<E, R>) {
489  standard_compound_add_impl(expr, result);
490  } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
491  direct_compound_add_impl(expr, result);
492  } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
493  vectorized_compound_add_impl(expr, result);
494  }
495 }
496 
502 template <typename E, typename R>
503 void add_evaluate(E&& expr, R&& result) {
504  if constexpr (detail::standard_compound<E, R>) {
505  standard_compound_add_impl(expr, result);
506  } else if constexpr (detail::direct_compound<E, R>) {
507  direct_compound_add_impl(expr, result);
508  } else if constexpr (detail::vectorized_compound<E, R>) {
509  vectorized_compound_add_impl(expr, result);
510  } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
511  if (local_context().cpu || is_something_forced()) {
512  add_evaluate_no_gpu(expr, result);
513  } else {
514  gpu_compound_add_impl(expr, result);
515  }
516  } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
517  if (local_context().cpu || is_something_forced()) {
518  add_evaluate_no_gpu(expr, result);
519  } else {
520  gpu_compound_add_scalar_impl(expr, result);
521  }
522  }
523 }
524 
525 // Compound assign sub implementation functions
526 
535 template <typename E, typename R>
536 void standard_compound_sub_impl(E& expr, R& result) {
537  inc_counter("std:assign");
538 
539  pre_assign_rhs(expr);
540 
543 
544  for (size_t i = 0; i < etl::size(result); ++i) {
545  result[i] -= expr[i];
546  }
547 
548  result.validate_cpu();
549  result.invalidate_gpu();
550 }
551 
561 template <typename E, typename R>
562 void direct_compound_sub_impl(E& expr, R& result) {
563  pre_assign_rhs(expr);
564 
567 
568  if constexpr (is_thread_safe<E>) {
569  int factor = std::max(etl::complexity(expr), 1);
570  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
571  par_exec<detail::AssignSub>(expr, result);
572  inc_counter("par:assign");
573  } else {
574  detail::AssignSub::apply(result, expr);
575  inc_counter("mem:assign");
576  }
577  } else {
578  detail::AssignSub::apply(result, expr);
579  inc_counter("mem:assign");
580  }
581 
582  result.validate_cpu();
583  result.invalidate_gpu();
584 }
585 
595 template <typename E, typename R>
596 void vectorized_compound_sub_impl(E& expr, R& result) {
597  pre_assign_rhs(expr);
598 
601 
602  constexpr auto V = detail::select_vector_mode<E, R>();
603 
604  if constexpr (is_thread_safe<E>) {
605  int factor = std::max(etl::complexity(expr), 1);
606  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
607  inc_counter("par_vec:assign");
608  par_exec<detail::VectorizedAssignSub<V>>(expr, result);
609  } else {
610  inc_counter("vec:assign");
612  }
613  } else {
614  inc_counter("vec:assign");
616  }
617 
618  result.validate_cpu();
619  result.invalidate_gpu();
620 }
621 
630 template <typename E, typename R>
631 void gpu_compound_sub_impl(E& expr, R& result) {
632  inc_counter("gpu:assign");
633 
634  result.ensure_gpu_up_to_date();
635 
636  // Compute the GPU representation of the expression
637  decltype(auto) t1 = smart_gpu_compute_hint(expr, result);
638 
639  value_t<E> alpha(-1);
640  impl::egblas::axpy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);
641 
642  // Validate the GPU and invalidates the CPU
643  result.validate_gpu();
644  result.invalidate_cpu();
645 }
646 
647 #ifdef ETL_EGBLAS_MODE
648 
657 template <typename E, typename R>
658 void gpu_compound_sub_scalar_impl(E& expr, R& result) {
659  inc_counter("gpu:assign");
660 
661  result.ensure_gpu_up_to_date();
662 
663  // Compute the GPU representation of the expression
664  auto value = -expr.value;
665  impl::egblas::scalar_add(result.gpu_memory(), etl::size(result), 1, value);
666 
667  // Validate the GPU and invalidates the CPU
668  result.validate_gpu();
669  result.invalidate_cpu();
670 }
671 
672 #endif
673 
674 // Selector functions
675 
684 template <typename E, typename R>
685 void sub_evaluate_no_gpu(E&& expr, R&& result) {
686  if constexpr (detail::standard_compound_no_gpu<E, R>) {
687  standard_compound_sub_impl(expr, result);
688  } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
689  direct_compound_sub_impl(expr, result);
690  } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
691  vectorized_compound_sub_impl(expr, result);
692  }
693 }
694 
700 template <typename E, typename R>
701 void sub_evaluate(E&& expr, R&& result) {
702  if constexpr (detail::standard_compound<E, R>) {
703  standard_compound_sub_impl(expr, result);
704  } else if constexpr (detail::direct_compound<E, R>) {
705  direct_compound_sub_impl(expr, result);
706  } else if constexpr (detail::vectorized_compound<E, R>) {
707  vectorized_compound_sub_impl(expr, result);
708  } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
709  if (local_context().cpu || is_something_forced()) {
710  sub_evaluate_no_gpu(expr, result);
711  } else {
712  gpu_compound_sub_impl(expr, result);
713  }
714  } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
715  if (local_context().cpu || is_something_forced()) {
716  sub_evaluate_no_gpu(expr, result);
717  } else {
718  gpu_compound_sub_scalar_impl(expr, result);
719  }
720  }
721 }
722 
723 // Compound assign mul implementation functions
724 
733 template <typename E, typename R>
734 void standard_compound_mul_impl(E& expr, R& result) {
735  inc_counter("std:assign");
736 
737  pre_assign_rhs(expr);
738 
741 
742  for (size_t i = 0; i < etl::size(result); ++i) {
743  result[i] *= expr[i];
744  }
745 
746  result.validate_cpu();
747  result.invalidate_gpu();
748 }
749 
759 template <typename E, typename R>
760 void direct_compound_mul_impl(E& expr, R& result) {
761  pre_assign_rhs(expr);
762 
765 
766  if constexpr (is_thread_safe<E>) {
767  int factor = std::max(etl::complexity(expr), 1);
768  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
769  inc_counter("par:assign");
770  par_exec<detail::AssignMul>(expr, result);
771  } else {
772  inc_counter("mem:assign");
773  detail::AssignMul::apply(result, expr);
774  }
775  } else {
776  inc_counter("mem:assign");
777  detail::AssignMul::apply(result, expr);
778  }
779 
780  result.validate_cpu();
781  result.invalidate_gpu();
782 }
783 
793 template <typename E, typename R>
794 void vectorized_compound_mul_impl(E& expr, R& result) {
795  pre_assign_rhs(expr);
796 
799 
800  constexpr auto V = detail::select_vector_mode<E, R>();
801 
802  if constexpr (is_thread_safe<E>) {
803  int factor = std::max(etl::complexity(expr), 1);
804  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
805  inc_counter("par_vec:assign");
806  par_exec<detail::VectorizedAssignMul<V>>(expr, result);
807  } else {
808  inc_counter("vec:assign");
810  }
811  } else {
812  inc_counter("vec:assign");
814  }
815 
816  result.validate_cpu();
817  result.invalidate_gpu();
818 }
819 
820 #ifdef ETL_EGBLAS_MODE
821 
830 template <typename E, typename R>
831 void gpu_compound_mul_impl(E& expr, R& result) {
832  inc_counter("gpu:assign");
833 
834  result.ensure_gpu_up_to_date();
835 
836  // Compute the GPU representation of the expression
837  decltype(auto) t1 = smart_gpu_compute_hint(expr, result);
838 
839  value_t<E> alpha(1);
840  impl::egblas::axmy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);
841 
842  // Validate the GPU and invalidates the CPU
843  result.validate_gpu();
844  result.invalidate_cpu();
845 }
846 
847 #endif
848 
857 template <typename E, typename R>
858 void gpu_compound_mul_scalar_impl(E& expr, R& result) {
859  inc_counter("gpu:assign");
860 
861  result.ensure_gpu_up_to_date();
862 
863  // Compute the GPU representation of the expression
864  impl::egblas::scalar_mul(result.gpu_memory(), etl::size(result), 1, expr.value);
865 
866  // Validate the GPU and invalidates the CPU
867  result.validate_gpu();
868  result.invalidate_cpu();
869 }
870 
871 // Selector functions
872 
881 template <typename E, typename R>
882 void mul_evaluate_no_gpu(E&& expr, R&& result) {
883  if constexpr (detail::standard_compound_no_gpu<E, R>) {
884  standard_compound_mul_impl(expr, result);
885  } else if constexpr (detail::direct_compound_no_gpu<E, R>) {
886  direct_compound_mul_impl(expr, result);
887  } else if constexpr (detail::vectorized_compound_no_gpu<E, R>) {
888  vectorized_compound_mul_impl(expr, result);
889  }
890 }
891 
897 template <typename E, typename R>
898 void mul_evaluate(E&& expr, R&& result) {
899  if constexpr (detail::standard_compound<E, R>) {
900  standard_compound_mul_impl(expr, result);
901  } else if constexpr (detail::direct_compound<E, R>) {
902  direct_compound_mul_impl(expr, result);
903  } else if constexpr (detail::vectorized_compound<E, R>) {
904  vectorized_compound_mul_impl(expr, result);
905  } else if constexpr (egblas_enabled && detail::gpu_compound<E, R> && !is_scalar<E>) {
906  if (local_context().cpu || is_something_forced()) {
907  mul_evaluate_no_gpu(expr, result);
908  } else {
909  gpu_compound_mul_impl(expr, result);
910  }
911  } else if constexpr (cublas_enabled && detail::gpu_compound<E, R> && is_scalar<E>) {
912  if (local_context().cpu || is_something_forced()) {
913  mul_evaluate_no_gpu(expr, result);
914  } else {
915  gpu_compound_mul_scalar_impl(expr, result);
916  }
917  }
918 }
919 
920 // Compound Assign Div implementation functions
921 
930 template <typename E, typename R>
931 void standard_compound_div_impl(E& expr, R& result) {
932  inc_counter("std:assign");
933 
934  pre_assign_rhs(expr);
935 
938 
939  for (size_t i = 0; i < etl::size(result); ++i) {
940  result[i] /= expr[i];
941  }
942 
943  result.validate_cpu();
944  result.invalidate_gpu();
945 }
946 
956 template <typename E, typename R>
957 void direct_compound_div_impl(E& expr, R& result) {
958  pre_assign_rhs(expr);
959 
962 
963  if constexpr (is_thread_safe<E>) {
964  int factor = std::max(etl::complexity(expr), 1);
965  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
966  inc_counter("par:assign");
967  par_exec<detail::AssignDiv>(expr, result);
968  } else {
969  inc_counter("mem:assign");
970  detail::AssignDiv::apply(result, expr);
971  }
972  } else {
973  inc_counter("mem:assign");
974  detail::AssignDiv::apply(result, expr);
975  }
976 
977  result.validate_cpu();
978  result.invalidate_gpu();
979 }
980 
990 template <typename E, typename R>
991 void vectorized_compound_div_impl(E& expr, R& result) {
992  pre_assign_rhs(expr);
993 
996 
997  constexpr auto V = detail::select_vector_mode<E, R>();
998 
999  if constexpr (is_thread_safe<E>) {
1000  int factor = std::max(etl::complexity(expr), 1);
1001  if (engine_select_parallel(etl::size(result), parallel_threshold / factor)) {
1002  inc_counter("par_vec:assign");
1003  par_exec<detail::VectorizedAssignDiv<V>>(expr, result);
1004  } else {
1005  inc_counter("vec:assign");
1007  }
1008  } else {
1009  inc_counter("vec:assign");
1011  }
1012 
1013  result.validate_cpu();
1014  result.invalidate_gpu();
1015 }
1016 
1025 template <typename E, typename R>
1026 void gpu_compound_div_impl(E& expr, R& result) {
1027  inc_counter("gpu:assign");
1028 
1029  result.ensure_gpu_up_to_date();
1030 
1031  // Compute the GPU representation of the expression
1032  decltype(auto) t1 = smart_gpu_compute_hint(expr, result);
1033 
1034  value_t<E> alpha(1);
1035  impl::egblas::axdy(etl::size(result), alpha, t1.gpu_memory(), 1, result.gpu_memory(), 1);
1036 
1037  // Validate the GPU and invalidates the CPU
1038  result.validate_gpu();
1039  result.invalidate_cpu();
1040 }
1041 
1050 template <typename E, typename R>
1051 void gpu_compound_div_scalar_impl(E& expr, R& result) {
1052  inc_counter("gpu:assign");
1053 
1054  result.ensure_gpu_up_to_date();
1055 
1056  // Compute the GPU representation of the expression
1057  auto value = value_t<E>(1.0) / expr.value;
1058  impl::egblas::scalar_mul(result.gpu_memory(), etl::size(result), 1, value);
1059 
1060  // Validate the GPU and invalidates the CPU
1061  result.validate_gpu();
1062  result.invalidate_cpu();
1063 }
1064 
1065 // Selector functions
1066 
1075 template <typename E, typename R>
1076 void div_evaluate_no_gpu(E&& expr, R&& result) {
1077  if constexpr (detail::standard_compound_div_no_gpu<E, R>) {
1078  standard_compound_div_impl(expr, result);
1079  } else if constexpr (detail::direct_compound_div_no_gpu<E, R>) {
1080  direct_compound_div_impl(expr, result);
1081  } else if constexpr (detail::vectorized_compound_div_no_gpu<E, R>) {
1082  vectorized_compound_div_impl(expr, result);
1083  }
1084 }
1085 
1091 template <typename E, typename R>
1092 void div_evaluate(E&& expr, R&& result) {
1093  if constexpr (detail::standard_compound_div<E, R>) {
1094  standard_compound_div_impl(expr, result);
1095  } else if constexpr (detail::direct_compound_div<E, R>) {
1096  direct_compound_div_impl(expr, result);
1097  } else if constexpr (detail::vectorized_compound_div<E, R>) {
1098  vectorized_compound_div_impl(expr, result);
1099  } else if constexpr (egblas_enabled && detail::gpu_compound_div<E, R> && !is_scalar<E>) {
1100  if (local_context().cpu || is_something_forced()) {
1101  div_evaluate_no_gpu(expr, result);
1102  } else {
1103  gpu_compound_div_impl(expr, result);
1104  }
1105  } else if constexpr (cublas_enabled && detail::gpu_compound_div<E, R> && is_scalar<E>) {
1106  if (local_context().cpu || is_something_forced()) {
1107  div_evaluate_no_gpu(expr, result);
1108  } else {
1109  gpu_compound_div_scalar_impl(expr, result);
1110  }
1111  }
1112 }
1113 
1114 //Standard Mod Evaluate (no optimized versions for mod)
1115 
1121 template <typename E, typename R>
1122 void mod_evaluate(E&& expr, R&& result) {
1123  inc_counter("std:assign");
1124 
1125  pre_assign_rhs(expr);
1126 
1129 
1130  for (size_t i = 0; i < etl::size(result); ++i) {
1131  result[i] %= expr[i];
1132  }
1133 
1134  result.validate_cpu();
1135  result.invalidate_gpu();
1136 }
1137 
1143 template <typename E, typename R>
1144 void assign_evaluate(E&& expr, R&& result) {
1145  if constexpr (!detail::gpu_assign<E, R>) {
1146  //Evaluate sub parts, if any
1147  pre_assign_rhs(expr);
1148  }
1149 
1150  //Perform the real evaluation, selected by TMP
1151  assign_evaluate_impl(expr, result);
1152 }
1153 
1154 } // end of namespace standard_evaluator
1155 
1164 template <typename Expr, typename Result>
1165 constexpr bool direct_assign_compatible = decay_traits<Expr>::is_generator // No dimensions, always possible to assign
1166  || decay_traits<Expr>::storage_order == decay_traits<Result>::storage_order // Same storage always possible to assign
1167  || all_1d<Expr, Result> // Vectors can be directly assigned, regardless of the storage order
1168  ;
1169 
1175 template <typename Expr, typename Result>
1176 void std_assign_evaluate(Expr&& expr, Result&& result) {
1177 #ifdef DEBUG_EVALUATOR
1178  std::cout << result << "=" << expr << std::endl;
1179 #endif
1180 
1181  if constexpr (direct_assign_compatible<Expr, Result>) {
1182  standard_evaluator::assign_evaluate(expr, result);
1183  } else {
1184  inc_counter("eval:transpose");
1185  standard_evaluator::assign_evaluate(transpose(expr), result);
1186  }
1187 }
1188 
1194 template <typename Expr, typename Result>
1195 void std_add_evaluate(Expr&& expr, Result&& result) {
1196 #ifdef DEBUG_EVALUATOR
1197  std::cout << result << "+=" << expr << std::endl;
1198 #endif
1199 
1200  if constexpr (direct_assign_compatible<Expr, Result>) {
1201  standard_evaluator::add_evaluate(expr, result);
1202  } else {
1203  inc_counter("eval:transpose");
1204  standard_evaluator::add_evaluate(transpose(expr), result);
1205  }
1206 }
1207 
1213 template <typename Expr, typename Result>
1214 void std_sub_evaluate(Expr&& expr, Result&& result) {
1215 #ifdef DEBUG_EVALUATOR
1216  std::cout << result << "-=" << expr << std::endl;
1217 #endif
1218 
1219  if constexpr (direct_assign_compatible<Expr, Result>) {
1220  standard_evaluator::sub_evaluate(expr, result);
1221  } else {
1222  inc_counter("eval:transpose");
1223  standard_evaluator::sub_evaluate(transpose(expr), result);
1224  }
1225 }
1226 
1232 template <typename Expr, typename Result>
1233 void std_mul_evaluate(Expr&& expr, Result&& result) {
1234 #ifdef DEBUG_EVALUATOR
1235  std::cout << result << "*=" << expr << std::endl;
1236 #endif
1237 
1238  if constexpr (direct_assign_compatible<Expr, Result>) {
1239  standard_evaluator::mul_evaluate(expr, result);
1240  } else {
1241  inc_counter("eval:transpose");
1242  standard_evaluator::mul_evaluate(transpose(expr), result);
1243  }
1244 }
1245 
1251 template <typename Expr, typename Result>
1252 void std_div_evaluate(Expr&& expr, Result&& result) {
1253 #ifdef DEBUG_EVALUATOR
1254  std::cout << result << "/=" << expr << std::endl;
1255 #endif
1256 
1257  if constexpr (direct_assign_compatible<Expr, Result>) {
1258  standard_evaluator::div_evaluate(expr, result);
1259  } else {
1260  inc_counter("eval:transpose");
1261  standard_evaluator::div_evaluate(transpose(expr), result);
1262  }
1263 }
1264 
1270 template <typename Expr, typename Result>
1271 void std_mod_evaluate(Expr&& expr, Result&& result) {
1272 #ifdef DEBUG_EVALUATOR
1273  std::cout << result << "%=" << expr << std::endl;
1274 #endif
1275 
1276  if constexpr (direct_assign_compatible<Expr, Result>) {
1277  standard_evaluator::mod_evaluate(expr, result);
1278  } else {
1279  inc_counter("eval:transpose");
1280  standard_evaluator::mod_evaluate(transpose(expr), result);
1281  }
1282 }
1283 
1291 template <typename Expr>
1292 void force(Expr&& expr) {
1293  standard_evaluator::pre_assign_rhs(expr);
1294 }
1295 
1296 } //end of namespace etl
void gpu_compound_mul_scalar_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:858
constexpr int complexity([[maybe_unused]] const E &expr) noexcept
Return the complexity of the expression.
Definition: helpers.hpp:38
void sub_evaluate_no_gpu(E &&expr, R &&result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:685
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:125
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
void standard_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:931
void gpu_compound_div_scalar_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1051
void std_assign_evaluate(Expr &&expr, Result &&result)
Evaluation of the expr into result.
Definition: evaluator.hpp:1176
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:93
void fast_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:173
void gpu_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:631
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:113
void vectorized_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:991
void vectorized_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:393
void standard_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:734
constexpr bool cuda_enabled
Indicates if CUDA is available.
Definition: config.hpp:94
void direct_copy(const S *first, const S *last, T *target)
Performs a direct memory copy.
Definition: memory.hpp:24
void force(Expr &&expr)
Force the internal evaluation of an expression.
Definition: evaluator.hpp:1292
void mul_evaluate(E &&expr, R &&result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:898
void standard_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:80
void direct_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:562
void vectorized_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:794
void direct_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:231
Contains TMP selectors to select evaluation methods based on configuration.
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:29
auto transpose(const E &value)
Returns the transpose of the given expression.
Definition: expression_builder.hpp:528
void standard_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:536
void div_evaluate(E &&expr, R &&result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1092
Traits to get information about ETL types.
Definition: tmp.hpp:68
Root namespace for the ETL library.
Definition: adapter.hpp:15
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50
constexpr bool egblas_enabled
Indicates if the EGBLAS library is available for ETL.
Definition: config.hpp:119
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99
void par_exec(E &&expr, R &&result)
Assign the result of the expression to the result with the given Functor, using parallel implementati...
Definition: evaluator.hpp:59
Visitor to perform local evaluation when necessary.
Definition: eval_visitors.hpp:23
constexpr size_t parallel_threshold
The minimum number of elements before considering parallel implementation.
Definition: threshold.hpp:66
void std_mod_evaluate(Expr &&expr, Result &&result)
Compound modulo evaluation of the expr into result.
Definition: evaluator.hpp:1271
void add_evaluate(E &&expr, R &&result)
Add the result of the expression to the result.
Definition: evaluator.hpp:503
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:156
void pre_assign_rhs(E &&expr)
Allocate temporaries and evaluate sub expressions in RHS.
Definition: evaluator.hpp:48
void add_evaluate_no_gpu(E &&expr, R &&result)
Add the result of the expression to the result.
Definition: evaluator.hpp:487
void std_mul_evaluate(Expr &&expr, Result &&result)
Compound multiply evaluation of the expr into result.
Definition: evaluator.hpp:1233
void mul_evaluate_no_gpu(E &&expr, R &&result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:882
void sub_evaluate(E &&expr, R &&result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:701
void gpu_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1026
void gpu_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:195
void safe_ensure_cpu_up_to_date(E &&expr)
Ensure that the CPU is up to date.
Definition: helpers.hpp:278
void assign_evaluate(E &&expr, R &&result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:1144
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
void div_evaluate_no_gpu(E &&expr, R &&result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:1076
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:52
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:242
void standard_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:336
void direct_compound_div_impl(E &expr, R &result)
Divide the result by the result of the expression.
Definition: evaluator.hpp:957
void direct_compound_add_impl(E &expr, R &result)
Add the result of the expression to the result.
Definition: evaluator.hpp:359
void direct_compound_mul_impl(E &expr, R &result)
Multiply the result by the result of the expression.
Definition: evaluator.hpp:760
void vectorized_assign_impl(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:263
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:157
bool is_something_forced()
Indicates if some implementation is forced in the context.
Definition: context.hpp:60
void std_sub_evaluate(Expr &&expr, Result &&result)
Compound subtract evaluation of the expr into result.
Definition: evaluator.hpp:1214
void vectorized_compound_sub_impl(E &expr, R &result)
Subtract the result of the expression from the result.
Definition: evaluator.hpp:596
constexpr bool direct_assign_compatible
Traits indicating if a direct assign is possible.
Definition: evaluator.hpp:1165
constexpr bool parallel_support
Indicates if support for parallelization is integrated into the framework.
Definition: config.hpp:51
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Assign rhs to lhs.
Definition: linear_eval_functors.hpp:61
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:199
bool engine_select_parallel([[maybe_unused]] size_t n, [[maybe_unused]] size_t threshold=parallel_threshold)
Indicates if a 1D evaluation should run in parallel.
Definition: parallel_support.hpp:679
decltype(auto) smart_gpu_compute_hint(E &expr, Y &y)
Compute the expression into a representation that is GPU up to date.
Definition: helpers.hpp:368
Contains the linear functors used by the evaluator to perform its actions.
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
void std_div_evaluate(Expr &&expr, Result &&result)
Compound divide evaluation of the expr into result.
Definition: evaluator.hpp:1252
void mod_evaluate(E &&expr, R &&result)
Modulo the result by the result of the expression.
Definition: evaluator.hpp:1122
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
decltype(auto) smart_gpu_compute(X &x, Y &y)
Compute the expression into a representation that is GPU up to date and store this representation in ...
Definition: helpers.hpp:397
void std_add_evaluate(Expr &&expr, Result &&result)
Compound add evaluation of the expr into result.
Definition: evaluator.hpp:1195
void fast_assign_impl_full(E &expr, R &result)
Assign the result of the expression to the result.
Definition: evaluator.hpp:98