Expression Templates Library (ETL)
vec_eval_functors.hpp
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
14 #pragma once
15 
16 namespace etl::detail {
17 
21 template <vector_mode_t V>
24 
/*!
 * \brief Load a vector of elements from an expression.
 *
 * Thin forwarding helper: it calls x.load<vect_impl>(i) and returns the
 * result by value (plain auto return type).
 *
 * \param x The expression to read from
 * \param i The position passed to the expression's load function
 * \return The value produced by x.load<vect_impl>(i)
 */
30  template <typename T>
31  static inline auto load(T&& x, size_t i) {
32  return x.template load<vect_impl>(i);
33  }
34 };
35 
42 template <vector_mode_t V>
45  using base_t::load;
46  using vect_impl = typename base_t::vect_impl;
47 
51  template <typename L_Expr, typename R_Expr>
52  static void apply(L_Expr&& lhs, R_Expr&& rhs) {
53  using IT = typename get_intrinsic_traits<V>::template type<value_t<R_Expr>>;
54 
55  const size_t N = etl::size(lhs);
56 
57  auto* lhs_mem = lhs.memory_start();
58 
59  constexpr bool remainder = !padding || !all_padded<L_Expr, R_Expr>;
60 
61  const size_t last = remainder ? prev_multiple(N, IT::size) : N;
62 
63  size_t i = 0;
64 
65  // 0. If possible and interesting, use streaming stores
66 
67  if constexpr (streaming) {
68  if (N > stream_threshold / (sizeof(value_t<L_Expr>) * 3) && !rhs.alias(lhs)) {
69  for (; i < last; i += IT::size) {
70  lhs.template stream<vect_impl>(load(rhs, i), i);
71  }
72 
73  for (; remainder && i < N; ++i) {
74  lhs_mem[i] = rhs[i];
75  }
76 
77  return;
78  }
79  }
80 
81  // 1. In the default case, simple unrolled vectorization
82 
83  for (; i + (IT::size * 3) < last; i += 4 * IT::size) {
84  lhs.template store<vect_impl>(load(rhs, i + 0 * IT::size), i + 0 * IT::size);
85  lhs.template store<vect_impl>(load(rhs, i + 1 * IT::size), i + 1 * IT::size);
86  lhs.template store<vect_impl>(load(rhs, i + 2 * IT::size), i + 2 * IT::size);
87  lhs.template store<vect_impl>(load(rhs, i + 3 * IT::size), i + 3 * IT::size);
88  }
89 
90  for (; i < last; i += IT::size) {
91  lhs.template store<vect_impl>(load(rhs, i), i);
92  }
93 
94  for (; remainder && i < N; ++i) {
95  lhs_mem[i] = rhs[i];
96  }
97  }
98 };
99 
103 template <vector_mode_t V>
106  using base_t::load;
107  using vect_impl = typename base_t::vect_impl;
108 
112  template <typename L_Expr, typename R_Expr>
113  static void apply(L_Expr&& lhs, R_Expr&& rhs) {
114  using IT = typename get_intrinsic_traits<V>::template type<value_t<R_Expr>>;
115 
116  const size_t N = etl::size(lhs);
117 
118  auto* lhs_mem = lhs.memory_start();
119 
120  constexpr bool remainder = !padding || !all_padded<L_Expr, R_Expr>;
121 
122  const size_t last = remainder ? prev_multiple(N, IT::size) : N;
123 
124  size_t i = 0;
125 
126  for (; i + (IT::size * 3) < last; i += 4 * IT::size) {
127  lhs.template store<vect_impl>(vect_impl::add(load(lhs, i + 0 * IT::size), load(rhs, i + 0 * IT::size)), i + 0 * IT::size);
128  lhs.template store<vect_impl>(vect_impl::add(load(lhs, i + 1 * IT::size), load(rhs, i + 1 * IT::size)), i + 1 * IT::size);
129  lhs.template store<vect_impl>(vect_impl::add(load(lhs, i + 2 * IT::size), load(rhs, i + 2 * IT::size)), i + 2 * IT::size);
130  lhs.template store<vect_impl>(vect_impl::add(load(lhs, i + 3 * IT::size), load(rhs, i + 3 * IT::size)), i + 3 * IT::size);
131  }
132 
133  for (; i < last; i += IT::size) {
134  lhs.template store<vect_impl>(vect_impl::add(load(lhs, i), load(rhs, i)), i);
135  }
136 
137  for (; remainder && i < N; ++i) {
138  lhs_mem[i] += rhs[i];
139  }
140  }
141 };
142 
146 template <vector_mode_t V>
149  using base_t::load;
150  using vect_impl = typename base_t::vect_impl;
151 
155  template <typename L_Expr, typename R_Expr>
156  static void apply(L_Expr&& lhs, R_Expr&& rhs) {
157  using IT = typename get_intrinsic_traits<V>::template type<value_t<R_Expr>>;
158 
159  const size_t N = etl::size(lhs);
160 
161  auto* lhs_mem = lhs.memory_start();
162 
163  constexpr bool remainder = !padding || !all_padded<L_Expr, R_Expr>;
164 
165  const size_t last = remainder ? prev_multiple(N, IT::size) : N;
166 
167  size_t i = 0;
168 
169  for (; i + (IT::size * 3) < last; i += 4 * IT::size) {
170  lhs.template store<vect_impl>(vect_impl::sub(load(lhs, i + 0 * IT::size), load(rhs, i + 0 * IT::size)), i + 0 * IT::size);
171  lhs.template store<vect_impl>(vect_impl::sub(load(lhs, i + 1 * IT::size), load(rhs, i + 1 * IT::size)), i + 1 * IT::size);
172  lhs.template store<vect_impl>(vect_impl::sub(load(lhs, i + 2 * IT::size), load(rhs, i + 2 * IT::size)), i + 2 * IT::size);
173  lhs.template store<vect_impl>(vect_impl::sub(load(lhs, i + 3 * IT::size), load(rhs, i + 3 * IT::size)), i + 3 * IT::size);
174  }
175 
176  for (; i < last; i += IT::size) {
177  lhs.template store<vect_impl>(vect_impl::sub(load(lhs, i), load(rhs, i)), i);
178  }
179 
180  for (; remainder && i < N; ++i) {
181  lhs_mem[i] -= rhs[i];
182  }
183  }
184 };
185 
189 template <vector_mode_t V>
192  using base_t::load;
193  using vect_impl = typename base_t::vect_impl;
194 
198  template <typename L_Expr, typename R_Expr>
199  static void apply(L_Expr&& lhs, R_Expr&& rhs) {
200  using IT = typename get_intrinsic_traits<V>::template type<value_t<R_Expr>>;
201 
202  const size_t N = etl::size(lhs);
203 
204  auto* lhs_mem = lhs.memory_start();
205 
206  constexpr bool remainder = !padding || !all_padded<L_Expr, R_Expr>;
207 
208  const size_t last = remainder ? prev_multiple(N, IT::size) : N;
209 
210  size_t i = 0;
211 
212  for (; i + (IT::size * 3) < last; i += 4 * IT::size) {
213  lhs.template store<vect_impl>(vect_impl::mul(load(lhs, i + 0 * IT::size), load(rhs, i + 0 * IT::size)), i + 0 * IT::size);
214  lhs.template store<vect_impl>(vect_impl::mul(load(lhs, i + 1 * IT::size), load(rhs, i + 1 * IT::size)), i + 1 * IT::size);
215  lhs.template store<vect_impl>(vect_impl::mul(load(lhs, i + 2 * IT::size), load(rhs, i + 2 * IT::size)), i + 2 * IT::size);
216  lhs.template store<vect_impl>(vect_impl::mul(load(lhs, i + 3 * IT::size), load(rhs, i + 3 * IT::size)), i + 3 * IT::size);
217  }
218 
219  for (; i < last; i += IT::size) {
220  lhs.template store<vect_impl>(vect_impl::mul(load(lhs, i), load(rhs, i)), i);
221  }
222 
223  for (; remainder && i < N; ++i) {
224  lhs_mem[i] *= rhs[i];
225  }
226  }
227 };
228 
232 template <vector_mode_t V>
235  using base_t::load;
236  using vect_impl = typename base_t::vect_impl;
237 
241  template <typename L_Expr, typename R_Expr>
242  static void apply(L_Expr&& lhs, R_Expr&& rhs) {
243  using IT = typename get_intrinsic_traits<V>::template type<value_t<R_Expr>>;
244 
245  const size_t N = etl::size(lhs);
246 
247  auto* lhs_mem = lhs.memory_start();
248 
249  constexpr bool remainder = !padding || !all_padded<L_Expr, R_Expr>;
250 
251  const size_t last = remainder ? prev_multiple(N, IT::size) : N;
252 
253  size_t i = 0;
254 
255  for (; i + (IT::size * 3) < last; i += 4 * IT::size) {
256  lhs.template store<vect_impl>(vect_impl::div(load(lhs, i + 0 * IT::size), load(rhs, i + 0 * IT::size)), i + 0 * IT::size);
257  lhs.template store<vect_impl>(vect_impl::div(load(lhs, i + 1 * IT::size), load(rhs, i + 1 * IT::size)), i + 1 * IT::size);
258  lhs.template store<vect_impl>(vect_impl::div(load(lhs, i + 2 * IT::size), load(rhs, i + 2 * IT::size)), i + 2 * IT::size);
259  lhs.template store<vect_impl>(vect_impl::div(load(lhs, i + 3 * IT::size), load(rhs, i + 3 * IT::size)), i + 3 * IT::size);
260  }
261 
262  for (; i < last; i += IT::size) {
263  lhs.template store<vect_impl>(vect_impl::div(load(lhs, i), load(rhs, i)), i);
264  }
265 
266  for (; remainder && i < N; ++i) {
267  lhs_mem[i] /= rhs[i];
268  }
269  }
270 };
271 
272 } //end of namespace etl::detail
constexpr bool padding
Indicates if ETL is allowed to pad matrices and vectors.
Definition: config.hpp:135
Traits to get the intrinsic traits for a vector mode.
Definition: vectorization.hpp:88
Functor for vectorized compound assign div.
Definition: vec_eval_functors.hpp:233
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:113
Functor for vectorized compound assign add.
Definition: vec_eval_functors.hpp:104
Definition: expression_builder.hpp:699
Common base for vectorized functors.
Definition: vec_eval_functors.hpp:22
auto load(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:143
constexpr size_t stream_threshold
The threshold at which stream is used.
Definition: threshold.hpp:80
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:156
Functor for vectorized assign.
Definition: vec_eval_functors.hpp:43
Functor for vectorized compound assign sub.
Definition: vec_eval_functors.hpp:147
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:52
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:242
typename get_vector_impl< V >::type vect_impl
The vectorization type.
Definition: vec_eval_functors.hpp:23
Functor for vectorized compound assign mul.
Definition: vec_eval_functors.hpp:190
static void apply(L_Expr &&lhs, R_Expr &&rhs)
Compute the vectorized iterations of the loop using aligned store operations.
Definition: vec_eval_functors.hpp:199
static auto load(T &&x, size_t i)
Load a vector from lhs at position i.
Definition: vec_eval_functors.hpp:31
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
constexpr bool streaming
Indicates if ETL is allowed to perform streaming (non-temporal writes).
Definition: config.hpp:130
Vectorization support when no vectorization is enabled.
Definition: no_vectorization.hpp:29