Expression Templates Library (ETL)
bias_batch_var_4d_expr.hpp
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
8 #pragma once
9 
10 #include "etl/expr/base_temporary_expr.hpp"
11 
13 
14 namespace etl {
15 
20 template <etl_4d A, etl_1d B>
21 struct bias_batch_var_4d_expr : base_temporary_expr_bin<bias_batch_var_4d_expr<A, B>, A, B> {
26 
27  static constexpr auto storage_order = sub_traits::storage_order;
28 
33  static constexpr bool gpu_computable =
34  (impl::egblas::has_sbias_batch_var4 && all_row_major<A> && all_single_precision<A>)
35  || (impl::egblas::has_dbias_batch_var4 && all_row_major<A> && all_double_precision<A>);
36 
41  explicit bias_batch_var_4d_expr(A a, B b) : base_type(a, b) {
42  //Nothing else to init
43  }
44 
50  template <etl_1d C>
51  static void check([[maybe_unused]] const A& a, [[maybe_unused]] const B& b, [[maybe_unused]] const C& c) {
52  if constexpr (all_fast<A, B, C>) {
53  static_assert(etl::dim<1, A>() == etl::dim<0, C>(), "Invalid dimensions for bias_batch_var_4d");
54  static_assert(etl::dim<0, B>() == etl::dim<0, C>(), "Invalid dimensions for bias_batch_var_4d");
55  } else {
56  cpp_assert(etl::dim<1>(a) == etl::dim<0>(c), "Invalid dimensions for bias_batch_var_4d");
57  cpp_assert(etl::dim<0>(b) == etl::dim<0>(c), "Invalid dimensions for bias_batch_var_4d");
58  }
59  }
60 
61  // Assignment functions
62 
67  template <etl_expr L>
68  void assign_to(L&& lhs) const {
69  inc_counter("temp:assign");
70 
71  auto& a = this->a();
72  auto& b = this->b();
73 
74  check(a, b, lhs);
75 
76  using T = value_t<A>;
77 
78  const auto N = etl::dim<0>(a);
79  const auto K = etl::dim<1>(a);
80  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
81 
82  if constexpr (impl::egblas::has_sbias_batch_var4 && all_row_major<A> && all_floating<A, L>) {
83  const auto W = etl::dim<2>(a);
84  const auto H = etl::dim<3>(a);
85 
86  decltype(auto) t1 = smart_forward_gpu(a);
87  decltype(auto) t2 = smart_forward_gpu(b);
88 
89  t1.ensure_gpu_up_to_date();
90  t2.ensure_gpu_up_to_date();
91 
92  lhs.ensure_gpu_allocated();
93 
94  impl::egblas::bias_batch_var4(N, K, W, H, t1.gpu_memory(), t2.gpu_memory(), lhs.gpu_memory());
95 
96  lhs.validate_gpu();
97  lhs.invalidate_cpu();
98  } else {
99  standard_evaluator::pre_assign_rhs(a);
100  standard_evaluator::pre_assign_rhs(b);
101 
102  a.ensure_cpu_up_to_date();
103  b.ensure_cpu_up_to_date();
104 
105  // Note: We use etl::sum directly instead of doing the sum manually
106  // That way, we will access the already vectorized sum
107  // Now, this means that evaluator decisions will be called several
108  // times. This could be an issue that could be looked at in the future
109 
110  auto batch_fun_k = [&](const size_t first, const size_t last) {
111  CPU_SECTION {
112  for (size_t k = first; k < last; ++k) {
113  lhs(k) = 0;
114  }
115 
116  for (size_t bb = 0; bb < N; ++bb) {
117  for (size_t k = first; k < last; ++k) {
118  lhs(k) += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
119  }
120  }
121 
122  for (size_t k = first; k < last; ++k) {
123  lhs(k) /= F;
124  }
125  }
126  };
127 
128  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
129 
130  lhs.validate_cpu();
131  lhs.invalidate_gpu();
132  }
133  }
134 
139  template <etl_expr L>
140  void assign_add_to(L&& lhs) const {
141  auto& a = this->a();
142  auto& b = this->b();
143 
144  standard_evaluator::pre_assign_rhs(a);
145  standard_evaluator::pre_assign_rhs(b);
146 
147  using T = value_t<A>;
148 
149  const auto N = etl::dim<0>(a);
150  const auto K = etl::dim<1>(a);
151  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
152 
153  check(a, b, lhs);
154 
155  a.ensure_cpu_up_to_date();
156  b.ensure_cpu_up_to_date();
157  lhs.ensure_cpu_up_to_date();
158 
159  auto batch_fun_k = [&](const size_t first, const size_t last) {
160  CPU_SECTION {
161  for (size_t k = first; k < last; ++k) {
162  T var = 0;
163 
164  for (size_t bb = 0; bb < N; ++bb) {
165  var += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
166  }
167 
168  lhs(k) += var / F;
169  }
170  }
171  };
172 
173  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
174 
175  lhs.validate_cpu();
176  lhs.invalidate_gpu();
177  }
178 
183  template <etl_expr L>
184  void assign_sub_to(L&& lhs) const {
185  auto& a = this->a();
186  auto& b = this->b();
187 
188  standard_evaluator::pre_assign_rhs(a);
189  standard_evaluator::pre_assign_rhs(b);
190 
191  using T = value_t<A>;
192 
193  const auto N = etl::dim<0>(a);
194  const auto K = etl::dim<1>(a);
195  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
196 
197  check(a, b, lhs);
198 
199  a.ensure_cpu_up_to_date();
200  b.ensure_cpu_up_to_date();
201  lhs.ensure_cpu_up_to_date();
202 
203  auto batch_fun_k = [&](const size_t first, const size_t last) {
204  CPU_SECTION {
205  for (size_t k = first; k < last; ++k) {
206  T var = 0;
207 
208  for (size_t bb = 0; bb < N; ++bb) {
209  var += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
210  }
211 
212  lhs(k) -= var / F;
213  }
214  }
215  };
216 
217  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
218 
219  lhs.validate_cpu();
220  lhs.invalidate_gpu();
221  }
222 
227  template <etl_expr L>
228  void assign_mul_to(L&& lhs) const {
229  auto& a = this->a();
230  auto& b = this->b();
231 
232  standard_evaluator::pre_assign_rhs(a);
233  standard_evaluator::pre_assign_rhs(b);
234 
235  using T = value_t<A>;
236 
237  const auto N = etl::dim<0>(a);
238  const auto K = etl::dim<1>(a);
239  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
240 
241  check(a, b, lhs);
242 
243  a.ensure_cpu_up_to_date();
244  b.ensure_cpu_up_to_date();
245  lhs.ensure_cpu_up_to_date();
246 
247  auto batch_fun_k = [&](const size_t first, const size_t last) {
248  CPU_SECTION {
249  for (size_t k = first; k < last; ++k) {
250  T var = 0;
251 
252  for (size_t bb = 0; bb < N; ++bb) {
253  var += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
254  }
255 
256  lhs(k) *= var / F;
257  }
258  }
259  };
260 
261  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
262 
263  lhs.validate_cpu();
264  lhs.invalidate_gpu();
265  }
266 
271  template <etl_expr L>
272  void assign_div_to(L&& lhs) const {
273  auto& a = this->a();
274  auto& b = this->b();
275 
276  standard_evaluator::pre_assign_rhs(a);
277  standard_evaluator::pre_assign_rhs(b);
278 
279  using T = value_t<A>;
280 
281  const auto N = etl::dim<0>(a);
282  const auto K = etl::dim<1>(a);
283  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
284 
285  check(a, b, lhs);
286 
287  a.ensure_cpu_up_to_date();
288  b.ensure_cpu_up_to_date();
289  lhs.ensure_cpu_up_to_date();
290 
291  auto batch_fun_k = [&](const size_t first, const size_t last) {
292  CPU_SECTION {
293  for (size_t k = first; k < last; ++k) {
294  T var = 0;
295 
296  for (size_t bb = 0; bb < N; ++bb) {
297  var += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
298  }
299 
300  lhs(k) /= var / F;
301  }
302  }
303  };
304 
305  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
306 
307  lhs.validate_cpu();
308  lhs.invalidate_gpu();
309  }
310 
315  template <etl_expr L>
316  void assign_mod_to(L&& lhs) const {
317  auto& a = this->a();
318  auto& b = this->b();
319 
320  standard_evaluator::pre_assign_rhs(a);
321  standard_evaluator::pre_assign_rhs(b);
322 
323  using T = value_t<A>;
324 
325  const auto N = etl::dim<0>(a);
326  const auto K = etl::dim<1>(a);
327  const auto F = static_cast<T>(etl::size(a) / etl::size(lhs));
328 
329  check(a, b, lhs);
330 
331  a.ensure_cpu_up_to_date();
332  b.ensure_cpu_up_to_date();
333  lhs.ensure_cpu_up_to_date();
334 
335  auto batch_fun_k = [&](const size_t first, const size_t last) {
336  CPU_SECTION {
337  for (size_t k = first; k < last; ++k) {
338  T var = 0;
339 
340  for (size_t bb = 0; bb < N; ++bb) {
341  var += sum((a(bb)(k) - b(k)) >> (a(bb)(k) - b(k)));
342  }
343 
344  lhs(k) %= var / F;
345  }
346  }
347  };
348 
349  engine_dispatch_1d_serial(batch_fun_k, 0, K, 2UL);
350 
351  lhs.validate_cpu();
352  lhs.invalidate_gpu();
353  }
354 
361  friend std::ostream& operator<<(std::ostream& os, const bias_batch_var_4d_expr& expr) {
362  return os << "bias_batch_var_4d(" << expr._a << ")";
363  }
364 };
365 
370 template <typename A, typename B>
373  using sub_expr_t = std::decay_t<A>;
376 
377  static constexpr bool is_etl = true;
378  static constexpr bool is_transformer = false;
379  static constexpr bool is_view = false;
380  static constexpr bool is_magic_view = false;
381  static constexpr bool is_fast = sub_traits::is_fast;
382  static constexpr bool is_linear = false;
383  static constexpr bool is_thread_safe = true;
384  static constexpr bool is_value = false;
385  static constexpr bool is_direct = true;
386  static constexpr bool is_generator = false;
387  static constexpr bool is_padded = false;
388  static constexpr bool is_aligned = true;
389  static constexpr bool is_temporary = true;
390  static constexpr order storage_order = sub_traits::storage_order;
391  static constexpr bool gpu_computable = is_gpu_t<value_type> && cuda_enabled;
392 
398  template <vector_mode_t V>
399  static constexpr bool vectorizable = true;
400 
405  template <size_t DD>
406  static constexpr size_t dim() requires(DD == 0) {
407  return decay_traits<A>::template dim<1>();
408  }
409 
416  static size_t dim(const expr_t& e, [[maybe_unused]] size_t d) {
417  cpp_assert(d == 0, "Invalid dimensions access");
418  return etl::dim<1>(e._a);
419  }
420 
426  static size_t size(const expr_t& e) {
427  return etl::dim<1>(e._a);
428  }
429 
434  static constexpr size_t size() {
435  return decay_traits<A>::template dim<1>();
436  }
437 
/*!
 * \brief Returns the number of dimensions of the expression
 * \return always 1
 */
static constexpr size_t dimensions() {
    constexpr size_t rank = 1;
    return rank;
}
445 
/*!
 * \brief Returns the complexity reported for the expression
 * \return always -1
 */
static constexpr int complexity() noexcept {
    constexpr int value = -1;
    return value;
}
453 };
454 
460 template <etl_4d A, etl_1d B>
463 }
464 
465 } //end of namespace etl
friend std::ostream & operator<<(std::ostream &os, const bias_batch_var_4d_expr &expr)
Print a representation of the expression on the given stream.
Definition: bias_batch_var_4d_expr.hpp:361
constexpr int complexity([[maybe_unused]] const E &expr) noexcept
Return the complexity of the expression.
Definition: helpers.hpp:38
static void check([[maybe_unused]] const A &a, [[maybe_unused]] const B &b, [[maybe_unused]] const C &c)
Validate the dimensions of the operands and of the result.
Definition: bias_batch_var_4d_expr.hpp:51
value_t< A > value_type
The type of value of the expression.
Definition: bias_batch_var_4d_expr.hpp:22
void engine_dispatch_1d_serial(Functor &&functor, size_t first, size_t last, size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of a range to a functor in a parallel manner, using the global thread engine...
Definition: parallel_support.hpp:734
constexpr bool is_magic_view
Traits indicating if the given ETL type is a magic view expression.
Definition: traits.hpp:311
A _a
The sub expression reference.
Definition: base_temporary_expr.hpp:533
order
Storage order of a matrix.
Definition: order.hpp:15
constexpr bool cuda_enabled
Indicates if CUDA is available.
Definition: config.hpp:94
void assign_sub_to(L &&lhs) const
Sub from the given left-hand-side expression.
Definition: bias_batch_var_4d_expr.hpp:184
void assign_div_to(L &&lhs) const
Divide the given left-hand-side expression.
Definition: bias_batch_var_4d_expr.hpp:272
EGBLAS wrappers for the bias_batch_sum operation.
Abstract base class for temporary binary expression.
Definition: base_temporary_expr.hpp:529
std::add_lvalue_reference_t< B > b()
Returns the sub expression.
Definition: base_temporary_expr.hpp:593
std::decay_t< A > sub_expr_t
The sub expression type.
Definition: bias_batch_var_4d_expr.hpp:373
constexpr bool is_fast
Traits to test if the given ETL expression type is fast (sizes known at compile-time) ...
Definition: traits.hpp:588
void assign_to(L &&lhs) const
Assign to a matrix of the same storage order.
Definition: bias_batch_var_4d_expr.hpp:68
bias_batch_var_4d_expr< detail::build_type< A >, detail::build_type< B > > bias_batch_var_4d(const A &a, const B &b)
Returns the per-channel batch variance of the given 4D expression, computed around the values of the given 1D expression.
Definition: bias_batch_var_4d_expr.hpp:461
Traits to get information about ETL types.
Definition: tmp.hpp:68
Root namespace for the ETL library.
Definition: adapter.hpp:15
bias_batch_var_4d_expr(A a, B b)
Construct a new expression.
Definition: bias_batch_var_4d_expr.hpp:41
static constexpr auto storage_order
The sub storage order.
Definition: bias_batch_var_4d_expr.hpp:27
auto dim(E &&value, size_t i) -> detail::identity_helper< E, dim_view< detail::build_identity_type< E >, D >>
Return a view representing the ith Dth dimension.
Definition: view_expression_builder.hpp:25
std::conditional_t< is_etl_value< T >, const std::decay_t< T > &, std::decay_t< T > > build_type
Helper to build the type for a sub expression.
Definition: expression_helpers.hpp:24
void assign_add_to(L &&lhs) const
Add to the given left-hand-side expression.
Definition: bias_batch_var_4d_expr.hpp:140
constexpr bool is_transformer
Traits indicating if the given ETL type is a transformer expression.
Definition: traits.hpp:297
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
static constexpr bool gpu_computable
Indicates if the temporary expression can be directly evaluated using only GPU.
Definition: bias_batch_var_4d_expr.hpp:33
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
requires(D > 0) struct dyn_base
Matrix with run-time fixed dimensions.
Definition: dyn_base.hpp:113
constexpr bool is_view
Traits indicating if the given ETL type is a view expression.
Definition: traits.hpp:304
static constexpr bool is_fast
Indicates if T is a fast structure.
Definition: traits_base.hpp:25
A per-channel batch variance expression for 4D inputs.
Definition: bias_batch_var_4d_expr.hpp:21
void assign_mod_to(L &&lhs) const
Modulo the given left-hand-side expression.
Definition: bias_batch_var_4d_expr.hpp:316
constexpr bool is_thread_safe
Traits to test if the given ETL expression type is thread safe.
Definition: traits.hpp:687
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
value_t< A > value_type
The value type of the expression.
Definition: bias_batch_var_4d_expr.hpp:375
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
std::add_lvalue_reference_t< A > a()
Returns the sub expression.
Definition: base_temporary_expr.hpp:577
void assign_mul_to(L &&lhs) const
Multiply the given left-hand-side expression.
Definition: bias_batch_var_4d_expr.hpp:228