Expression Templates Library (ETL)
bias_add.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 namespace etl::impl::vec {
16 
23 template <typename V, typename L, typename R, typename C>
24 void bias_add_4d_impl(const L& x, const R& b, C&& y) {
25  using vec_type = V;
26  using T = value_t<L>;
27 
28  static constexpr size_t vec_size = vec_type::template traits<T>::size;
29 
30  const auto B = etl::dim<0>(x);
31  const auto M = etl::dim<2>(x);
32  const auto N = etl::dim<3>(x);
33  const auto MN = M * N;
34 
35  x.ensure_cpu_up_to_date();
36  b.ensure_cpu_up_to_date();
37 
38  // Note: This kernel is particularly adapted for large inner (MN) dimensions
39 
40  auto batch_fun = [&](size_t first, size_t last) {
41  for (size_t i = first; i < last; ++i) {
42  for (size_t j = 0; j < etl::dim<1>(x); ++j) {
43  auto x_s = x(i)(j).memory_start();
44  auto y_s = y(i)(j).memory_start();
45 
46  auto b1 = vec_type::set(b[j]);
47 
48  size_t m = 0;
49 
50  for (; m + vec_size * 8 - 1 < MN; m += vec_size * 8) {
51  auto x1 = vec_type::loadu(x_s + m + 0 * vec_size);
52  auto x2 = vec_type::loadu(x_s + m + 1 * vec_size);
53  auto x3 = vec_type::loadu(x_s + m + 2 * vec_size);
54  auto x4 = vec_type::loadu(x_s + m + 3 * vec_size);
55  auto x5 = vec_type::loadu(x_s + m + 4 * vec_size);
56  auto x6 = vec_type::loadu(x_s + m + 5 * vec_size);
57  auto x7 = vec_type::loadu(x_s + m + 6 * vec_size);
58  auto x8 = vec_type::loadu(x_s + m + 7 * vec_size);
59 
60  auto r1 = vec_type::add(x1, b1);
61  auto r2 = vec_type::add(x2, b1);
62  auto r3 = vec_type::add(x3, b1);
63  auto r4 = vec_type::add(x4, b1);
64  auto r5 = vec_type::add(x5, b1);
65  auto r6 = vec_type::add(x6, b1);
66  auto r7 = vec_type::add(x7, b1);
67  auto r8 = vec_type::add(x8, b1);
68 
69  vec_type::storeu(y_s + m + 0 * vec_size, r1);
70  vec_type::storeu(y_s + m + 1 * vec_size, r2);
71  vec_type::storeu(y_s + m + 2 * vec_size, r3);
72  vec_type::storeu(y_s + m + 3 * vec_size, r4);
73  vec_type::storeu(y_s + m + 4 * vec_size, r5);
74  vec_type::storeu(y_s + m + 5 * vec_size, r6);
75  vec_type::storeu(y_s + m + 6 * vec_size, r7);
76  vec_type::storeu(y_s + m + 7 * vec_size, r8);
77  }
78 
79  for (; m + vec_size * 4 - 1 < MN; m += vec_size * 4) {
80  auto x1 = vec_type::loadu(x_s + m + 0 * vec_size);
81  auto x2 = vec_type::loadu(x_s + m + 1 * vec_size);
82  auto x3 = vec_type::loadu(x_s + m + 2 * vec_size);
83  auto x4 = vec_type::loadu(x_s + m + 3 * vec_size);
84 
85  auto r1 = vec_type::add(x1, b1);
86  auto r2 = vec_type::add(x2, b1);
87  auto r3 = vec_type::add(x3, b1);
88  auto r4 = vec_type::add(x4, b1);
89 
90  vec_type::storeu(y_s + m + 0 * vec_size, r1);
91  vec_type::storeu(y_s + m + 1 * vec_size, r2);
92  vec_type::storeu(y_s + m + 2 * vec_size, r3);
93  vec_type::storeu(y_s + m + 3 * vec_size, r4);
94  }
95 
96  for (; m + vec_size * 2 - 1 < MN; m += vec_size * 2) {
97  auto x1 = vec_type::loadu(x_s + m + 0 * vec_size);
98  auto x2 = vec_type::loadu(x_s + m + 1 * vec_size);
99 
100  auto r1 = vec_type::add(x1, b1);
101  auto r2 = vec_type::add(x2, b1);
102 
103  vec_type::storeu(y_s + m + 0 * vec_size, r1);
104  vec_type::storeu(y_s + m + 1 * vec_size, r2);
105  }
106 
107  for (; m + vec_size - 1 < MN; m += vec_size) {
108  auto x1 = vec_type::loadu(x_s + m);
109 
110  auto r1 = vec_type::add(x1, b1);
111 
112  vec_type::storeu(y_s + m, r1);
113  }
114 
115  for (; m < MN; ++m) {
116  y_s[m] = x_s[m] + b[j];
117  }
118  }
119  }
120  };
121 
122  // TODO The gain of dispatching has to be checked again
123  batch_fun(0, B);
124 
125  y.invalidate_gpu();
126 }
127 
134 template <typename V, typename L, typename R, typename C>
135 void bias_add_2d_impl(const L& x, const R& b, C&& y) {
136  using vec_type = V;
137  using T = value_t<L>;
138 
139  static constexpr size_t vec_size = vec_type::template traits<T>::size;
140 
141  const auto B = etl::dim<0>(x);
142  const auto K = etl::dim<1>(x);
143 
144  x.ensure_cpu_up_to_date();
145  b.ensure_cpu_up_to_date();
146 
147  // Note: This kernel is particularly adapted for large inner (MN) dimensions
148 
149  auto batch_fun = [&](size_t first, size_t last) {
150  auto b_s = b.memory_start();
151  auto x_s = x.memory_start();
152  auto y_s = y.memory_start();
153 
154  for (size_t i = first; i < last; ++i) {
155  size_t j = 0;
156 
157  for (; j + vec_size - 1 < K; j += vec_size) {
158  auto r1 = vec_type::loadu(b_s + j);
159  auto x1 = vec_type::loadu(x_s + i * K + j);
160  auto t1 = vec_type::add(r1, x1);
161  vec_type::storeu(y_s + i * K + j, t1);
162  }
163 
164  for (; j < K; ++j) {
165  y(i, j) = x(i, j) + b(j);
166  }
167  }
168  };
169 
170  batch_fun(0, B);
171 
172  y.invalidate_gpu();
173 }
174 
181 template <typename A, typename B, typename C>
182 void bias_add_4d(const A& x, const B& b, C&& y) {
183  bias_add_4d_impl<default_vec>(x, b, y);
184 }
185 
192 template <typename A, typename B, typename C>
193 void bias_add_2d(const A& x, const B& b, C&& y) {
194  bias_add_2d_impl<default_vec>(x, b, y);
195 }
196 
197 } //end of namespace etl::impl::vec
Definition: bias_add.hpp:15
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
void bias_add_2d_impl(const L &x, const R &b, C &&y)
Compute the bias addition of b into x and store the result in y.
Definition: bias_add.hpp:135
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
bias_add_2d_expr< detail::build_type< E >, detail::build_type< B > > bias_add_2d(const E &x, const B &biases)
Returns the result of adding the bias [K] to the 2D matrix [B, K].
Definition: bias_add_2d_expr.hpp:378
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
bias_add_4d_expr< detail::build_type< E >, detail::build_type< B > > bias_add_4d(const E &x, const B &biases)
Returns the result of adding the bias [K] to the 4D matrix [N1, K, N2, N3].
Definition: bias_add_4d_expr.hpp:388
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
void bias_add_4d_impl(const L &x, const R &b, C &&y)
Compute the bias addition of b into x and store the result in y.
Definition: bias_add.hpp:24