Expression Templates Library (ETL)
gemm_expr.hpp
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
8 #pragma once
9 
10 #include "etl/expr/base_temporary_expr.hpp"
11 
12 //The implementations
13 #include "etl/impl/std/gemm.hpp"
14 #include "etl/impl/std/strassen_mmul.hpp"
15 #include "etl/impl/blas/gemm.hpp"
16 #include "etl/impl/vec/gemm.hpp"
17 #include "etl/impl/vec/gemm_conv.hpp"
18 #include "etl/impl/cublas/gemm.hpp"
19 
20 namespace etl {
21 
26 template <etl_expr A, etl_expr B, bool Strassen>
27 struct gemm_expr : base_temporary_expr_bin<gemm_expr<A, B, Strassen>, A, B> {
32 
33  static constexpr auto storage_order = left_traits::storage_order;
34 
39  static constexpr bool gpu_computable = cublas_enabled && all_homogeneous<A, B>;
40 
41  const value_type alpha;
42 
47  explicit gemm_expr(A a, B b) : base_type(a, b), alpha(1) {
48  //Nothing else to init
49  }
50 
55  explicit gemm_expr(A a, B b, value_type alpha) : base_type(a, b), alpha(alpha) {
56  //Nothing else to init
57  }
58 
65  template <etl_expr C>
66  static void check([[maybe_unused]] const A& a, [[maybe_unused]] const B& b, [[maybe_unused]] const C& c) {
67  if constexpr (all_fast<A, B, C>) {
68  static_assert(dim<1, A>() == dim<0, B>() //interior dimensions
69  && dim<0, A>() == dim<0, C>() //exterior dimension 1
70  && dim<1, B>() == dim<1, C>(), //exterior dimension 2
71  "Invalid sizes for multiplication");
72  } else {
73  cpp_assert(dim<1>(a) == dim<0>(b) //interior dimensions
74  && dim<0>(a) == dim<0>(c) //exterior dimension 1
75  && dim<1>(b) == dim<1>(c), //exterior dimension 2
76  "Invalid sizes for multiplication");
77  }
78  }
79 
80  // Assignment functions
81 
86  template <typename AA, typename BB, typename C>
87  static constexpr gemm_impl select_default_gemm_impl(bool no_gpu) {
88  //Note since these boolean will be known at compile time, the conditions will be a lot simplified
89  constexpr bool blas = cblas_enabled;
90  constexpr bool cublas = cublas_enabled;
91  constexpr bool homo = all_homogeneous<AA, BB, C>;
92 
93  if (cublas && homo && !no_gpu) {
94  return gemm_impl::CUBLAS;
95  } else if (blas && homo) {
96  return gemm_impl::BLAS;
97  }
98 
99  if (vec_enabled && vectorize_impl && homo && all_vectorizable_t<vector_mode, AA, BB, C>) {
100  return gemm_impl::VEC;
101  }
102 
103  return gemm_impl::STD;
104  }
105 
106 #ifdef ETL_MANUAL_SELECT
107 
112  template <typename AA, typename BB, typename C>
113  static inline gemm_impl select_gemm_impl() {
114  auto def = select_default_gemm_impl<AA, BB, C>(local_context().cpu);
115 
116  if (local_context().gemm_selector.forced) {
117  auto forced = local_context().gemm_selector.impl;
118 
119  switch (forced) {
120  //CUBLAS cannot always be used
121  case gemm_impl::CUBLAS:
122  if (!cublas_enabled || !all_homogeneous<AA, BB, C> || local_context().cpu) { //COVERAGE_EXCLUDE_LINE
123  std::cerr << "Forced selection to CUBLAS gemm implementation, but not possible for this expression"
124  << std::endl; //COVERAGE_EXCLUDE_LINE
125  return def; //COVERAGE_EXCLUDE_LINE
126  } //COVERAGE_EXCLUDE_LINE
127 
128  return forced;
129 
130  //BLAS cannot always be used
131  case gemm_impl::BLAS:
132  if (!cblas_enabled || !all_homogeneous<AA, BB, C>) { //COVERAGE_EXCLUDE_LINE
133  std::cerr << "Forced selection to BLAS gemm implementation, but not possible for this expression" << std::endl; //COVERAGE_EXCLUDE_LINE
134  return def; //COVERAGE_EXCLUDE_LINE
135  } //COVERAGE_EXCLUDE_LINE
136 
137  return forced;
138 
139  //VEC cannot always be used
140  case gemm_impl::VEC:
141  if (!vec_enabled || !vectorize_impl || !all_vectorizable_t<vector_mode, AA, BB, C> || !all_homogeneous<AA, BB, C>) { //COVERAGE_EXCLUDE_LINE
142  std::cerr << "Forced selection to VEC gemm implementation, but not possible for this expression" << std::endl; //COVERAGE_EXCLUDE_LINE
143  return def; //COVERAGE_EXCLUDE_LINE
144  } //COVERAGE_EXCLUDE_LINE
145 
146  return forced;
147 
148  //In other cases, simply use the forced impl
149  default:
150  return forced;
151  }
152  }
153 
154  return def;
155  }
156 
157 #else
158 
164  template <typename AA, typename BB, typename C>
165  static constexpr gemm_impl select_gemm_impl() {
166  return select_default_gemm_impl<AA, BB, C>(false);
167  }
168 
169 #endif
170 
177  template <typename AA, typename BB, typename C>
178  void apply_raw(AA&& a, BB&& b, C&& c) const {
179  constexpr_select auto impl = select_gemm_impl<AA, BB, C>();
180 
181  // clang-format off
182 
183  if constexpr (is_transpose_expr<AA>&& is_transpose_expr<BB>) {
184  if constexpr_select(impl == gemm_impl::STD) {
185  inc_counter("impl:std");
186  etl::impl::standard::mm_mul(smart_forward(a), smart_forward(b), c, alpha);
187  } else if constexpr_select(impl == gemm_impl::VEC) {
188  inc_counter("impl:vec");
189  etl::impl::vec::gemm(smart_forward(a), smart_forward(b), c, alpha);
190  } else if constexpr_select(impl == gemm_impl::BLAS) {
191  inc_counter("impl:blas");
192  etl::impl::blas::gemm_tt(smart_forward(a.a()), smart_forward(b.a()), c, alpha);
193  } else if constexpr_select(impl == gemm_impl::CUBLAS) {
194  inc_counter("impl:cublas");
195  etl::impl::cublas::gemm_tt(smart_forward_gpu(a.a()), smart_forward_gpu(b.a()), c, alpha);
196  } else {
197  cpp_unreachable("invalid selection of gemm");
198  }
199  } else if constexpr (!is_transpose_expr<AA> && is_transpose_expr<BB>) {
200  if constexpr_select(impl == gemm_impl::STD) {
201  inc_counter("impl:std");
202  etl::impl::standard::mm_mul(smart_forward(a), smart_forward(b), c, alpha);
203  } else if constexpr_select(impl == gemm_impl::VEC) {
204  inc_counter("impl:vec");
205  etl::impl::vec::gemm_nt(smart_forward(a), smart_forward(b.a()), c, alpha);
206  } else if constexpr_select(impl == gemm_impl::BLAS) {
207  inc_counter("impl:blas");
208  etl::impl::blas::gemm_nt(smart_forward(a), smart_forward(b.a()), c, alpha);
209  } else if constexpr_select(impl == gemm_impl::CUBLAS) {
210  inc_counter("impl:cublas");
211  etl::impl::cublas::gemm_nt(smart_forward_gpu(a), smart_forward_gpu(b.a()), c, alpha);
212  } else {
213  cpp_unreachable("Invalid selection of gemm");
214  }
215  } else if constexpr (is_transpose_expr<AA> && !is_transpose_expr<BB>) {
216  if constexpr_select(impl == gemm_impl::STD) {
217  inc_counter("impl:std");
218  etl::impl::standard::mm_mul(smart_forward(a), smart_forward(b), c, alpha);
219  } else if constexpr_select(impl == gemm_impl::VEC) {
220  inc_counter("impl:vec");
221  etl::impl::vec::gemm_tn(smart_forward(a.a()), smart_forward(b), c, alpha);
222  } else if constexpr_select(impl == gemm_impl::BLAS) {
223  inc_counter("impl:blas");
224  etl::impl::blas::gemm_tn(smart_forward(a.a()), smart_forward(b), c, alpha);
225  } else if constexpr_select(impl == gemm_impl::CUBLAS) {
226  inc_counter("impl:cublas");
227  etl::impl::cublas::gemm_tn(smart_forward_gpu(a.a()), smart_forward_gpu(b), c, alpha);
228  } else {
229  cpp_unreachable("Invalid selection of gemm");
230  }
231  } else /*if constexpr (!is_transpose_expr<AA> && !is_transpose_expr<BB>)*/ {
232  if constexpr_select(impl == gemm_impl::STD) {
233  inc_counter("impl:std");
234  etl::impl::standard::mm_mul(smart_forward(a), smart_forward(b), c, alpha);
235  } else if constexpr_select(impl == gemm_impl::VEC) {
236  inc_counter("impl:vec");
237  etl::impl::vec::gemm(smart_forward(a), smart_forward(b), c, alpha);
238  } else if constexpr_select(impl == gemm_impl::BLAS) {
239  inc_counter("impl:blas");
240  etl::impl::blas::gemm(smart_forward(a), smart_forward(b), c, alpha);
241  } else if constexpr_select(impl == gemm_impl::CUBLAS) {
242  inc_counter("impl:cublas");
243  etl::impl::cublas::gemm(smart_forward_gpu(a), smart_forward_gpu(b), c, alpha);
244  } else {
245  cpp_unreachable("Invalid selection of gemm");
246  }
247  }
248 
249  // clang-format on
250  }
251 
256  template <etl_expr C>
257  void assign_to(C&& c) const {
258  inc_counter("temp:assign");
259 
260  auto& a = this->a();
261  auto& b = this->b();
262 
263  check(a, b, c);
264 
265  if constexpr (!Strassen) {
266  apply_raw(a, b, c);
267  } else {
268  etl::impl::standard::strassen_mm_mul(smart_forward(a), smart_forward(b), c);
269  }
270  }
271 
276  template <etl_expr L>
277  void assign_add_to(L&& lhs) const {
278  std_add_evaluate(*this, lhs);
279  }
280 
285  template <etl_expr L>
286  void assign_sub_to(L&& lhs) const {
287  std_sub_evaluate(*this, lhs);
288  }
289 
294  template <etl_expr L>
295  void assign_mul_to(L&& lhs) const {
296  std_mul_evaluate(*this, lhs);
297  }
298 
303  template <etl_expr L>
304  void assign_div_to(L&& lhs) const {
305  std_div_evaluate(*this, lhs);
306  }
307 
312  template <etl_expr L>
313  void assign_mod_to(L&& lhs) const {
314  std_mod_evaluate(*this, lhs);
315  }
316 
323  friend std::ostream& operator<<(std::ostream& os, const gemm_expr& expr) {
324  return os << expr._a << " * " << expr._b;
325  }
326 };
327 
332 template <typename A, typename B, bool Strassen>
333 struct etl_traits<etl::gemm_expr<A, B, Strassen>> {
335  using left_expr_t = std::decay_t<A>;
336  using right_expr_t = std::decay_t<B>;
340 
341  static constexpr bool is_etl = true;
342  static constexpr bool is_transformer = false;
343  static constexpr bool is_view = false;
344  static constexpr bool is_magic_view = false;
345  static constexpr bool is_fast = left_traits::is_fast && right_traits::is_fast;
346  static constexpr bool is_linear = false;
347  static constexpr bool is_thread_safe = true;
348  static constexpr bool is_value = false;
349  static constexpr bool is_direct = true;
350  static constexpr bool is_generator = false;
351  static constexpr bool is_padded = false;
352  static constexpr bool is_aligned = true;
353  static constexpr bool is_temporary = true;
354  static constexpr order storage_order = left_traits::storage_order;
355  static constexpr bool gpu_computable = is_gpu_t<value_type> && cuda_enabled;
356 
362  template <vector_mode_t V>
363  static constexpr bool vectorizable = true;
364 
369  template <size_t DD>
370  static constexpr size_t dim() {
371  return DD == 0 ? decay_traits<A>::template dim<0>() : decay_traits<B>::template dim<1>();
372  }
373 
380  static size_t dim(const expr_t& e, size_t d) {
381  if (d == 0) {
382  return etl::dim(e._a, 0);
383  } else {
384  return etl::dim(e._b, 1);
385  }
386  }
387 
393  static size_t size(const expr_t& e) {
394  return etl::dim(e._a, 0) * etl::dim(e._b, 1);
395  }
396 
401  static constexpr size_t size() {
402  return decay_traits<A>::template dim<0>() * decay_traits<B>::template dim<1>();
403  }
404 
409  static constexpr size_t dimensions() {
410  return 2;
411  }
412 
417  static constexpr int complexity() noexcept {
418  return -1;
419  }
420 };
421 
422 // Operators
423 
430 template <etl_2d A, etl_2d B>
431 auto operator*(A&& a, B&& b) {
433 }
434 
441 template <etl_2d A, etl_2d B>
442 auto mul(A&& a, B&& b) {
444 }
445 
446 // alpha * gemm operators
447 
454 template <etl_2d A, etl_2d B>
456  return gemm_expr<A, B, false>{gemm.a(), gemm.b(), alpha};
457 }
458 
465 template <etl_2d A, etl_2d B>
467  return gemm_expr<A, B, false>{gemm.a(), gemm.b(), alpha};
468 }
469 
470 // Variant with three parameters
471 
479 template <etl_2d A, etl_2d B, etl_2d C>
480 auto mul(A&& a, B&& b, C&& c) {
481  c = mul(a, b);
482  return c;
483 }
484 
485 // Strassen variants
486 
493 template <etl_2d A, etl_2d B>
494 auto strassen_mul(A&& a, B&& b) {
496 }
497 
505 template <etl_2d A, etl_2d B, etl_2d C>
506 auto strassen_mul(A&& a, B&& b, C&& c) {
507  c = mul(a, b);
508  return c;
509 }
510 
511 } //end of namespace etl
void assign_to(C &&c) const
Assign to a matrix of the same storage order.
Definition: gemm_expr.hpp:257
A transposition expression.
Definition: gemm_expr.hpp:27
gemm_impl
Enumeration describing the different matrix-matrix multiplication implementations.
Definition: gemm_impl.hpp:21
B _b
The sub expression reference.
Definition: base_temporary_expr.hpp:534
auto mul(A &&a, B &&b)
Multiply two matrices together.
Definition: gemm_expr.hpp:442
Standard implementation.
static constexpr size_t dim()
Returns the DDth dimension of the expression.
Definition: gemm_expr.hpp:370
void assign_mul_to(L &&lhs) const
Multiply the given left-hand-side expression.
Definition: gemm_expr.hpp:295
constexpr bool is_magic_view
Traits indicating if the given ETL type is a magic view expression.
Definition: traits.hpp:311
static constexpr size_t dimensions()
Returns the number of dimensions of the expression.
Definition: gemm_expr.hpp:409
void assign_mod_to(L &&lhs) const
Modulo the given left-hand-side expression.
Definition: gemm_expr.hpp:313
A _a
The sub expression reference.
Definition: base_temporary_expr.hpp:533
void assign_add_to(L &&lhs) const
Add to the given left-hand-side expression.
Definition: gemm_expr.hpp:277
static size_t size(const expr_t &e)
Returns the size of the expression.
Definition: gemm_expr.hpp:393
auto strassen_mul(A &&a, B &&b)
Multiply two matrices together using strassen.
Definition: gemm_expr.hpp:494
constexpr bool vectorize_impl
Indicates if the implementations can be automatically vectorized by ETL.
Definition: config.hpp:35
constexpr bool vec_enabled
Indicates if vectorization is available in any format.
Definition: config.hpp:220
order
Storage order of a matrix.
Definition: order.hpp:15
constexpr bool cuda_enabled
Indicates if CUDA is available.
Definition: config.hpp:94
static constexpr auto storage_order
The sub storage order.
Definition: gemm_expr.hpp:33
Abstract base class for temporary binary expression.
Definition: base_temporary_expr.hpp:529
VEC implementation.
auto operator*(LE &&lhs, RE rhs)
Builds an expression representing the multiplication of lhs and rhs (scalar)
Definition: binary_expression_builder.hpp:149
static constexpr gemm_impl select_gemm_impl()
Select the best implementation of GEMM.
Definition: gemm_expr.hpp:165
BLAS implementation.
std::add_lvalue_reference_t< B > b()
Returns the sub expression.
Definition: base_temporary_expr.hpp:593
void apply_raw(AA &&a, BB &&b, C &&c) const
Compute C = trans(A) * trans(B)
Definition: gemm_expr.hpp:178
static void check([[maybe_unused]] const A &a, [[maybe_unused]] const B &b, [[maybe_unused]] const C &c)
Assert for the validity of the matrix-matrix multiplication operation.
Definition: gemm_expr.hpp:66
constexpr bool is_fast
Traits to test if the given ETL expresion type is fast (sizes known at compile-time) ...
Definition: traits.hpp:588
Traits to get information about ETL types.
Definition: tmp.hpp:68
BLAS implementation.
Root namespace for the ETL library.
Definition: adapter.hpp:15
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50
value_t< A > value_type
The value type of the expression.
Definition: gemm_expr.hpp:339
auto dim(E &&value, size_t i) -> detail::identity_helper< E, dim_view< detail::build_identity_type< E >, D >>
Return a view representing the ith Dth dimension.
Definition: view_expression_builder.hpp:25
void assign_div_to(L &&lhs) const
Divide the given left-hand-side expression.
Definition: gemm_expr.hpp:304
gemm_expr(A a, B b)
Construct a new expression.
Definition: gemm_expr.hpp:47
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99
gemm_expr(A a, B b, value_type alpha)
Construct a new expression.
Definition: gemm_expr.hpp:55
std::conditional_t< is_etl_value< T >, const std::decay_t< T > &, std::decay_t< T > > build_type
Helper to build the type for a sub expression.
Definition: expression_helpers.hpp:24
bool cpu
Force CPU evaluation.
Definition: context.hpp:29
static constexpr size_t size()
Returns the size of the expression.
Definition: gemm_expr.hpp:401
void std_mod_evaluate(Expr &&expr, Result &&result)
Compound modulo evaluation of the expr into result.
Definition: evaluator.hpp:1271
void assign_sub_to(L &&lhs) const
Sub from the given left-hand-side expression.
Definition: gemm_expr.hpp:286
void std_mul_evaluate(Expr &&expr, Result &&result)
Compound multiply evaluation of the expr into result.
Definition: evaluator.hpp:1233
constexpr bool is_transformer
Traits indicating if the given ETL type is a transformer expression.
Definition: traits.hpp:297
std::decay_t< A > left_expr_t
The left sub expression type.
Definition: gemm_expr.hpp:335
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
static constexpr int complexity() noexcept
Estimate the complexity of computation.
Definition: gemm_expr.hpp:417
constexpr bool is_view
Traits indicating if the given ETL type is a view expression.
Definition: traits.hpp:304
std::decay_t< B > right_expr_t
The right sub expression type.
Definition: gemm_expr.hpp:336
value_t< A > value_type
The type of value of the expression.
Definition: gemm_expr.hpp:28
static constexpr bool is_fast
Indicates if T is a fast structure.
Definition: traits_base.hpp:25
friend std::ostream & operator<<(std::ostream &os, const gemm_expr &expr)
Print a representation of the expression on the given stream.
Definition: gemm_expr.hpp:323
void std_sub_evaluate(Expr &&expr, Result &&result)
Compound subtract evaluation of the expr into result.
Definition: evaluator.hpp:1214
decltype(auto) smart_forward(E &expr)
Smart forwarding for a temporary expression.
Definition: helpers.hpp:323
constexpr bool cblas_enabled
Indicates if a BLAS library is available for ETL.
Definition: config.hpp:76
constexpr bool is_thread_safe
Traits to test if the given ETL expresion type is thread safe.
Definition: traits.hpp:687
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
void std_div_evaluate(Expr &&expr, Result &&result)
Compound divide evaluation of the expr into result.
Definition: evaluator.hpp:1252
static constexpr bool gpu_computable
Indicates if the temporary expression can be directly evaluated using only GPU.
Definition: gemm_expr.hpp:39
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
static constexpr gemm_impl select_default_gemm_impl(bool no_gpu)
Select an implementation of GEMM, not considering local context.
Definition: gemm_expr.hpp:87
std::add_lvalue_reference_t< A > a()
Returns the sub expression.
Definition: base_temporary_expr.hpp:577
const value_type alpha
The alpha multiplicator.
Definition: gemm_expr.hpp:41
static size_t dim(const expr_t &e, size_t d)
Returns the dth dimension of the expression.
Definition: gemm_expr.hpp:380
void std_add_evaluate(Expr &&expr, Result &&result)
Compound add evaluation of the expr into result.
Definition: evaluator.hpp:1195