Expression Templates Library (ETL)
transpose.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 //Include the implementations
19 #include "etl/impl/cublas/transpose.hpp"
20 
// Flag MKL 11.2 as "slow". When SLOW_MKL is defined, the default selectors
// below that test it never return MKL and fall back to STD (or VEC) instead.
21 #if __INTEL_MKL__ == 11 && __INTEL_MKL_MINOR__ == 2
22 #define SLOW_MKL
23 #endif
24 
25 namespace etl::detail {
26 
27 //TODO We should take into account parallel blas when selecting MKL transpose
28 
39 template <typename A, typename C>
41  if (cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
43  }
44 
45 #ifdef SLOW_MKL
46  // STD is always faster than MKL for out-of-place transpose
47  return transpose_impl::STD;
48 #else
49  // Condition to use MKL
50  constexpr bool mkl_possible = mkl_enabled && is_dma<C> && is_floating<C>;
51 
52  if (mkl_possible) {
53  return transpose_impl::MKL;
54  } else {
55  return transpose_impl::STD;
56  }
57 #endif
58 }
59 
70 template <typename A, typename C>
72  if (cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
74  }
75 
76  constexpr bool vec_possible = vectorize_impl && is_dma<C> && is_floating<C>;
77 
78 #ifdef SLOW_MKL
79  // VEC and STD is always faster than MKL for out-of-place transpose
80  if (vec_possible) {
81  return transpose_impl::VEC;
82  } else {
83  return transpose_impl::STD;
84  }
85 #else
86  // Condition to use MKL
87  constexpr bool mkl_possible = mkl_enabled && is_dma<C> && is_floating<C>;
88 
89  if (mkl_possible) {
90  return transpose_impl::MKL;
91  } else if (vec_possible) {
92  return transpose_impl::VEC;
93  } else {
94  return transpose_impl::STD;
95  }
96 #endif
97 }
98 
110 template <typename A, typename C>
112  if (cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
113  return transpose_impl::CUBLAS;
114  }
115 
116  // Condition to use MKL
117  constexpr bool mkl_possible = mkl_enabled && is_dma<C> && is_floating<C>;
118 
119  if (mkl_possible) {
120  return transpose_impl::MKL;
121  } else {
122  return transpose_impl::STD;
123  }
124 }
125 
126 #ifdef ETL_MANUAL_SELECT
127 
134 template <typename A, typename C>
135 transpose_impl select_transpose_impl(transpose_impl def) {
136  if (local_context().transpose_selector.forced) {
137  auto forced = local_context().transpose_selector.impl;
138 
139  switch (forced) {
140  //CUBLAS cannot always be used
142  if (!cublas_enabled || !all_dma<A, C> || !all_floating<A, C> || local_context().cpu) {
143  std::cerr << "Forced selection to CUBLAS transpose implementation, but not possible for this expression" << std::endl;
144  return def;
145  }
146 
147  return forced;
148 
149  //MKL cannot always be used
150  case transpose_impl::MKL:
151  if (!mkl_enabled || !all_dma<A, C> || !all_floating<A, C>) {
152  std::cerr << "Forced selection to MKL transpose implementation, but not possible for this expression" << std::endl;
153  return def;
154  }
155 
156  return forced;
157 
158  //VEC cannot always be used
159  case transpose_impl::VEC:
160  if (!vectorize_impl || !all_dma<A, C> || !all_floating<A, C>) {
161  std::cerr << "Forced selection to VEC transpose implementation, but not possible for this expression" << std::endl;
162  return def;
163  }
164 
165  return forced;
166 
167  //In other cases, simply use the forced impl
168  default:
169  return forced;
170  }
171  }
172 
173  return def;
174 }
175 
// ETL_MANUAL_SELECT variant: computes the default choice with
// local_context().cpu passed as the no_gpu argument, then hands it to
// select_transpose_impl, which may replace it with a forced implementation.
// NOTE(review): the signature line (listing line 185) is missing from this
// extract; presumably this is select_normal_transpose_impl<A, C>(), mirroring
// the #else branch -- confirm against the upstream file.
184 template <typename A, typename C>
186  return select_transpose_impl<A, C>(select_default_transpose_impl<A, C>(local_context().cpu));
187 }
188 
// ETL_MANUAL_SELECT variant: computes the default out-of-place choice with
// local_context().cpu passed as the no_gpu argument, then hands it to
// select_transpose_impl, which may replace it with a forced implementation.
// NOTE(review): the signature line (listing line 198) is missing from this
// extract; presumably this is select_oop_transpose_impl<A, C>(), mirroring
// the #else branch -- confirm against the upstream file.
197 template <typename A, typename C>
199  return select_transpose_impl<A, C>(select_default_oop_transpose_impl<A, C>(local_context().cpu));
200 }
201 
// ETL_MANUAL_SELECT variant for in-place square transposition: computes the
// default choice with local_context().cpu passed as the no_gpu argument, then
// hands it to select_transpose_impl, using C for both template arguments.
// NOTE(review): the signature line (listing line 212) is missing from this
// extract; presumably this is select_in_square_transpose_impl<C>(), mirroring
// the #else branch -- confirm against the upstream file.
211 template <typename C>
213  return select_transpose_impl<C, C>(select_default_in_square_transpose_impl<C, C>(local_context().cpu));
214 }
215 
216 #else
217 
226 template <typename A, typename C>
228  return select_default_transpose_impl<A, C>(false);
229 }
230 
239 template <typename A, typename C>
241  return select_default_oop_transpose_impl<A, C>(false);
242 }
243 
253 template <typename C>
255  return select_default_in_square_transpose_impl<C, C>(false);
256 }
257 
258 #endif
259 
// In-place transposition of a square matrix: apply() picks the implementation
// with select_in_square_transpose_impl<C>() and forwards c to the matching
// inplace_square_transpose routine (blas, cublas or standard).
// NOTE(review): the enclosing struct header (listing line 263) is missing from
// this extract; the closing "};" at the end of this block belongs to it.
268  template <typename C>
269  static void apply(C&& c) {
270  constexpr_select const auto impl = select_in_square_transpose_impl<C>();
271 
// Each branch increments a counter named after the implementation, then dispatches.
272  if
273  constexpr_select(impl == transpose_impl::MKL) {
274  inc_counter("impl:mkl");
275  etl::impl::blas::inplace_square_transpose(c);
276  }
277  else if
278  constexpr_select(impl == transpose_impl::CUBLAS) {
279  inc_counter("impl:cublas");
280  etl::impl::cublas::inplace_square_transpose(c);
281  }
282  else if
283  constexpr_select(impl == transpose_impl::STD) {
284  inc_counter("impl:std");
285  etl::impl::standard::inplace_square_transpose(c);
286  }
287  else {
// Any other value (e.g. VEC) is not handled for in-place square transposition.
288  cpp_unreachable("Invalid transpose_impl selection");
289  }
290  }
291 };
292 
// In-place transposition of a rectangular matrix: apply() picks the
// implementation with select_normal_transpose_impl<C, C>() and forwards c to
// the matching inplace_rectangular_transpose routine (blas, cublas or standard).
// NOTE(review): the enclosing struct header (listing line 296) is missing from
// this extract; the closing "};" at the end of this block belongs to it.
301  template <typename C>
302  static void apply(C&& c) {
303  constexpr_select const auto impl = select_normal_transpose_impl<C, C>();
304 
// Each branch increments a counter named after the implementation, then dispatches.
305  if
306  constexpr_select(impl == transpose_impl::MKL) {
307  inc_counter("impl:mkl");
308  etl::impl::blas::inplace_rectangular_transpose(c);
309  }
310  else if
311  constexpr_select(impl == transpose_impl::CUBLAS) {
312  inc_counter("impl:cublas");
313  etl::impl::cublas::inplace_rectangular_transpose(c);
314  }
315  else if
316  constexpr_select(impl == transpose_impl::STD) {
317  inc_counter("impl:std");
318  etl::impl::standard::inplace_rectangular_transpose(c);
319  }
320  else {
// Any other value (e.g. VEC) is not handled for in-place rectangular transposition.
321  cpp_unreachable("Invalid transpose_impl selection");
322  }
323  }
324 };
325 
// Functor for general matrix transposition: transposes a and stores the result in c.
// The implementation is chosen with select_oop_transpose_impl<A, C>(). Both paths
// first check whether a and c share the same memory and return early in that case.
329 struct transpose {
// a: expression to transpose; c: expression receiving the result.
335  template <typename A, typename C>
336  static void apply(A&& a, C&& c) {
337  constexpr_select const auto impl = select_oop_transpose_impl<A, C>();
338 
339  if
340  constexpr_select(impl == transpose_impl::CUBLAS) {
341  c.ensure_gpu_allocated();
342 
// Forward a for a computation that will run on the GPU.
343  decltype(auto) aa = smart_forward_gpu(a);
344 
345  // Detect inplace (some implementations do not support inplace if not told explicitly)
346  if (aa.gpu_memory() && aa.gpu_memory() == c.gpu_memory()) {
// NOTE(review): the bodies of both branches (listing lines 348 and 350) are
// missing from this extract; presumably they dispatch to the in-place square /
// rectangular functors -- confirm against the upstream file.
347  if (is_square(c)) {
349  } else {
351  }
352 
353  return;
354  }
355 
356  inc_counter("impl:cublas");
357  etl::impl::cublas::transpose(aa, c);
358  }
359  else {
360  decltype(auto) aa = smart_forward(a);
361 
362  // Detect inplace (some implementations do not support inplace if not told explicitly)
363  if (aa.memory_start() == c.memory_start()) {
// NOTE(review): the bodies of both branches (listing lines 365 and 367) are
// missing from this extract; presumably they dispatch to the in-place square /
// rectangular functors -- confirm against the upstream file.
364  if (is_square(c)) {
366  } else {
368  }
369 
370  return;
371  }
372 
// Out-of-place CPU path: each branch increments a counter, then dispatches.
373  if
374  constexpr_select(impl == transpose_impl::MKL) {
375  inc_counter("impl:mkl");
376  etl::impl::blas::transpose(aa, c);
377  }
378  else if
379  constexpr_select(impl == transpose_impl::VEC) {
380  inc_counter("impl:vec");
381  etl::impl::vec::transpose(aa, c);
382  }
383  else if
384  constexpr_select(impl == transpose_impl::STD) {
385  inc_counter("impl:std");
386  etl::impl::standard::transpose(aa, c);
387  }
388  else {
389  cpp_unreachable("Invalid transpose_impl selection");
390  }
391  }
392  }
393 };
394 
395 } //end of namespace etl::detail
static void apply(A &&a, C &&c)
Transpose a and store the results in c.
Definition: transpose.hpp:336
Standard implementation of the "transpose" algorithm.
Functor for inplace square matrix transposition.
Definition: transpose.hpp:263
Vectorized implementation of the transpose operation.
constexpr bool mkl_enabled
Indicates if the MKL library is available for ETL.
Definition: config.hpp:64
Standard implementation.
Functor for general matrix transposition.
Definition: transpose.hpp:329
constexpr bool vectorize_impl
Indicates if the implementations can be automatically vectorized by ETL.
Definition: config.hpp:35
constexpr transpose_impl select_oop_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:240
constexpr transpose_impl select_default_oop_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:71
VEC implementation.
Definition: expression_builder.hpp:699
MKL implementation of the transpose algorithm.
BLAS implementation.
The Intel MKL implementation.
Functor for inplace rectangular matrix transposition.
Definition: transpose.hpp:296
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50
constexpr transpose_impl select_normal_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:227
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99
bool is_square(E &&expr)
Indicates if the given expression is a square matrix or not.
Definition: globals.hpp:30
bool cpu
Force CPU evaluation.
Definition: context.hpp:29
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr transpose_impl select_default_in_square_transpose_impl(bool no_gpu)
Select the default transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:111
constexpr transpose_impl select_default_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:40
decltype(auto) smart_forward(E &expr)
Smart forwarding for a temporary expression.
Definition: helpers.hpp:323
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:302
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
constexpr transpose_impl select_in_square_transpose_impl()
Select the transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:254
transpose_impl
Enumeration describing the different implementations of transpose.
Definition: transpose_impl.hpp:20
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:269