19 #include "etl/impl/cublas/transpose.hpp" 21 #if __INTEL_MKL__ == 11 && __INTEL_MKL_MINOR__ == 2 39 template <
typename A,
typename C>
41 if (
cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
50 constexpr
bool mkl_possible =
mkl_enabled && is_dma<C> && is_floating<C>;
70 template <
typename A,
typename C>
72 if (
cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
76 constexpr
bool vec_possible =
vectorize_impl && is_dma<C> && is_floating<C>;
87 constexpr
bool mkl_possible =
mkl_enabled && is_dma<C> && is_floating<C>;
91 }
else if (vec_possible) {
110 template <
typename A,
typename C>
112 if (
cublas_enabled && all_dma<A, C> && all_floating<A, C> && !no_gpu) {
117 constexpr
bool mkl_possible =
mkl_enabled && is_dma<C> && is_floating<C>;
126 #ifdef ETL_MANUAL_SELECT 134 template <
typename A,
typename C>
143 std::cerr <<
"Forced selection to CUBLAS transpose implementation, but not possible for this expression" << std::endl;
151 if (!
mkl_enabled || !all_dma<A, C> || !all_floating<A, C>) {
152 std::cerr <<
"Forced selection to MKL transpose implementation, but not possible for this expression" << std::endl;
161 std::cerr <<
"Forced selection to VEC transpose implementation, but not possible for this expression" << std::endl;
184 template <
typename A,
typename C>
186 return select_transpose_impl<A, C>(select_default_transpose_impl<A, C>(
local_context().
cpu));
197 template <
typename A,
typename C>
199 return select_transpose_impl<A, C>(select_default_oop_transpose_impl<A, C>(
local_context().
cpu));
211 template <
typename C>
213 return select_transpose_impl<C, C>(select_default_in_square_transpose_impl<C, C>(
local_context().
cpu));
226 template <
typename A,
typename C>
228 return select_default_transpose_impl<A, C>(
false);
239 template <
typename A,
typename C>
241 return select_default_oop_transpose_impl<A, C>(
false);
253 template <
typename C>
255 return select_default_in_square_transpose_impl<C, C>(
false);
268 template <
typename C>
270 constexpr_select
const auto impl = select_in_square_transpose_impl<C>();
275 etl::impl::blas::inplace_square_transpose(c);
280 etl::impl::cublas::inplace_square_transpose(c);
285 etl::impl::standard::inplace_square_transpose(c);
288 cpp_unreachable(
"Invalid transpose_impl selection");
301 template <
typename C>
303 constexpr_select
const auto impl = select_normal_transpose_impl<C, C>();
308 etl::impl::blas::inplace_rectangular_transpose(c);
313 etl::impl::cublas::inplace_rectangular_transpose(c);
318 etl::impl::standard::inplace_rectangular_transpose(c);
321 cpp_unreachable(
"Invalid transpose_impl selection");
335 template <
typename A,
typename C>
337 constexpr_select
const auto impl = select_oop_transpose_impl<A, C>();
341 c.ensure_gpu_allocated();
346 if (aa.gpu_memory() && aa.gpu_memory() == c.gpu_memory()) {
357 etl::impl::cublas::transpose(aa, c);
363 if (aa.memory_start() == c.memory_start()) {
376 etl::impl::blas::transpose(aa, c);
381 etl::impl::vec::transpose(aa, c);
386 etl::impl::standard::transpose(aa, c);
389 cpp_unreachable(
"Invalid transpose_impl selection");
static void apply(A &&a, C &&c)
Transpose a and store the results in c.
Definition: transpose.hpp:336
Standard implementation of the "transpose" algorithm.
Functor for inplace square matrix transposition.
Definition: transpose.hpp:263
Vectorized implementation of the transpose operation.
constexpr bool mkl_enabled
Indicates if the MKL library is available for ETL.
Definition: config.hpp:64
Functor for general matrix transposition.
Definition: transpose.hpp:329
constexpr bool vectorize_impl
Indicates if the implementations can be automatically vectorized by ETL.
Definition: config.hpp:35
constexpr transpose_impl select_oop_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:240
constexpr transpose_impl select_default_oop_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:71
Definition: expression_builder.hpp:699
MKL implementation of the transpose algorithm.
The Intel MKL implementation.
Functor for inplace rectangular matrix transposition.
Definition: transpose.hpp:296
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50
constexpr transpose_impl select_normal_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:227
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99
bool is_square(E &&expr)
Indicates if the given expression is a square matrix or not.
Definition: globals.hpp:30
bool cpu
Force CPU evaluation.
Definition: context.hpp:29
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr transpose_impl select_default_in_square_transpose_impl(bool no_gpu)
Select the default transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:111
constexpr transpose_impl select_default_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:40
decltype(auto) smart_forward(E &expr)
Smart forwarding for a temporary expression.
Definition: helpers.hpp:323
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:302
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
constexpr transpose_impl select_in_square_transpose_impl()
Select the transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:254
transpose_impl
Enumeration describing the different implementations of transpose.
Definition: transpose_impl.hpp:20
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:269