wichtounet/etl/transpose_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 #pragma once

 //Include the implementations
 #include "etl/impl/std/transpose.hpp"
 #include "etl/impl/vec/transpose.hpp"
 #include "etl/impl/blas/transpose.hpp"
 #include "etl/impl/cublas/transpose.hpp"

 // Flag MKL 11.2 as slow. When SLOW_MKL is defined, the default selectors
 // for the normal and out-of-place transpositions in this file never pick
 // the MKL implementation (the in-place square selector is not affected).
 #if __INTEL_MKL__ == 11 && __INTEL_MKL_MINOR__ == 2
 #define SLOW_MKL
 #endif

 namespace etl::detail {

 //TODO We should take into account parallel blas when selecting MKL transpose

 /*!
  * \brief Select the default transposition implementation to use.
  *
  * CUBLAS wins whenever it is enabled, all_dma<A, C> and all_floating<A, C>
  * hold and the GPU has not been ruled out. Otherwise, MKL is chosen when it
  * is enabled and C is DMA and floating point, unless SLOW_MKL is defined,
  * in which case STD is always returned.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \param no_gpu If true, CUBLAS is never selected
  * \return The implementation to use
  */
 template <typename A, typename C>
 constexpr transpose_impl select_default_transpose_impl(bool no_gpu) {
     if (!no_gpu && cublas_enabled && all_dma<A, C> && all_floating<A, C>) {
         return transpose_impl::CUBLAS;
     }

 #ifdef SLOW_MKL
     // With this MKL version, STD is always the faster choice
     return transpose_impl::STD;
 #else
     // MKL only needs the output to be DMA and floating point
     constexpr bool use_mkl = mkl_enabled && is_dma<C> && is_floating<C>;

     return use_mkl ? transpose_impl::MKL : transpose_impl::STD;
 #endif
 }

 /*!
  * \brief Select the default implementation for an out-of-place transposition.
  *
  * Order of preference: CUBLAS (if enabled, all_dma<A, C> and
  * all_floating<A, C> hold, and the GPU is not ruled out), then MKL (skipped
  * entirely when SLOW_MKL is defined), then VEC, and finally STD. Both MKL
  * and VEC require C to be DMA and floating point.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \param no_gpu If true, CUBLAS is never selected
  * \return The implementation to use
  */
 template <typename A, typename C>
 constexpr transpose_impl select_default_oop_transpose_impl(bool no_gpu) {
     if (!no_gpu && cublas_enabled && all_dma<A, C> && all_floating<A, C>) {
         return transpose_impl::CUBLAS;
     }

     // Requirement shared by the VEC and MKL implementations
     constexpr bool dma_floating_output = is_dma<C> && is_floating<C>;

     constexpr bool use_vec = vectorize_impl && dma_floating_output;

 #ifdef SLOW_MKL
     // With this MKL version, VEC and STD are always faster than MKL
     return use_vec ? transpose_impl::VEC : transpose_impl::STD;
 #else
     constexpr bool use_mkl = mkl_enabled && dma_floating_output;

     if (use_mkl) {
         return transpose_impl::MKL;
     }

     return use_vec ? transpose_impl::VEC : transpose_impl::STD;
 #endif
 }

 /*!
  * \brief Select the default implementation for an inplace square
  * transposition.
  *
  * CUBLAS is returned when it is enabled, all_dma<A, C> and
  * all_floating<A, C> hold and the GPU is not ruled out. Otherwise MKL is
  * returned when it is enabled and C is DMA and floating point, and STD in
  * every other case. SLOW_MKL is not consulted here.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \param no_gpu If true, CUBLAS is never selected
  * \return The implementation to use
  */
 template <typename A, typename C>
 constexpr transpose_impl select_default_in_square_transpose_impl(bool no_gpu) {
     if (!no_gpu && cublas_enabled && all_dma<A, C> && all_floating<A, C>) {
         return transpose_impl::CUBLAS;
     }

     // MKL only needs the output to be DMA and floating point
     constexpr bool use_mkl = mkl_enabled && is_dma<C> && is_floating<C>;

     return use_mkl ? transpose_impl::MKL : transpose_impl::STD;
 }

 #ifdef ETL_MANUAL_SELECT

 /*!
  * \brief Apply the thread-local forced selection, if any, on top of a
  * default selection.
  *
  * When no implementation is forced in the local context, def is returned
  * unchanged. A forced CUBLAS, MKL or VEC selection is only honoured when
  * the matching library/feature is enabled and all_dma<A, C> and
  * all_floating<A, C> hold (CUBLAS additionally requires that the context
  * does not force CPU evaluation); otherwise a message is written to
  * std::cerr and def is returned. Any other forced value is returned as is.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \param def The default implementation to fall back to
  * \return The implementation to use
  */
 template <typename A, typename C>
 transpose_impl select_transpose_impl(transpose_impl def) {
     // Nothing forced: the default stands
     if (!local_context().transpose_selector.forced) {
         return def;
     }

     const auto requested = local_context().transpose_selector.impl;

     // CUBLAS cannot always be used
     if (requested == transpose_impl::CUBLAS) {
         const bool usable = cublas_enabled && all_dma<A, C> && all_floating<A, C> && !local_context().cpu;

         if (usable) {
             return requested;
         }

         std::cerr << "Forced selection to CUBLAS transpose implementation, but not possible for this expression" << std::endl;
         return def;
     }

     // MKL cannot always be used
     if (requested == transpose_impl::MKL) {
         const bool usable = mkl_enabled && all_dma<A, C> && all_floating<A, C>;

         if (usable) {
             return requested;
         }

         std::cerr << "Forced selection to MKL transpose implementation, but not possible for this expression" << std::endl;
         return def;
     }

     // VEC cannot always be used
     if (requested == transpose_impl::VEC) {
         const bool usable = vectorize_impl && all_dma<A, C> && all_floating<A, C>;

         if (usable) {
             return requested;
         }

         std::cerr << "Forced selection to VEC transpose implementation, but not possible for this expression" << std::endl;
         return def;
     }

     // Every other implementation is accepted without further checks
     return requested;
 }

 /*!
  * \brief Select the transposition implementation to use, honouring the
  * local context (forced CPU evaluation and forced implementation).
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \return The implementation to use
  */
 template <typename A, typename C>
 transpose_impl select_normal_transpose_impl() {
     // The default is computed first, with the GPU disabled if the context forces the CPU
     const auto fallback = select_default_transpose_impl<A, C>(local_context().cpu);

     return select_transpose_impl<A, C>(fallback);
 }

 /*!
  * \brief Select the out-of-place transposition implementation to use,
  * honouring the local context (forced CPU evaluation and forced
  * implementation).
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \return The implementation to use
  */
 template <typename A, typename C>
 transpose_impl select_oop_transpose_impl() {
     // The default is computed first, with the GPU disabled if the context forces the CPU
     const auto fallback = select_default_oop_transpose_impl<A, C>(local_context().cpu);

     return select_transpose_impl<A, C>(fallback);
 }

 /*!
  * \brief Select the implementation to use for an inplace square
  * transposition, honouring the local context (forced CPU evaluation and
  * forced implementation).
  * \tparam C The type of the expression transposed in place
  * \return The implementation to use
  */
 template <typename C>
 transpose_impl select_in_square_transpose_impl() {
     // Input and output are the same expression, hence <C, C>
     const auto fallback = select_default_in_square_transpose_impl<C, C>(local_context().cpu);

     return select_transpose_impl<C, C>(fallback);
 }

 #else

 /*!
  * \brief Select the transposition implementation to use.
  *
  * Without manual selection, this is simply the default selection with the
  * GPU allowed.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \return The implementation to use
  */
 template <typename A, typename C>
 constexpr transpose_impl select_normal_transpose_impl() {
     constexpr bool no_gpu = false;

     return select_default_transpose_impl<A, C>(no_gpu);
 }

 /*!
  * \brief Select the out-of-place transposition implementation to use.
  *
  * Without manual selection, this is simply the default out-of-place
  * selection with the GPU allowed.
  *
  * \tparam A The type of the input expression
  * \tparam C The type of the output expression
  * \return The implementation to use
  */
 template <typename A, typename C>
 constexpr transpose_impl select_oop_transpose_impl() {
     constexpr bool no_gpu = false;

     return select_default_oop_transpose_impl<A, C>(no_gpu);
 }

 /*!
  * \brief Select the implementation to use for an inplace square
  * transposition.
  *
  * Without manual selection, this is simply the default inplace square
  * selection with the GPU allowed.
  *
  * \tparam C The type of the expression transposed in place
  * \return The implementation to use
  */
 template <typename C>
 constexpr transpose_impl select_in_square_transpose_impl() {
     constexpr bool no_gpu = false;

     // Input and output are the same expression, hence <C, C>
     return select_default_in_square_transpose_impl<C, C>(no_gpu);
 }

 #endif

 /*!
  * \brief Functor for inplace square matrix transposition.
  */
 struct inplace_square_transpose {
     /*!
      * \brief Transpose c inplace.
      *
      * The implementation comes from select_in_square_transpose_impl and the
      * matching "impl:*" counter is incremented before dispatching.
      *
      * NOTE(review): there is no VEC branch here, so a forced VEC selection
      * under ETL_MANUAL_SELECT looks like it would reach cpp_unreachable -
      * confirm whether that is intended.
      *
      * \param c The matrix to transpose in place
      */
     template <typename C>
     static void apply(C&& c) {
         constexpr_select const auto selected = select_in_square_transpose_impl<C>();

         if constexpr_select (selected == transpose_impl::MKL) {
             inc_counter("impl:mkl");
             etl::impl::blas::inplace_square_transpose(c);
         } else if constexpr_select (selected == transpose_impl::CUBLAS) {
             inc_counter("impl:cublas");
             etl::impl::cublas::inplace_square_transpose(c);
         } else if constexpr_select (selected == transpose_impl::STD) {
             inc_counter("impl:std");
             etl::impl::standard::inplace_square_transpose(c);
         } else {
             cpp_unreachable("Invalid transpose_impl selection");
         }
     }
 };

 /*!
  * \brief Functor for inplace rectangular matrix transposition.
  */
 struct inplace_rectangular_transpose {
     /*!
      * \brief Transpose c inplace.
      *
      * The implementation comes from select_normal_transpose_impl<C, C> and
      * the matching "impl:*" counter is incremented before dispatching.
      *
      * NOTE(review): there is no VEC branch here, so a forced VEC selection
      * under ETL_MANUAL_SELECT looks like it would reach cpp_unreachable -
      * confirm whether that is intended.
      *
      * \param c The matrix to transpose in place
      */
     template <typename C>
     static void apply(C&& c) {
         constexpr_select const auto selected = select_normal_transpose_impl<C, C>();

         if constexpr_select (selected == transpose_impl::MKL) {
             inc_counter("impl:mkl");
             etl::impl::blas::inplace_rectangular_transpose(c);
         } else if constexpr_select (selected == transpose_impl::CUBLAS) {
             inc_counter("impl:cublas");
             etl::impl::cublas::inplace_rectangular_transpose(c);
         } else if constexpr_select (selected == transpose_impl::STD) {
             inc_counter("impl:std");
             etl::impl::standard::inplace_rectangular_transpose(c);
         } else {
             cpp_unreachable("Invalid transpose_impl selection");
         }
     }
 };

 /*!
  * \brief Functor for general matrix transposition.
  */
 struct transpose {
     /*!
      * \brief Transpose a and store the results in c.
      *
      * The implementation comes from select_oop_transpose_impl<A, C>. When
      * the input and the output share the same memory (GPU memory on the
      * CUBLAS path, CPU memory otherwise), the work is handed over to the
      * inplace functors instead.
      *
      * \param a The expression to transpose
      * \param c The expression receiving the result
      */
     template <typename A, typename C>
     static void apply(A&& a, C&& c) {
         constexpr_select const auto selected = select_oop_transpose_impl<A, C>();

         if constexpr_select (selected == transpose_impl::CUBLAS) {
             c.ensure_gpu_allocated();

             decltype(auto) gpu_input = smart_forward_gpu(a);

             // Some implementations do not support inplace operation unless
             // told explicitly, so aliasing GPU buffers are detected here
             if (gpu_input.gpu_memory() && gpu_input.gpu_memory() == c.gpu_memory()) {
                 dispatch_inplace(c);
                 return;
             }

             inc_counter("impl:cublas");
             etl::impl::cublas::transpose(gpu_input, c);
         } else {
             decltype(auto) input = smart_forward(a);

             // Some implementations do not support inplace operation unless
             // told explicitly, so aliasing CPU buffers are detected here
             if (input.memory_start() == c.memory_start()) {
                 dispatch_inplace(c);
                 return;
             }

             if constexpr_select (selected == transpose_impl::MKL) {
                 inc_counter("impl:mkl");
                 etl::impl::blas::transpose(input, c);
             } else if constexpr_select (selected == transpose_impl::VEC) {
                 inc_counter("impl:vec");
                 etl::impl::vec::transpose(input, c);
             } else if constexpr_select (selected == transpose_impl::STD) {
                 inc_counter("impl:std");
                 etl::impl::standard::transpose(input, c);
             } else {
                 cpp_unreachable("Invalid transpose_impl selection");
             }
         }
     }

 private:
     /*!
      * \brief Transpose target in place, picking the square or the
      * rectangular functor according to is_square(target).
      * \param target The expression to transpose in place
      */
     template <typename T>
     static void dispatch_inplace(T& target) {
         if (is_square(target)) {
             inplace_square_transpose::apply(target);
         } else {
             inplace_rectangular_transpose::apply(target);
         }
     }
 };

 } //end of namespace etl::detail
etl::detail::transpose::apply
static void apply(A &&a, C &&c)
Transpose a and store the results in c.
Definition: transpose.hpp:336

transpose.hpp
Standard implementation of the "transpose" algorithm.

etl::detail::inplace_square_transpose
Functor for inplace square matrix transposition.
Definition: transpose.hpp:263

transpose.hpp
Vectorized implementation of the transpose operation.

etl::mkl_enabled
constexpr bool mkl_enabled
Indicates if the MKL library is available for ETL.
Definition: config.hpp:64

etl::batch_softmax_impl::STD
Standard implementation.

etl::detail::transpose
Functor for general matrix transposition.
Definition: transpose.hpp:329

etl::vectorize_impl
constexpr bool vectorize_impl
Indicates if the implementations can be automatically vectorized by ETL.
Definition: config.hpp:35

etl::detail::select_oop_transpose_impl
constexpr transpose_impl select_oop_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:240

etl::detail::select_default_oop_transpose_impl
constexpr transpose_impl select_default_oop_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:71

etl::bias_add_impl::VEC
VEC implementation.

etl::detail
Definition: expression_builder.hpp:699

transpose.hpp
MKL implementation of the transpose algorithm.

etl::dot_impl::CUBLAS
BLAS implementation.

etl::fft_impl::MKL
The Intel MKL implementation.

etl::detail::inplace_rectangular_transpose
Functor for inplace rectangular matrix transposition.
Definition: transpose.hpp:296

etl::local_context
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50

etl::detail::select_normal_transpose_impl
constexpr transpose_impl select_normal_transpose_impl()
Select the transposition implementation to use.
Definition: transpose.hpp:227

etl::cublas_enabled
constexpr bool cublas_enabled
Indicates if the NVIDIA CUBLAS library is available for ETL.
Definition: config.hpp:99

etl::is_square
bool is_square(E &&expr)
Indicates if the given expression is a square matrix or not.
Definition: globals.hpp:30

etl::context::cpu
bool cpu
Force CPU evaluation.
Definition: context.hpp:29

etl::smart_forward_gpu
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343

etl::detail::select_default_in_square_transpose_impl
constexpr transpose_impl select_default_in_square_transpose_impl(bool no_gpu)
Select the default transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:111

etl::detail::select_default_transpose_impl
constexpr transpose_impl select_default_transpose_impl(bool no_gpu)
Select the default transposition implementation to use.
Definition: transpose.hpp:40

etl::smart_forward
decltype(auto) smart_forward(E &expr)
Smart forwarding for a temporary expression.
Definition: helpers.hpp:323

etl::detail::inplace_rectangular_transpose::apply
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:302

etl::inc_counter
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25

etl::detail::select_in_square_transpose_impl
constexpr transpose_impl select_in_square_transpose_impl()
Select the transposition implementation to use for an inplace square transposition operation...
Definition: transpose.hpp:254

etl::transpose_impl
transpose_impl
Enumeration describing the different implementations of transpose.
Definition: transpose_impl.hpp:20

etl::detail::inplace_square_transpose::apply
static void apply(C &&c)
Transpose c inplace.
Definition: transpose.hpp:269