/*!
 * \brief Transpose one 4x4 block of a row-major matrix, element by element.
 *
 * Copies A2[(i2 + r) * M + (j2 + c)] into C2[(j2 + c) * N + (i2 + r)] for
 * every r and c in [0, 4). This is the generic scalar version of the kernel.
 *
 * \tparam V Tag type used to select the kernel variant; unused by this generic version.
 * \tparam T The element type.
 *
 * \param N Row stride of the destination, in elements.
 * \param M Row stride of the source, in elements.
 * \param A2 Pointer to the source elements.
 * \param C2 Pointer to the destination elements.
 * \param i2 Source row index of the block's first element.
 * \param j2 Source column index of the block's first element.
 */
template <typename V, typename T>
inline void transpose_block_4x4_kernel(size_t N, size_t M, const T* A2, T* C2, size_t i2, size_t j2) {
    // Source row outermost, source column innermost: same write order as the
    // hand-unrolled form, and the constant bounds let the compiler unroll it.
    for (size_t row = 0; row < 4; ++row) {
        for (size_t col = 0; col < 4; ++col) {
            C2[(j2 + col) * N + (i2 + row)] = A2[(i2 + row) * M + (j2 + col)];
        }
    }
}
// SSE variant of the 4x4 transpose kernel, selected with the sse_vec tag and
// operating on float data. It takes the same arguments as the generic kernel:
// N and M are the strides used to index C2 and A2, and (i2, j2) locate the
// block inside the source.
// NOTE(review): the `template <>` header, the statements that fill r1..r4 and
// the statements that write them back are not visible in this listing;
// presumably four rows are loaded from A2 and four are stored into C2 - confirm
// against the full source.
45 inline void transpose_block_4x4_kernel<sse_vec>(
size_t N,
size_t M,
const float* A2,
float* C2,
size_t i2,
size_t j2) {
// In-register 4x4 float transpose of the four vectors held in r1..r4.
53 _MM_TRANSPOSE4_PS(r1.value, r2.value, r3.value, r4.value);
// Blocked transpose of `a` into `c`, built on transpose_block_4x4_kernel<V>.
// NOTE(review): the function signature, the initialisation of the loop indices
// i and j, the branch conditions and every closing brace are elided from this
// listing; the comments below describe only what the visible lines show.
63 template <
typename V,
typename A,
typename C>
// Dimensions of the source expression: N is dimension 0, M is dimension 1.
65 const size_t N = etl::dim<0>(a);
66 const size_t M = etl::dim<1>(a);
// Raw pointers to the source and destination storage.
68 const auto* A2 = a.memory_start();
69 auto* C2 = c.memory_start();
// Outer tile edge (16) and the edge of the 4x4 kernel applied inside each tile.
72 constexpr
size_t block_size = 16;
73 constexpr
size_t kernel_block_size = 4;
// Transposes the rows up to ilast (the starting row is presumably ifirst -
// the initialisation of i is not visible). How the lambda is invoked is also
// not visible here; presumably it is handed to a 1D dispatcher - confirm.
76 auto batch_fun_i = [&](
const size_t ifirst,
const size_t ilast) {
77 cpp_assert(ilast <= N,
"Invalid dispatch");
// Stage 1: groups of block_size (16) complete rows.
81 for (; i + block_size - 1 < ilast; i += block_size) {
// Full 16x16 tiles, each covered by a 4x4 grid of kernel calls.
85 for (; j + block_size - 1 < M; j += block_size) {
86 for (
size_t i2 = i; i2 < i + block_size; i2 += kernel_block_size) {
87 for (
size_t j2 = j; j2 < j + block_size; j2 += kernel_block_size) {
88 transpose_block_4x4_kernel<V>(N, M, A2, C2, i2, j2);
// Columns that do not fill a 16-wide tile: strips of 4 columns, 16 rows tall.
94 for (; j + kernel_block_size - 1 < M; j += kernel_block_size) {
95 for (
size_t i2 = i; i2 < i + block_size; i2 += kernel_block_size) {
96 transpose_block_4x4_kernel<V>(N, M, A2, C2, i2, j);
// Scalar copy of the 16 rows for one column j (the enclosing loop over j is
// not visible; presumably it walks the last columns not covered by a strip).
102 for (
size_t i2 = i; i2 < i + block_size; ++i2) {
103 C2[j * N + i2] = A2[i2 * M + j];
// Stage 2: remaining rows, taken in groups of kernel_block_size (4).
108 for (; i + kernel_block_size - 1 < ilast; i += kernel_block_size) {
112 for (; j + kernel_block_size - 1 < M; j += kernel_block_size) {
113 transpose_block_4x4_kernel<V>(N, M, A2, C2, i, j);
// Scalar copy of the 4 rows for one column j (enclosing loop over j not visible).
118 for (
size_t i2 = i; i2 < i + kernel_block_size; ++i2) {
119 C2[j * N + i2] = A2[i2 * M + j];
// Stage 3: rows that are left, one element at a time.
124 for (; i < ilast; ++i) {
125 for (
size_t j = 0; j < M; ++j) {
126 C2[j * N + i] = A2[i * M + j];
// Separate plain double loop using the mirrored indexing (source stride N,
// destination stride M). NOTE(review): the condition selecting this path is
// not visible; presumably it handles the other storage order - confirm.
134 for (
size_t j = 0; j < M; ++j) {
135 for (
size_t i = 0; i < N; ++i) {
136 C2[i * M + j] = A2[j * N + i];
142 template <
typename A,
typename C>
143 void transpose([[maybe_unused]] A&& a, [[maybe_unused]] C&& c) {
144 if constexpr (all_vectorizable<vector_mode, A, C> &&
sse3_enabled) {
147 transpose_impl<sse_vec>(a, c);
150 cpp_unreachable(
"Invalid call to vec::batch_outer");
void engine_dispatch_1d(Functor &&functor, size_t first, size_t last, [[maybe_unused]] size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of a range to a functor in a parallel manner, using the global thread engine...
Definition: parallel_support.hpp:708
Definition: bias_add.hpp:15
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
auto transpose(const E &value)
Returns the transpose of the given expression.
Definition: expression_builder.hpp:528
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
const size_t threads
The number of threads ETL can use in parallel mode.
Definition: config.hpp:45
bool engine_select_parallel([[maybe_unused]] size_t n, [[maybe_unused]] size_t threshold=parallel_threshold)
Indicates if a 1D evaluation should run in parallel.
Definition: parallel_support.hpp:679
constexpr bool sse3_enabled
Indicates if SSE3 is available.
Definition: config.hpp:215
transpose_impl
Enumeration describing the different implementations of transpose.
Definition: transpose_impl.hpp:20