Expression Templates Library (ETL)
avx_vectorization.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 #ifdef __AVX__
16 
17 #include <immintrin.h>
18 #include <emmintrin.h>
19 #include <xmmintrin.h>
20 
21 #include "etl/inline.hpp"
22 #include "etl/avx_exp.hpp"
23 
24 #ifdef VECT_DEBUG
25 #include <iostream>
26 #endif
27 
28 #define ETL_INLINE_VEC_VOID ETL_STATIC_INLINE(void)
29 #define ETL_INLINE_VEC_256 ETL_STATIC_INLINE(__m256)
30 #define ETL_INLINE_VEC_256D ETL_STATIC_INLINE(__m256d)
31 #define ETL_OUT_VEC_256 ETL_OUT_INLINE(__m256)
32 #define ETL_OUT_VEC_256D ETL_OUT_INLINE(__m256d)
33 
34 namespace etl {
35 
/*!
 * \brief The AVX vector of 8 single-precision floats
 */
using avx_simd_float = simd_pack<vector_mode_t::AVX, float, __m256>;

/*!
 * \brief The AVX vector of 4 double-precision floats
 */
using avx_simd_double = simd_pack<vector_mode_t::AVX, double, __m256d>;

/*!
 * \brief The AVX vector of 4 single-precision complex numbers
 * (T is either std::complex<float> or etl::complex<float>)
 */
template <typename T>
using avx_simd_complex_float = simd_pack<vector_mode_t::AVX, T, __m256>;

/*!
 * \brief The AVX vector of 2 double-precision complex numbers
 * (T is either std::complex<double> or etl::complex<double>)
 */
template <typename T>
using avx_simd_complex_double = simd_pack<vector_mode_t::AVX, T, __m256d>;

/*!
 * \brief The AVX vector of 32 8-bit signed integers
 */
using avx_simd_byte = simd_pack<vector_mode_t::AVX, int8_t, __m256i>;

/*!
 * \brief The AVX vector of 16 16-bit signed integers
 */
using avx_simd_short = simd_pack<vector_mode_t::AVX, int16_t, __m256i>;

/*!
 * \brief The AVX vector of 8 32-bit signed integers
 */
using avx_simd_int = simd_pack<vector_mode_t::AVX, int32_t, __m256i>;

/*!
 * \brief The AVX vector of 4 64-bit signed integers
 */
using avx_simd_long = simd_pack<vector_mode_t::AVX, int64_t, __m256i>;
77 
/*!
 * \brief Vectorization traits for AVX for the given type.
 *
 * The primary template marks a type as not vectorizable: one element
 * per "vector" and the type's natural alignment. Each vectorizable
 * type provides a specialization below.
 */
template <typename T>
struct avx_intrinsic_traits {
    static constexpr bool vectorizable = false;      // not vectorizable unless specialized
    static constexpr size_t size = 1;                // number of elements per vector
    static constexpr size_t alignment = alignof(T);  // required memory alignment

    using intrinsic_type = T; // the intrinsic (vector) type used for T
};
89 
/*!
 * \brief AVX traits for float: 8 x 32-bit floats per __m256
 */
template <>
struct avx_intrinsic_traits<float> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 8;
    static constexpr size_t alignment = 32; // 256-bit alignment

    using intrinsic_type = avx_simd_float;
};

/*!
 * \brief AVX traits for double: 4 x 64-bit doubles per __m256d
 */
template <>
struct avx_intrinsic_traits<double> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 4;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_double;
};

/*!
 * \brief AVX traits for std::complex<float>: 4 complex (8 floats) per __m256
 */
template <>
struct avx_intrinsic_traits<std::complex<float>> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 4;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_complex_float<std::complex<float>>;
};

/*!
 * \brief AVX traits for std::complex<double>: 2 complex (4 doubles) per __m256d
 */
template <>
struct avx_intrinsic_traits<std::complex<double>> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 2;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_complex_double<std::complex<double>>;
};

/*!
 * \brief AVX traits for etl::complex<float>: 4 complex (8 floats) per __m256
 */
template <>
struct avx_intrinsic_traits<etl::complex<float>> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 4;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_complex_float<etl::complex<float>>;
};

/*!
 * \brief AVX traits for etl::complex<double>: 2 complex (4 doubles) per __m256d
 */
template <>
struct avx_intrinsic_traits<etl::complex<double>> {
    static constexpr bool vectorizable = true;
    static constexpr size_t size = 2;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_complex_double<etl::complex<double>>;
};

/*!
 * \brief AVX traits for int8_t: 32 bytes per __m256i.
 * Integer operations need AVX2, hence vectorizable only if avx2_enabled.
 */
template <>
struct avx_intrinsic_traits<int8_t> {
    static constexpr bool vectorizable = avx2_enabled;
    static constexpr size_t size = 32;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_byte;
};

/*!
 * \brief AVX traits for int16_t: 16 shorts per __m256i (requires AVX2)
 */
template <>
struct avx_intrinsic_traits<int16_t> {
    static constexpr bool vectorizable = avx2_enabled;
    static constexpr size_t size = 16;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_short;
};

/*!
 * \brief AVX traits for int32_t: 8 ints per __m256i (requires AVX2)
 */
template <>
struct avx_intrinsic_traits<int32_t> {
    static constexpr bool vectorizable = avx2_enabled;
    static constexpr size_t size = 8;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_int;
};

/*!
 * \brief AVX traits for int64_t: 4 longs per __m256i (requires AVX2)
 */
template <>
struct avx_intrinsic_traits<int64_t> {
    static constexpr bool vectorizable = avx2_enabled;
    static constexpr size_t size = 4;
    static constexpr size_t alignment = 32;

    using intrinsic_type = avx_simd_long;
};
209 
213 struct avx_vec {
217  template <typename T>
218  using traits = avx_intrinsic_traits<T>;
219 
223  template <typename T>
224  using vec_type = typename traits<T>::intrinsic_type;
225 
226 #ifdef VEC_DEBUG
227 
231  template <typename T>
232  static std::string debug_d(T value) {
233  union test {
234  __m256d vec; // a data field, maybe a register, maybe not
235  double array[4];
236  test(__m256d vec) : vec(vec) {}
237  };
238 
239  test u_value = value;
240  std::cout << "[" << u_value.array[0] << "," << u_value.array[1] << "," << u_value.array[2] << "," << u_value.array[3] << "]" << std::endl;
241  }
242 
246  template <typename T>
247  static std::string debug_s(T value) {
248  union test {
249  __m256 vec; // a data field, maybe a register, maybe not
250  float array[8];
251  test(__m256 vec) : vec(vec) {}
252  };
253 
254  test u_value = value;
255  std::cout << "[" << u_value.array[0] << "," << u_value.array[1] << "," << u_value.array[2] << "," << u_value.array[3] << "," << u_value.array[4] << ","
256  << u_value.array[5] << "," << u_value.array[6] << "," << u_value.array[7] << "]" << std::endl;
257  }
258 
259 #else
260 
264  template <typename T>
265  static std::string debug_d(T) {
266  return "";
267  }
268 
272  template <typename T>
273  static std::string debug_s(T) {
274  return "";
275  }
276 
277 #endif
278 
279 #ifdef __AVX2__
280 
284  ETL_STATIC_INLINE(void) storeu(int8_t* memory, avx_simd_byte value) {
285  _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
286  }
287 
292  ETL_STATIC_INLINE(void) storeu(int16_t* memory, avx_simd_short value) {
293  _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
294  }
295 
300  ETL_STATIC_INLINE(void) storeu(int32_t* memory, avx_simd_int value) {
301  _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
302  }
303 
308  ETL_STATIC_INLINE(void) storeu(int64_t* memory, avx_simd_long value) {
309  _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
310  }
311 #endif
312 
317  ETL_STATIC_INLINE(void) storeu(float* memory, avx_simd_float value) {
318  _mm256_storeu_ps(memory, value.value);
319  }
320 
325  ETL_STATIC_INLINE(void) storeu(double* memory, avx_simd_double value) {
326  _mm256_storeu_pd(memory, value.value);
327  }
328 
333  ETL_STATIC_INLINE(void) storeu(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
334  _mm256_storeu_ps(reinterpret_cast<float*>(memory), value.value);
335  }
336 
341  ETL_STATIC_INLINE(void) storeu(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
342  _mm256_storeu_pd(reinterpret_cast<double*>(memory), value.value);
343  }
344 
349  ETL_STATIC_INLINE(void) storeu(etl::complex<float>* memory, avx_simd_complex_float<etl::complex<float>> value) {
350  _mm256_storeu_ps(reinterpret_cast<float*>(memory), value.value);
351  }
352 
357  ETL_STATIC_INLINE(void) storeu(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
358  _mm256_storeu_pd(reinterpret_cast<double*>(memory), value.value);
359  }
360 
361 #ifdef __AVX2__
362 
366  ETL_STATIC_INLINE(void) stream(int8_t* memory, avx_simd_byte value) {
367  _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
368  }
369 
374  ETL_STATIC_INLINE(void) stream(int16_t* memory, avx_simd_short value) {
375  _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
376  }
377 
382  ETL_STATIC_INLINE(void) stream(int32_t* memory, avx_simd_int value) {
383  _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
384  }
385 
390  ETL_STATIC_INLINE(void) stream(int64_t* memory, avx_simd_long value) {
391  _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
392  }
393 #endif
394 
399  ETL_STATIC_INLINE(void) stream(float* memory, avx_simd_float value) {
400  _mm256_stream_ps(memory, value.value);
401  }
402 
407  ETL_STATIC_INLINE(void) stream(double* memory, avx_simd_double value) {
408  _mm256_stream_pd(memory, value.value);
409  }
410 
415  ETL_STATIC_INLINE(void) stream(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
416  _mm256_stream_ps(reinterpret_cast<float*>(memory), value.value);
417  }
418 
423  ETL_STATIC_INLINE(void) stream(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
424  _mm256_stream_pd(reinterpret_cast<double*>(memory), value.value);
425  }
426 
431  ETL_STATIC_INLINE(void) stream(etl::complex<float>* memory, avx_simd_complex_float<etl::complex<float>> value) {
432  _mm256_stream_ps(reinterpret_cast<float*>(memory), value.value);
433  }
434 
439  ETL_STATIC_INLINE(void) stream(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
440  _mm256_stream_pd(reinterpret_cast<double*>(memory), value.value);
441  }
442 
443 #ifdef __AVX2__
444 
448  ETL_STATIC_INLINE(void) store(int8_t* memory, avx_simd_byte value) {
449  _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
450  }
451 
456  ETL_STATIC_INLINE(void) store(int16_t* memory, avx_simd_short value) {
457  _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
458  }
459 
464  ETL_STATIC_INLINE(void) store(int32_t* memory, avx_simd_int value) {
465  _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
466  }
467 
472  ETL_STATIC_INLINE(void) store(int64_t* memory, avx_simd_long value) {
473  _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
474  }
475 #endif
476 
481  ETL_STATIC_INLINE(void) store(float* memory, avx_simd_float value) {
482  _mm256_store_ps(memory, value.value);
483  }
484 
489  ETL_STATIC_INLINE(void) store(double* memory, avx_simd_double value) {
490  _mm256_store_pd(memory, value.value);
491  }
492 
497  ETL_STATIC_INLINE(void) store(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
498  _mm256_store_ps(reinterpret_cast<float*>(memory), value.value);
499  }
500 
505  ETL_STATIC_INLINE(void) store(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
506  _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
507  }
508 
513  ETL_STATIC_INLINE(void) store(etl::complex<float>* memory, avx_simd_complex_float<etl::complex<float>> value) {
514  _mm256_store_ps(reinterpret_cast<float*>(memory), value.value);
515  }
516 
521  ETL_STATIC_INLINE(void) store(etl::complex<double>* memory, avx_simd_complex_double<etl::complex<double>> value) {
522  _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
523  }
524 
528  template <typename T>
529  ETL_TMP_INLINE(typename avx_intrinsic_traits<T>::intrinsic_type)
530  zero();
531 
532 #ifdef __AVX2__
533 
536  ETL_STATIC_INLINE(avx_simd_byte) load(const int8_t* memory) {
537  return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
538  }
539 
543  ETL_STATIC_INLINE(avx_simd_short) load(const int16_t* memory) {
544  return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
545  }
546 
550  ETL_STATIC_INLINE(avx_simd_int) load(const int32_t* memory) {
551  return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
552  }
553 
557  ETL_STATIC_INLINE(avx_simd_long) load(const int64_t* memory) {
558  return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
559  }
560 #endif
561 
565  ETL_STATIC_INLINE(avx_simd_float) load(const float* memory) {
566  return _mm256_load_ps(memory);
567  }
568 
572  ETL_STATIC_INLINE(avx_simd_double) load(const double* memory) {
573  return _mm256_load_pd(memory);
574  }
575 
579  ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>) load(const std::complex<float>* memory) {
580  return _mm256_load_ps(reinterpret_cast<const float*>(memory));
581  }
582 
586  ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>) load(const std::complex<double>* memory) {
587  return _mm256_load_pd(reinterpret_cast<const double*>(memory));
588  }
589 
593  ETL_STATIC_INLINE(avx_simd_complex_float<etl::complex<float>>) load(const etl::complex<float>* memory) {
594  return _mm256_load_ps(reinterpret_cast<const float*>(memory));
595  }
596 
600  ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) load(const etl::complex<double>* memory) {
601  return _mm256_load_pd(reinterpret_cast<const double*>(memory));
602  }
603 
604 #ifdef __AVX2__
605 
608  ETL_STATIC_INLINE(avx_simd_byte) loadu(const int8_t* memory) {
609  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
610  }
611 
615  ETL_STATIC_INLINE(avx_simd_short) loadu(const int16_t* memory) {
616  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
617  }
618 
622  ETL_STATIC_INLINE(avx_simd_int) loadu(const int32_t* memory) {
623  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
624  }
625 
629  ETL_STATIC_INLINE(avx_simd_long) loadu(const int64_t* memory) {
630  return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
631  }
632 #endif
633 
637  ETL_STATIC_INLINE(avx_simd_float) loadu(const float* memory) {
638  return _mm256_loadu_ps(memory);
639  }
640 
644  ETL_STATIC_INLINE(avx_simd_double) loadu(const double* memory) {
645  return _mm256_loadu_pd(memory);
646  }
647 
651  ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>) loadu(const std::complex<float>* memory) {
652  return _mm256_loadu_ps(reinterpret_cast<const float*>(memory));
653  }
654 
658  ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>) loadu(const std::complex<double>* memory) {
659  return _mm256_loadu_pd(reinterpret_cast<const double*>(memory));
660  }
661 
665  ETL_STATIC_INLINE(avx_simd_complex_float<etl::complex<float>>) loadu(const etl::complex<float>* memory) {
666  return _mm256_loadu_ps(reinterpret_cast<const float*>(memory));
667  }
668 
672  ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) loadu(const etl::complex<double>* memory) {
673  return _mm256_loadu_pd(reinterpret_cast<const double*>(memory));
674  }
675 
676 #ifdef __AVX2__
677 
680  ETL_STATIC_INLINE(avx_simd_byte) set(int8_t value) {
681  return _mm256_set1_epi8(value);
682  }
683 
687  ETL_STATIC_INLINE(avx_simd_short) set(int16_t value) {
688  return _mm256_set1_epi16(value);
689  }
690 
694  ETL_STATIC_INLINE(avx_simd_int) set(int32_t value) {
695  return _mm256_set1_epi32(value);
696  }
697 
701  ETL_STATIC_INLINE(avx_simd_long) set(int64_t value) {
702  return _mm256_set1_epi64x(value);
703  }
704 #endif
705 
709  ETL_STATIC_INLINE(avx_simd_double) set(double value) {
710  return _mm256_set1_pd(value);
711  }
712 
716  ETL_STATIC_INLINE(avx_simd_float) set(float value) {
717  return _mm256_set1_ps(value);
718  }
719 
723  ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>) set(std::complex<float> value) {
724  std::complex<float> tmp[]{value, value, value, value};
725  return loadu(tmp);
726  }
727 
731  ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>) set(std::complex<double> value) {
732  std::complex<double> tmp[]{value, value};
733  return loadu(tmp);
734  }
735 
739  ETL_STATIC_INLINE(avx_simd_complex_float<etl::complex<float>>) set(etl::complex<float> value) {
740  etl::complex<float> tmp[]{value, value, value, value};
741  return loadu(tmp);
742  }
743 
747  ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) set(etl::complex<double> value) {
748  etl::complex<double> tmp[]{value, value};
749  return loadu(tmp);
750  }
751 
755  ETL_STATIC_INLINE(avx_simd_float) round_up(avx_simd_float x) {
756  return _mm256_round_ps(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
757  }
758 
762  ETL_STATIC_INLINE(avx_simd_double) round_up(avx_simd_double x) {
763  return _mm256_round_pd(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
764  }
765 
766  // Addition
767 
768 #ifdef __AVX2__
769 
772  ETL_STATIC_INLINE(avx_simd_byte) add(avx_simd_byte lhs, avx_simd_byte rhs) {
773  return _mm256_add_epi8(lhs.value, rhs.value);
774  }
775 
779  ETL_STATIC_INLINE(avx_simd_short) add(avx_simd_short lhs, avx_simd_short rhs) {
780  return _mm256_add_epi16(lhs.value, rhs.value);
781  }
782 
786  ETL_STATIC_INLINE(avx_simd_int) add(avx_simd_int lhs, avx_simd_int rhs) {
787  return _mm256_add_epi32(lhs.value, rhs.value);
788  }
789 
793  ETL_STATIC_INLINE(avx_simd_long) add(avx_simd_long lhs, avx_simd_long rhs) {
794  return _mm256_add_epi64(lhs.value, rhs.value);
795  }
796 #endif
797 
801  ETL_STATIC_INLINE(avx_simd_float) add(avx_simd_float lhs, avx_simd_float rhs) {
802  return _mm256_add_ps(lhs.value, rhs.value);
803  }
804 
808  ETL_STATIC_INLINE(avx_simd_double) add(avx_simd_double lhs, avx_simd_double rhs) {
809  return _mm256_add_pd(lhs.value, rhs.value);
810  }
811 
815  template <typename T>
816  ETL_STATIC_INLINE(avx_simd_complex_float<T>)
817  add(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
818  return _mm256_add_ps(lhs.value, rhs.value);
819  }
820 
824  template <typename T>
825  ETL_STATIC_INLINE(avx_simd_complex_double<T>)
826  add(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
827  return _mm256_add_pd(lhs.value, rhs.value);
828  }
829 
830  // Subtraction
831 
832 #ifdef __AVX2__
833 
836  ETL_STATIC_INLINE(avx_simd_byte) sub(avx_simd_byte lhs, avx_simd_byte rhs) {
837  return _mm256_sub_epi8(lhs.value, rhs.value);
838  }
839 
843  ETL_STATIC_INLINE(avx_simd_short) sub(avx_simd_short lhs, avx_simd_short rhs) {
844  return _mm256_sub_epi16(lhs.value, rhs.value);
845  }
846 
850  ETL_STATIC_INLINE(avx_simd_int) sub(avx_simd_int lhs, avx_simd_int rhs) {
851  return _mm256_sub_epi32(lhs.value, rhs.value);
852  }
853 
857  ETL_STATIC_INLINE(avx_simd_long) sub(avx_simd_long lhs, avx_simd_long rhs) {
858  return _mm256_sub_epi64(lhs.value, rhs.value);
859  }
860 #endif
861 
865  ETL_STATIC_INLINE(avx_simd_float) sub(avx_simd_float lhs, avx_simd_float rhs) {
866  return _mm256_sub_ps(lhs.value, rhs.value);
867  }
868 
872  ETL_STATIC_INLINE(avx_simd_double) sub(avx_simd_double lhs, avx_simd_double rhs) {
873  return _mm256_sub_pd(lhs.value, rhs.value);
874  }
875 
879  template <typename T>
880  ETL_STATIC_INLINE(avx_simd_complex_float<T>)
881  sub(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
882  return _mm256_sub_ps(lhs.value, rhs.value);
883  }
884 
888  template <typename T>
889  ETL_STATIC_INLINE(avx_simd_complex_double<T>)
890  sub(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
891  return _mm256_sub_pd(lhs.value, rhs.value);
892  }
893 
894  // Square root
895 
900  ETL_STATIC_INLINE(avx_simd_float) sqrt(avx_simd_float x) {
901  return _mm256_sqrt_ps(x.value);
902  }
903 
908  ETL_STATIC_INLINE(avx_simd_double) sqrt(avx_simd_double x) {
909  return _mm256_sqrt_pd(x.value);
910  }
911 
912  // Negation
913 
914  // TODO negation epi32
915 
920  ETL_STATIC_INLINE(avx_simd_float) minus(avx_simd_float x) {
921  return _mm256_xor_ps(x.value, _mm256_set1_ps(-0.f));
922  }
923 
928  ETL_STATIC_INLINE(avx_simd_double) minus(avx_simd_double x) {
929  return _mm256_xor_pd(x.value, _mm256_set1_pd(-0.));
930  }
931 
932  // Multiplication
933 
934 #ifdef __AVX2__
935 
938  ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
939  auto aodd = _mm256_srli_epi16(lhs.value, 8);
940  auto bodd = _mm256_srli_epi16(rhs.value, 8);
941  auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
942  auto mulodd = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
943  return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
944  }
945 
949  ETL_STATIC_INLINE(avx_simd_short) mul(avx_simd_short lhs, avx_simd_short rhs) {
950  return _mm256_mullo_epi16(lhs.value, rhs.value);
951  }
952 
956  ETL_STATIC_INLINE(avx_simd_int) mul(avx_simd_int lhs, avx_simd_int rhs) {
957  return _mm256_mullo_epi32(lhs.value, rhs.value);
958  }
959 
963  ETL_STATIC_INLINE(avx_simd_long) mul(avx_simd_long lhs, avx_simd_long rhs) {
964  int64_t result[4];
965 
966  result[0] = lhs[0] * rhs[0];
967  result[1] = lhs[1] * rhs[1];
968  result[2] = lhs[2] * rhs[2];
969  result[3] = lhs[3] * rhs[3];
970 
971  return loadu(&result[0]);
972  }
973 #endif
974 
978  ETL_STATIC_INLINE(avx_simd_float) mul(avx_simd_float lhs, avx_simd_float rhs) {
979  return _mm256_mul_ps(lhs.value, rhs.value);
980  }
981 
985  ETL_STATIC_INLINE(avx_simd_double) mul(avx_simd_double lhs, avx_simd_double rhs) {
986  return _mm256_mul_pd(lhs.value, rhs.value);
987  }
988 
992  template <typename T>
993  ETL_STATIC_INLINE(avx_simd_complex_float<T>)
994  mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
995  //lhs = [x1.real, x1.img, x2.real, x2.img, ...]
996  //rhs = [y1.real, y1.img, y2.real, y2.img, ...]
997 
998  //ymm1 = [y1.real, y1.real, y2.real, y2.real, ...]
999  __m256 ymm1 = _mm256_moveldup_ps(rhs.value);
1000 
1001  //ymm2 = [x1.img, x1.real, x2.img, x2.real]
1002  __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);
1003 
1004  //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
1005  __m256 ymm3 = _mm256_movehdup_ps(rhs.value);
1006 
1007  //ymm4 = ymm2 * ymm3
1008  __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);
1009 
1010  //result = [(lhs * ymm1) -+ ymm4];
1011 
1012 #ifdef __FMA__
1013  return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
1014 #elif defined(__FMA4__)
1015  return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
1016 #else
1017  __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
1018  return _mm256_addsub_ps(tmp, ymm4);
1019 #endif
1020  }
1021 
1025  template <typename T>
1026  ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1027  mul(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
1028  //lhs = [x1.real, x1.img, x2.real, x2.img]
1029  //rhs = [y1.real, y1.img, y2.real, y2.img]
1030 
1031  //ymm1 = [y1.real, y1.real, y2.real, y2.real]
1032  __m256d ymm1 = _mm256_movedup_pd(rhs.value);
1033 
1034  //ymm2 = [x1.img, x1.real, x2.img, x2.real]
1035  __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);
1036 
1037  //ymm3 = [y1.imag, y1.imag, y2.imag, y2.imag]
1038  __m256d ymm3 = _mm256_permute_pd(rhs.value, 0b1111);
1039 
1040  //ymm4 = ymm2 * ymm3
1041  __m256d ymm4 = _mm256_mul_pd(ymm2, ymm3);
1042 
1043  //result = [(lhs * ymm1) -+ ymm4];
1044 
1045 #ifdef __FMA__
1046  return _mm256_fmaddsub_pd(lhs.value, ymm1, ymm4);
1047 #elif defined(__FMA4__)
1048  return _mm256_maddsub_pd(lhs.value, ymm1, ymm4);
1049 #else
1050  __m256d tmp = _mm256_mul_pd(lhs.value, ymm1);
1051  return _mm256_addsub_pd(tmp, ymm4);
1052 #endif
1053  }
1054 
1055  // Fused Multiplay Add (FMA)
1056 
1057 #ifdef __AVX2__
1058 
1061  ETL_STATIC_INLINE(avx_simd_byte) fmadd(avx_simd_byte a, avx_simd_byte b, avx_simd_byte c) {
1062  return add(mul(a, b), c);
1063  }
1064 
1068  ETL_STATIC_INLINE(avx_simd_short) fmadd(avx_simd_short a, avx_simd_short b, avx_simd_short c) {
1069  return add(mul(a, b), c);
1070  }
1071 
1075  ETL_STATIC_INLINE(avx_simd_int) fmadd(avx_simd_int a, avx_simd_int b, avx_simd_int c) {
1076  return add(mul(a, b), c);
1077  }
1078 
1082  ETL_STATIC_INLINE(avx_simd_long) fmadd(avx_simd_long a, avx_simd_long b, avx_simd_long c) {
1083  return add(mul(a, b), c);
1084  }
1085 #endif
1086 
1090  ETL_STATIC_INLINE(avx_simd_float) fmadd(avx_simd_float a, avx_simd_float b, avx_simd_float c) {
1091 #ifdef __FMA__
1092  return _mm256_fmadd_ps(a.value, b.value, c.value);
1093 #else
1094  return add(mul(a, b), c);
1095 #endif
1096  }
1097 
1101  ETL_STATIC_INLINE(avx_simd_double) fmadd(avx_simd_double a, avx_simd_double b, avx_simd_double c) {
1102 #ifdef __FMA__
1103  return _mm256_fmadd_pd(a.value, b.value, c.value);
1104 #else
1105  return add(mul(a, b), c);
1106 #endif
1107  }
1108 
1112  template <typename T>
1113  ETL_STATIC_INLINE(avx_simd_complex_float<T>)
1114  fmadd(avx_simd_complex_float<T> a, avx_simd_complex_float<T> b, avx_simd_complex_float<T> c) {
1115  return add(mul(a, b), c);
1116  }
1117 
1121  template <typename T>
1122  ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1123  fmadd(avx_simd_complex_double<T> a, avx_simd_complex_double<T> b, avx_simd_complex_double<T> c) {
1124  return add(mul(a, b), c);
1125  }
1126 
1127  // Division
1128 
1132  ETL_STATIC_INLINE(avx_simd_float) div(avx_simd_float lhs, avx_simd_float rhs) {
1133  return _mm256_div_ps(lhs.value, rhs.value);
1134  }
1135 
1139  ETL_STATIC_INLINE(avx_simd_double) div(avx_simd_double lhs, avx_simd_double rhs) {
1140  return _mm256_div_pd(lhs.value, rhs.value);
1141  }
1142 
1146  template <typename T>
1147  ETL_STATIC_INLINE(avx_simd_complex_float<T>)
1148  div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
1149  //lhs = [x1.real, x1.img, x2.real, x2.img ...]
1150  //rhs = [y1.real, y1.img, y2.real, y2.img ...]
1151 
1152  //ymm0 = [y1.real, y1.real, y2.real, y2.real, ...]
1153  __m256 ymm0 = _mm256_moveldup_ps(rhs.value);
1154 
1155  //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
1156  __m256 ymm1 = _mm256_movehdup_ps(rhs.value);
1157 
1158  //ymm2 = [x1.img, x1.real, x2.img, x2.real]
1159  __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);
1160 
1161  //ymm4 = [x.img * y.img, x.real * y.img]
1162  __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);
1163 
1164  //ymm5 = subadd((lhs * ymm0), ymm4)
1165 
1166 #ifdef __FMA__
1167  __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
1168 #else
1169  __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
1170  __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
1171  __m256 ymm5 = _mm256_addsub_ps(t1, t2);
1172 #endif
1173 
1174  //ymm3 = [y.imag^2, y.imag^2]
1175  __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);
1176 
1177  //ymm0 = (ymm0 * ymm0 + ymm3)
1178 
1179 #ifdef __FMA__
1180  ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
1181 #else
1182  __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
1183  ymm0 = _mm256_add_ps(t3, ymm3);
1184 #endif
1185 
1186  //result = ymm5 / ymm0
1187  return _mm256_div_ps(ymm5, ymm0);
1188  }
1189 
1193  template <typename T>
1194  ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1195  div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
1196  //lhs = [x1.real, x1.img, x2.real, x2.img]
1197  //rhs = [y1.real, y1.img, y2.real, y2.img]
1198 
1199  //ymm0 = [y1.real, y1.real, y2.real, y2.real]
1200  __m256d ymm0 = _mm256_movedup_pd(rhs.value);
1201 
1202  //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
1203  __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);
1204 
1205  //ymm2 = [x1.img, x1.real, x2.img, x2.real]
1206  __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);
1207 
1208  //ymm4 = [x.img * y.img, x.real * y.img]
1209  __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);
1210 
1211  //ymm5 = subadd((lhs * ymm0), ymm4)
1212 
1213 #ifdef __FMA__
1214  __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
1215 #else
1216  __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
1217  __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
1218  __m256d ymm5 = _mm256_addsub_pd(t1, t2);
1219 #endif
1220 
1221  //ymm3 = [y.imag^2, y.imag^2]
1222  __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);
1223 
1224  //ymm0 = (ymm0 * ymm0 + ymm3)
1225 
1226 #ifdef __FMA__
1227  ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
1228 #else
1229  __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
1230  ymm0 = _mm256_add_pd(t3, ymm3);
1231 #endif
1232 
1233  //result = ymm5 / ymm0
1234  return _mm256_div_pd(ymm5, ymm0);
1235  }
1236 
1237  // Cosinus
1238 
1242  ETL_STATIC_INLINE(avx_simd_float) cos(avx_simd_float x) {
1243  return etl::cos256_ps(x.value);
1244  }
1245 
1249  ETL_STATIC_INLINE(avx_simd_float) sin(avx_simd_float x) {
1250  return etl::sin256_ps(x.value);
1251  }
1252 
1253 #ifndef __INTEL_COMPILER
1254 
1255  //Exponential
1256 
1260  ETL_STATIC_INLINE(avx_simd_float) exp(avx_simd_float x) {
1261  return etl::exp256_ps(x.value);
1262  }
1263 
1267  ETL_STATIC_INLINE(avx_simd_double) exp(avx_simd_double x) {
1268  return etl::exp256_pd(x.value);
1269  }
1270 
1271  //Logarithm
1272 
1276  ETL_STATIC_INLINE(avx_simd_float) log(avx_simd_float x) {
1277  return etl::log256_ps(x.value);
1278  }
1279 
1280 #else //__INTEL_COMPILER
1281 
1282  //Exponential
1283 
1287  ETL_STATIC_INLINE(avx_simd_double) exp(avx_simd_double x) {
1288  return _mm256_exp_pd(x.value);
1289  }
1290 
1294  ETL_STATIC_INLINE(avx_simd_float) exp(avx_simd_float x) {
1295  return _mm256_exp_ps(x.value);
1296  }
1297 
1298  //Logarithm
1299 
1303  ETL_STATIC_INLINE(avx_simd_double) log(avx_simd_double x) {
1304  return _mm256_log_pd(x.value);
1305  }
1306 
1310  ETL_STATIC_INLINE(avx_simd_float) log(avx_simd_float x) {
1311  return _mm256_log_ps(x.value);
1312  }
1313 
1314 #endif //__INTEL_COMPILER
1315 
1316  //Min
1317 
1321  ETL_STATIC_INLINE(avx_simd_double) min(avx_simd_double lhs, avx_simd_double rhs) {
1322  return _mm256_min_pd(lhs.value, rhs.value);
1323  }
1324 
1328  ETL_STATIC_INLINE(avx_simd_float) min(avx_simd_float lhs, avx_simd_float rhs) {
1329  return _mm256_min_ps(lhs.value, rhs.value);
1330  }
1331 
1332  //Max
1333 
1337  ETL_STATIC_INLINE(avx_simd_double) max(avx_simd_double lhs, avx_simd_double rhs) {
1338  return _mm256_max_pd(lhs.value, rhs.value);
1339  }
1340 
1344  ETL_STATIC_INLINE(avx_simd_float) max(avx_simd_float lhs, avx_simd_float rhs) {
1345  return _mm256_max_ps(lhs.value, rhs.value);
1346  }
1347 
1353  ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
1354  const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
1355  const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
1356  const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
1357  return _mm_cvtss_f32(x32);
1358  }
1359 
1365  ETL_STATIC_INLINE(double) hadd(avx_simd_double in) {
1366  const __m256d t1 = _mm256_hadd_pd(in.value, _mm256_permute2f128_pd(in.value, in.value, 1));
1367  const __m256d t2 = _mm256_hadd_pd(t1, t1);
1368  return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
1369  }
1370 
1371  //TODO Vectorize the following functions
1372 
1378  ETL_STATIC_INLINE(int8_t) hadd(avx_simd_byte in) {
1379  return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15] + in[16]
1380  + in[17] + in[18] + in[19] + in[20] + in[21] + in[22] + in[23] + in[24] + in[25] + in[26] + in[27] + in[28] + in[29] + in[30] + in[31];
1381  }
1382 
1388  ETL_STATIC_INLINE(int16_t) hadd(avx_simd_short in) {
1389  return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15];
1390  }
1391 
1397  ETL_STATIC_INLINE(int32_t) hadd(avx_simd_int in) {
1398  return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
1399  }
1400 
1406  ETL_STATIC_INLINE(int64_t) hadd(avx_simd_long in) {
1407  return in[0] + in[1] + in[2] + in[3];
1408  }
1409 
1415  template <typename T>
1416  ETL_STATIC_INLINE(T)
1417  hadd(avx_simd_complex_float<T> in) {
1418  return in[0] + in[1] + in[2] + in[3];
1419  }
1420 
1426  template <typename T>
1427  ETL_STATIC_INLINE(T)
1428  hadd(avx_simd_complex_double<T> in) {
1429  return in[0] + in[1];
1430  }
1431 };
1432 
#ifdef __AVX2__

/*!
 * \brief Return a packed vector of 32 zero bytes
 */
template <>
ETL_OUT_INLINE(avx_simd_byte)
avx_vec::zero<int8_t>() {
    return _mm256_setzero_si256();
}

/*!
 * \brief Return a packed vector of 16 zero shorts
 */
template <>
ETL_OUT_INLINE(avx_simd_short)
avx_vec::zero<int16_t>() {
    return _mm256_setzero_si256();
}

/*!
 * \brief Return a packed vector of 8 zero 32-bit integers
 */
template <>
ETL_OUT_INLINE(avx_simd_int)
avx_vec::zero<int32_t>() {
    return _mm256_setzero_si256();
}

/*!
 * \brief Return a packed vector of 4 zero 64-bit integers
 */
template <>
ETL_OUT_INLINE(avx_simd_long)
avx_vec::zero<int64_t>() {
    return _mm256_setzero_si256();
}
#endif

/*!
 * \brief Return a packed vector of 8 zero floats
 */
template <>
ETL_OUT_INLINE(avx_simd_float)
avx_vec::zero<float>() {
    return _mm256_setzero_ps();
}

/*!
 * \brief Return a packed vector of 4 zero doubles
 */
template <>
ETL_OUT_INLINE(avx_simd_double)
avx_vec::zero<double>() {
    return _mm256_setzero_pd();
}

/*!
 * \brief Return a packed vector of 4 zero etl::complex<float>
 */
template <>
ETL_OUT_INLINE(avx_simd_complex_float<etl::complex<float>>)
avx_vec::zero<etl::complex<float>>() {
    return _mm256_setzero_ps();
}

/*!
 * \brief Return a packed vector of 2 zero etl::complex<double>
 */
template <>
ETL_OUT_INLINE(avx_simd_complex_double<etl::complex<double>>)
avx_vec::zero<etl::complex<double>>() {
    return _mm256_setzero_pd();
}

/*!
 * \brief Return a packed vector of 4 zero std::complex<float>
 */
template <>
ETL_OUT_INLINE(avx_simd_complex_float<std::complex<float>>)
avx_vec::zero<std::complex<float>>() {
    return _mm256_setzero_ps();
}

/*!
 * \brief Return a packed vector of 2 zero std::complex<double>
 */
template <>
ETL_OUT_INLINE(avx_simd_complex_double<std::complex<double>>)
avx_vec::zero<std::complex<double>>() {
    return _mm256_setzero_pd();
}
1524 
1525 } //end of namespace etl
1526 
1527 #endif //__AVX__
auto sin(E &&value) -> detail::unary_helper< E, sin_unary_op >
Apply sinus on each value of the given expression.
Definition: function_expression_builder.hpp:114
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
Complex number implementation.
Definition: complex.hpp:31
auto mul(A &&a, B &&b)
Multiply two matrices together.
Definition: gemm_expr.hpp:442
void minus([[maybe_unused]] size_t n, [[maybe_unused]] float alpha, [[maybe_unused]] float *A, [[maybe_unused]] size_t lda, [[maybe_unused]] float *B, [[maybe_unused]] size_t ldb)
Wrappers for single-precision egblas minus operation.
Definition: minus.hpp:43
auto sqrt(E &&value) -> detail::unary_helper< E, sqrt_unary_op >
Apply square root on each value of the given expression.
Definition: function_expression_builder.hpp:24
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
constexpr bool avx2_enabled
Indicates if AVX2 is available.
Definition: config.hpp:210
auto cos(E &&value) -> detail::unary_helper< E, cos_unary_op >
Apply cosinus on each value of the given expression.
Definition: function_expression_builder.hpp:104
auto load(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:143
Root namespace for the ETL library.
Definition: adapter.hpp:15
void store(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:176
void stream(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once, using non-temporal store.
Definition: dyn_matrix_view.hpp:165
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
auto min(L &&lhs, R &&rhs)
Create an expression with the min value of lhs or rhs.
Definition: expression_builder.hpp:77
auto exp(E &&value) -> detail::unary_helper< E, exp_unary_op >
Apply exponential on each value of the given expression.
Definition: function_expression_builder.hpp:154
Inlining macros.
auto log(E &&value) -> detail::unary_helper< E, log_unary_op >
Apply logarithm (base e) on each value of the given expression.
Definition: function_expression_builder.hpp:64