17 #include <immintrin.h> 18 #include <emmintrin.h> 19 #include <xmmintrin.h> 22 #include "etl/avx_exp.hpp" 28 #define ETL_INLINE_VEC_VOID ETL_STATIC_INLINE(void) 29 #define ETL_INLINE_VEC_256 ETL_STATIC_INLINE(__m256) 30 #define ETL_INLINE_VEC_256D ETL_STATIC_INLINE(__m256d) 31 #define ETL_OUT_VEC_256 ETL_OUT_INLINE(__m256) 32 #define ETL_OUT_VEC_256D ETL_OUT_INLINE(__m256d) 39 using avx_simd_float = simd_pack<vector_mode_t::AVX, float, __m256>;
// AVX pack of double values, stored in a __m256d register
44 using avx_simd_double = simd_pack<vector_mode_t::AVX, double, __m256d>;
// AVX pack of single-precision complex values of type T, stored in a __m256 register
// (the template header introducing T is not visible in this excerpt)
50 using avx_simd_complex_float = simd_pack<vector_mode_t::AVX, T, __m256>;
// AVX pack of double-precision complex values of type T, stored in a __m256d register
// (the template header introducing T is not visible in this excerpt)
56 using avx_simd_complex_double = simd_pack<vector_mode_t::AVX, T, __m256d>;
// AVX pack of int8_t values, stored in a __m256i register
61 using avx_simd_byte = simd_pack<vector_mode_t::AVX, int8_t, __m256i>;
// AVX pack of int16_t values, stored in a __m256i register
66 using avx_simd_short = simd_pack<vector_mode_t::AVX, int16_t, __m256i>;
// AVX pack of int32_t values, stored in a __m256i register
71 using avx_simd_int = simd_pack<vector_mode_t::AVX, int32_t, __m256i>;
// AVX pack of int64_t values, stored in a __m256i register
76 using avx_simd_long = simd_pack<vector_mode_t::AVX, int64_t, __m256i>;
// Primary traits template: describes a type T that has no AVX vector form.
// (The template header introducing T is not visible in this excerpt.)
82 struct avx_intrinsic_traits {
// Not vectorizable by default
83 static constexpr
bool vectorizable =
false;
// A "pack" holds a single scalar
84 static constexpr
size_t size = 1;
// Natural alignment of the scalar type, in bytes
85 static constexpr
size_t alignment =
alignof(T);
// The scalar itself stands in for the vector type
87 using intrinsic_type = T;
// AVX traits for float
94 struct avx_intrinsic_traits<float> {
// float has an AVX vector form
95 static constexpr
bool vectorizable =
true;
// 8 floats per 256-bit pack
96 static constexpr
size_t size = 8;
// Alignment in bytes (the full width of a 256-bit register)
97 static constexpr
size_t alignment = 32;
// Pack type used for float
99 using intrinsic_type = avx_simd_float;
// AVX traits for double
106 struct avx_intrinsic_traits<double> {
// double has an AVX vector form
107 static constexpr
bool vectorizable =
true;
// 4 doubles per 256-bit pack
108 static constexpr
size_t size = 4;
// Alignment in bytes (the full width of a 256-bit register)
109 static constexpr
size_t alignment = 32;
// Pack type used for double
111 using intrinsic_type = avx_simd_double;
// AVX traits for std::complex<float>
118 struct avx_intrinsic_traits<std::complex<float>> {
// std::complex<float> has an AVX vector form
119 static constexpr
bool vectorizable =
true;
// 4 complex values (8 floats) per 256-bit pack
120 static constexpr
size_t size = 4;
// Alignment in bytes (the full width of a 256-bit register)
121 static constexpr
size_t alignment = 32;
// Pack type used for std::complex<float>
123 using intrinsic_type = avx_simd_complex_float<std::complex<float>>;
// AVX traits for std::complex<double>
130 struct avx_intrinsic_traits<std::complex<double>> {
// std::complex<double> has an AVX vector form
131 static constexpr
bool vectorizable =
true;
// 2 complex values (4 doubles) per 256-bit pack
132 static constexpr
size_t size = 2;
// Alignment in bytes (the full width of a 256-bit register)
133 static constexpr
size_t alignment = 32;
// Pack type used for std::complex<double>
135 using intrinsic_type = avx_simd_complex_double<std::complex<double>>;
// AVX traits for etl::complex<float>; mirrors the std::complex<float> specialization
142 struct avx_intrinsic_traits<
etl::complex<float>> {
// etl::complex<float> has an AVX vector form
143 static constexpr
bool vectorizable =
true;
// 4 complex values per 256-bit pack
144 static constexpr
size_t size = 4;
// Alignment in bytes (the full width of a 256-bit register)
145 static constexpr
size_t alignment = 32;
// Pack type used for etl::complex<float>
147 using intrinsic_type = avx_simd_complex_float<etl::complex<float>>;
// AVX traits for etl::complex<double>; mirrors the std::complex<double> specialization
154 struct avx_intrinsic_traits<
etl::complex<double>> {
// etl::complex<double> has an AVX vector form
155 static constexpr
bool vectorizable =
true;
// 2 complex values per 256-bit pack
156 static constexpr
size_t size = 2;
// Alignment in bytes (the full width of a 256-bit register)
157 static constexpr
size_t alignment = 32;
// Pack type used for etl::complex<double>
159 using intrinsic_type = avx_simd_complex_double<etl::complex<double>>;
// AVX traits for int8_t.
// NOTE(review): the `vectorizable` member sits on a line not visible in this excerpt.
166 struct avx_intrinsic_traits<int8_t> {
// 32 int8_t values per 256-bit pack
168 static constexpr
size_t size = 32;
// Alignment in bytes (the full width of a 256-bit register)
169 static constexpr
size_t alignment = 32;
// Pack type used for int8_t
171 using intrinsic_type = avx_simd_byte;
// AVX traits for int16_t.
// NOTE(review): the `vectorizable` member sits on a line not visible in this excerpt.
178 struct avx_intrinsic_traits<int16_t> {
// 16 int16_t values per 256-bit pack
180 static constexpr
size_t size = 16;
// Alignment in bytes (the full width of a 256-bit register)
181 static constexpr
size_t alignment = 32;
// Pack type used for int16_t
183 using intrinsic_type = avx_simd_short;
// AVX traits for int32_t.
// NOTE(review): the `vectorizable` member sits on a line not visible in this excerpt.
190 struct avx_intrinsic_traits<int32_t> {
// 8 int32_t values per 256-bit pack
192 static constexpr
size_t size = 8;
// Alignment in bytes (the full width of a 256-bit register)
193 static constexpr
size_t alignment = 32;
// Pack type used for int32_t
195 using intrinsic_type = avx_simd_int;
// AVX traits for int64_t.
// NOTE(review): the `vectorizable` member sits on a line not visible in this excerpt.
202 struct avx_intrinsic_traits<int64_t> {
// 4 int64_t values per 256-bit pack
204 static constexpr
size_t size = 4;
// Alignment in bytes (the full width of a 256-bit register)
205 static constexpr
size_t alignment = 32;
// Pack type used for int64_t
207 using intrinsic_type = avx_simd_long;
// Shortcut to the AVX traits of the element type T
217 template <
typename T>
218 using traits = avx_intrinsic_traits<T>;
// Pack type (or T itself for non-vectorizable types) associated with the element type T
223 template <
typename T>
224 using vec_type =
typename traits<T>::intrinsic_type;
/*!
 * \brief Print the four double lanes of an AVX vector to std::cout
 * \param value The vector to print; must be convertible to __m256d
 * \return An empty string; the formatted lanes go to std::cout only
 */
template <typename T>
static std::string debug_d(T value) {
    // Copy the lanes to a plain array with an unaligned store instead of
    // reading the inactive member of a union
    const __m256d vec = value;
    double lanes[4];
    _mm256_storeu_pd(lanes, vec);

    std::cout << "[" << lanes[0] << "," << lanes[1] << "," << lanes[2] << "," << lanes[3] << "]" << std::endl;

    // Flowing off the end of a function returning std::string is undefined
    // behaviour, so return an explicit (empty) value
    return {};
}
/*!
 * \brief Print the eight float lanes of an AVX vector to std::cout
 * \param value The vector to print; must be convertible to __m256
 * \return An empty string; the formatted lanes go to std::cout only
 */
template <typename T>
static std::string debug_s(T value) {
    // Copy the lanes to a plain array with an unaligned store instead of
    // reading the inactive member of a union
    const __m256 vec = value;
    float lanes[8];
    _mm256_storeu_ps(lanes, vec);

    std::cout << "[" << lanes[0] << "," << lanes[1] << "," << lanes[2] << "," << lanes[3] << "," << lanes[4] << "," << lanes[5] << "," << lanes[6] << ","
              << lanes[7] << "]" << std::endl;

    // Flowing off the end of a function returning std::string is undefined
    // behaviour, so return an explicit (empty) value
    return {};
}
264 template <
typename T>
265 static std::string debug_d(T) {
272 template <
typename T>
273 static std::string debug_s(T) {
284 ETL_STATIC_INLINE(
void)
storeu(int8_t* memory, avx_simd_byte value) {
285 _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
292 ETL_STATIC_INLINE(
void)
storeu(int16_t* memory, avx_simd_short value) {
293 _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
300 ETL_STATIC_INLINE(
void)
storeu(int32_t* memory, avx_simd_int value) {
301 _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
308 ETL_STATIC_INLINE(
void)
storeu(int64_t* memory, avx_simd_long value) {
309 _mm256_storeu_si256(reinterpret_cast<__m256i*>(memory), value.value);
317 ETL_STATIC_INLINE(
void)
storeu(
float* memory, avx_simd_float value) {
318 _mm256_storeu_ps(memory, value.value);
325 ETL_STATIC_INLINE(
void)
storeu(
double* memory, avx_simd_double value) {
326 _mm256_storeu_pd(memory, value.value);
333 ETL_STATIC_INLINE(
void)
storeu(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
334 _mm256_storeu_ps(reinterpret_cast<float*>(memory), value.value);
341 ETL_STATIC_INLINE(
void)
storeu(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
342 _mm256_storeu_pd(reinterpret_cast<double*>(memory), value.value);
350 _mm256_storeu_ps(reinterpret_cast<float*>(memory), value.value);
358 _mm256_storeu_pd(reinterpret_cast<double*>(memory), value.value);
366 ETL_STATIC_INLINE(
void)
stream(int8_t* memory, avx_simd_byte value) {
367 _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
374 ETL_STATIC_INLINE(
void)
stream(int16_t* memory, avx_simd_short value) {
375 _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
382 ETL_STATIC_INLINE(
void)
stream(int32_t* memory, avx_simd_int value) {
383 _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
390 ETL_STATIC_INLINE(
void)
stream(int64_t* memory, avx_simd_long value) {
391 _mm256_stream_si256(reinterpret_cast<__m256i*>(memory), value.value);
399 ETL_STATIC_INLINE(
void)
stream(
float* memory, avx_simd_float value) {
400 _mm256_stream_ps(memory, value.value);
407 ETL_STATIC_INLINE(
void)
stream(
double* memory, avx_simd_double value) {
408 _mm256_stream_pd(memory, value.value);
415 ETL_STATIC_INLINE(
void)
stream(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
416 _mm256_stream_ps(reinterpret_cast<float*>(memory), value.value);
423 ETL_STATIC_INLINE(
void)
stream(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
424 _mm256_stream_pd(reinterpret_cast<double*>(memory), value.value);
432 _mm256_stream_ps(reinterpret_cast<float*>(memory), value.value);
440 _mm256_stream_pd(reinterpret_cast<double*>(memory), value.value);
448 ETL_STATIC_INLINE(
void)
store(int8_t* memory, avx_simd_byte value) {
449 _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
456 ETL_STATIC_INLINE(
void)
store(int16_t* memory, avx_simd_short value) {
457 _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
464 ETL_STATIC_INLINE(
void)
store(int32_t* memory, avx_simd_int value) {
465 _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
472 ETL_STATIC_INLINE(
void)
store(int64_t* memory, avx_simd_long value) {
473 _mm256_store_si256(reinterpret_cast<__m256i*>(memory), value.value);
481 ETL_STATIC_INLINE(
void)
store(
float* memory, avx_simd_float value) {
482 _mm256_store_ps(memory, value.value);
489 ETL_STATIC_INLINE(
void)
store(
double* memory, avx_simd_double value) {
490 _mm256_store_pd(memory, value.value);
497 ETL_STATIC_INLINE(
void)
store(std::complex<float>* memory, avx_simd_complex_float<std::complex<float>> value) {
498 _mm256_store_ps(reinterpret_cast<float*>(memory), value.value);
505 ETL_STATIC_INLINE(
void)
store(std::complex<double>* memory, avx_simd_complex_double<std::complex<double>> value) {
506 _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
514 _mm256_store_ps(reinterpret_cast<float*>(memory), value.value);
522 _mm256_store_pd(reinterpret_cast<double*>(memory), value.value);
528 template <
typename T>
529 ETL_TMP_INLINE(
typename avx_intrinsic_traits<T>::intrinsic_type)
536 ETL_STATIC_INLINE(avx_simd_byte)
load(
const int8_t* memory) {
537 return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
543 ETL_STATIC_INLINE(avx_simd_short)
load(
const int16_t* memory) {
544 return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
550 ETL_STATIC_INLINE(avx_simd_int)
load(
const int32_t* memory) {
551 return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
557 ETL_STATIC_INLINE(avx_simd_long)
load(
const int64_t* memory) {
558 return _mm256_load_si256(reinterpret_cast<const __m256i*>(memory));
565 ETL_STATIC_INLINE(avx_simd_float)
load(
const float* memory) {
566 return _mm256_load_ps(memory);
572 ETL_STATIC_INLINE(avx_simd_double)
load(
const double* memory) {
573 return _mm256_load_pd(memory);
579 ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>)
load(const std::complex<
float>* memory) {
580 return _mm256_load_ps(reinterpret_cast<const float*>(memory));
586 ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>)
load(const std::complex<
double>* memory) {
587 return _mm256_load_pd(reinterpret_cast<const double*>(memory));
594 return _mm256_load_ps(reinterpret_cast<const float*>(memory));
601 return _mm256_load_pd(reinterpret_cast<const double*>(memory));
608 ETL_STATIC_INLINE(avx_simd_byte)
loadu(
const int8_t* memory) {
609 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
615 ETL_STATIC_INLINE(avx_simd_short)
loadu(
const int16_t* memory) {
616 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
622 ETL_STATIC_INLINE(avx_simd_int)
loadu(
const int32_t* memory) {
623 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
629 ETL_STATIC_INLINE(avx_simd_long)
loadu(
const int64_t* memory) {
630 return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(memory));
637 ETL_STATIC_INLINE(avx_simd_float)
loadu(
const float* memory) {
638 return _mm256_loadu_ps(memory);
644 ETL_STATIC_INLINE(avx_simd_double)
loadu(
const double* memory) {
645 return _mm256_loadu_pd(memory);
651 ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>)
loadu(const std::complex<
float>* memory) {
652 return _mm256_loadu_ps(reinterpret_cast<const float*>(memory));
658 ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>)
loadu(const std::complex<
double>* memory) {
659 return _mm256_loadu_pd(reinterpret_cast<const double*>(memory));
666 return _mm256_loadu_ps(reinterpret_cast<const float*>(memory));
673 return _mm256_loadu_pd(reinterpret_cast<const double*>(memory));
680 ETL_STATIC_INLINE(avx_simd_byte)
set(int8_t value) {
681 return _mm256_set1_epi8(value);
687 ETL_STATIC_INLINE(avx_simd_short)
set(int16_t value) {
688 return _mm256_set1_epi16(value);
694 ETL_STATIC_INLINE(avx_simd_int)
set(int32_t value) {
695 return _mm256_set1_epi32(value);
701 ETL_STATIC_INLINE(avx_simd_long)
set(int64_t value) {
702 return _mm256_set1_epi64x(value);
709 ETL_STATIC_INLINE(avx_simd_double)
set(
double value) {
710 return _mm256_set1_pd(value);
716 ETL_STATIC_INLINE(avx_simd_float)
set(
float value) {
717 return _mm256_set1_ps(value);
// Build a pack of std::complex<float> from a single value.
// NOTE(review): the statement that turns `tmp` into the returned pack is not visible in this excerpt.
723 ETL_STATIC_INLINE(avx_simd_complex_float<std::complex<float>>) set(std::complex<
float> value) {
// Four copies of the value: one per complex lane of the pack
724 std::complex<float> tmp[]{value, value, value, value};
// Build a pack of std::complex<double> from a single value.
// NOTE(review): the statement that turns `tmp` into the returned pack is not visible in this excerpt.
731 ETL_STATIC_INLINE(avx_simd_complex_double<std::complex<double>>) set(std::complex<
double> value) {
// Two copies of the value: one per complex lane of the pack
732 std::complex<double> tmp[]{value, value};
755 ETL_STATIC_INLINE(avx_simd_float) round_up(avx_simd_float x) {
756 return _mm256_round_ps(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
762 ETL_STATIC_INLINE(avx_simd_double) round_up(avx_simd_double x) {
763 return _mm256_round_pd(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
772 ETL_STATIC_INLINE(avx_simd_byte) add(avx_simd_byte lhs, avx_simd_byte rhs) {
773 return _mm256_add_epi8(lhs.value, rhs.value);
779 ETL_STATIC_INLINE(avx_simd_short) add(avx_simd_short lhs, avx_simd_short rhs) {
780 return _mm256_add_epi16(lhs.value, rhs.value);
786 ETL_STATIC_INLINE(avx_simd_int) add(avx_simd_int lhs, avx_simd_int rhs) {
787 return _mm256_add_epi32(lhs.value, rhs.value);
793 ETL_STATIC_INLINE(avx_simd_long) add(avx_simd_long lhs, avx_simd_long rhs) {
794 return _mm256_add_epi64(lhs.value, rhs.value);
801 ETL_STATIC_INLINE(avx_simd_float) add(avx_simd_float lhs, avx_simd_float rhs) {
802 return _mm256_add_ps(lhs.value, rhs.value);
808 ETL_STATIC_INLINE(avx_simd_double) add(avx_simd_double lhs, avx_simd_double rhs) {
809 return _mm256_add_pd(lhs.value, rhs.value);
815 template <
typename T>
816 ETL_STATIC_INLINE(avx_simd_complex_float<T>)
817 add(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
818 return _mm256_add_ps(lhs.value, rhs.value);
824 template <
typename T>
825 ETL_STATIC_INLINE(avx_simd_complex_double<T>)
826 add(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
827 return _mm256_add_pd(lhs.value, rhs.value);
836 ETL_STATIC_INLINE(avx_simd_byte) sub(avx_simd_byte lhs, avx_simd_byte rhs) {
837 return _mm256_sub_epi8(lhs.value, rhs.value);
843 ETL_STATIC_INLINE(avx_simd_short) sub(avx_simd_short lhs, avx_simd_short rhs) {
844 return _mm256_sub_epi16(lhs.value, rhs.value);
850 ETL_STATIC_INLINE(avx_simd_int) sub(avx_simd_int lhs, avx_simd_int rhs) {
851 return _mm256_sub_epi32(lhs.value, rhs.value);
857 ETL_STATIC_INLINE(avx_simd_long) sub(avx_simd_long lhs, avx_simd_long rhs) {
858 return _mm256_sub_epi64(lhs.value, rhs.value);
865 ETL_STATIC_INLINE(avx_simd_float) sub(avx_simd_float lhs, avx_simd_float rhs) {
866 return _mm256_sub_ps(lhs.value, rhs.value);
872 ETL_STATIC_INLINE(avx_simd_double) sub(avx_simd_double lhs, avx_simd_double rhs) {
873 return _mm256_sub_pd(lhs.value, rhs.value);
879 template <
typename T>
880 ETL_STATIC_INLINE(avx_simd_complex_float<T>)
881 sub(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
882 return _mm256_sub_ps(lhs.value, rhs.value);
888 template <
typename T>
889 ETL_STATIC_INLINE(avx_simd_complex_double<T>)
890 sub(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
891 return _mm256_sub_pd(lhs.value, rhs.value);
900 ETL_STATIC_INLINE(avx_simd_float)
sqrt(avx_simd_float x) {
901 return _mm256_sqrt_ps(x.value);
908 ETL_STATIC_INLINE(avx_simd_double)
sqrt(avx_simd_double x) {
909 return _mm256_sqrt_pd(x.value);
920 ETL_STATIC_INLINE(avx_simd_float)
minus(avx_simd_float x) {
921 return _mm256_xor_ps(x.value, _mm256_set1_ps(-0.f));
928 ETL_STATIC_INLINE(avx_simd_double)
minus(avx_simd_double x) {
929 return _mm256_xor_pd(x.value, _mm256_set1_pd(-0.));
938 ETL_STATIC_INLINE(avx_simd_byte)
mul(avx_simd_byte lhs, avx_simd_byte rhs) {
939 auto aodd = _mm256_srli_epi16(lhs.value, 8);
940 auto bodd = _mm256_srli_epi16(rhs.value, 8);
941 auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
942 auto mulodd = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
943 return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
949 ETL_STATIC_INLINE(avx_simd_short)
mul(avx_simd_short lhs, avx_simd_short rhs) {
950 return _mm256_mullo_epi16(lhs.value, rhs.value);
956 ETL_STATIC_INLINE(avx_simd_int)
mul(avx_simd_int lhs, avx_simd_int rhs) {
957 return _mm256_mullo_epi32(lhs.value, rhs.value);
// Lane-wise multiplication of two int64_t packs.
// No vector intrinsic is used here: each of the four lanes is multiplied as a scalar.
// NOTE(review): the declaration of `result` is on a line not visible in this excerpt.
963 ETL_STATIC_INLINE(avx_simd_long)
mul(avx_simd_long lhs, avx_simd_long rhs) {
// Scalar products, one per lane
966 result[0] = lhs[0] * rhs[0];
967 result[1] = lhs[1] * rhs[1];
968 result[2] = lhs[2] * rhs[2];
969 result[3] = lhs[3] * rhs[3];
// Reload the four products as a pack (unaligned load)
971 return loadu(&result[0]);
978 ETL_STATIC_INLINE(avx_simd_float)
mul(avx_simd_float lhs, avx_simd_float rhs) {
979 return _mm256_mul_ps(lhs.value, rhs.value);
985 ETL_STATIC_INLINE(avx_simd_double)
mul(avx_simd_double lhs, avx_simd_double rhs) {
986 return _mm256_mul_pd(lhs.value, rhs.value);
// Multiplication of two packs of single-precision complex numbers.
// With lhs = a+bi and rhs = c+di (assuming interleaved [re, im] storage — TODO confirm
// for T other than std::complex), the result is (ac - bd) + (bc + ad)i.
// Three alternative endings are visible; the preprocessor lines opening and closing
// the selection are not visible in this excerpt.
992 template <
typename T>
993 ETL_STATIC_INLINE(avx_simd_complex_float<T>)
994 mul(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
// ymm1 = [c, c]: even (real) elements of rhs duplicated
999 __m256 ymm1 = _mm256_moveldup_ps(rhs.value);
// ymm2 = [b, a]: real and imaginary parts of lhs swapped within each pair
1002 __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);
// ymm3 = [d, d]: odd (imaginary) elements of rhs duplicated
1005 __m256 ymm3 = _mm256_movehdup_ps(rhs.value);
// ymm4 = [bd, ad]
1008 __m256 ymm4 = _mm256_mul_ps(ymm2, ymm3);
// Fused form: even lanes ac - bd, odd lanes bc + ad
1013 return _mm256_fmaddsub_ps(lhs.value, ymm1, ymm4);
1014 #elif defined(__FMA4__) 1015 return _mm256_maddsub_ps(lhs.value, ymm1, ymm4);
// Non-fused fallback: tmp = [ac, bc], then subtract on even lanes and add on odd lanes
1017 __m256 tmp = _mm256_mul_ps(lhs.value, ymm1);
1018 return _mm256_addsub_ps(tmp, ymm4);
// Multiplication of two packs of double-precision complex numbers.
// With lhs = a+bi and rhs = c+di (assuming interleaved [re, im] storage — TODO confirm
// for T other than std::complex), the result is (ac - bd) + (bc + ad)i.
// Three alternative endings are visible; the preprocessor lines opening and closing
// the selection are not visible in this excerpt.
1025 template <
typename T>
1026 ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1027 mul(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
// ymm1 = [c, c]: even (real) elements of rhs duplicated
1032 __m256d ymm1 = _mm256_movedup_pd(rhs.value);
// ymm2 = [b, a]: real and imaginary parts of lhs swapped within each pair
1035 __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);
// ymm3 = [d, d]: odd (imaginary) elements of rhs duplicated
1038 __m256d ymm3 = _mm256_permute_pd(rhs.value, 0b1111);
// ymm4 = [bd, ad]
1041 __m256d ymm4 = _mm256_mul_pd(ymm2, ymm3);
// Fused form: even lanes ac - bd, odd lanes bc + ad
1046 return _mm256_fmaddsub_pd(lhs.value, ymm1, ymm4);
1047 #elif defined(__FMA4__) 1048 return _mm256_maddsub_pd(lhs.value, ymm1, ymm4);
// Non-fused fallback: tmp = [ac, bc], then subtract on even lanes and add on odd lanes
1050 __m256d tmp = _mm256_mul_pd(lhs.value, ymm1);
1051 return _mm256_addsub_pd(tmp, ymm4);
1061 ETL_STATIC_INLINE(avx_simd_byte) fmadd(avx_simd_byte a, avx_simd_byte b, avx_simd_byte c) {
1062 return add(
mul(a, b), c);
1068 ETL_STATIC_INLINE(avx_simd_short) fmadd(avx_simd_short a, avx_simd_short b, avx_simd_short c) {
1069 return add(
mul(a, b), c);
1075 ETL_STATIC_INLINE(avx_simd_int) fmadd(avx_simd_int a, avx_simd_int b, avx_simd_int c) {
1076 return add(
mul(a, b), c);
1082 ETL_STATIC_INLINE(avx_simd_long) fmadd(avx_simd_long a, avx_simd_long b, avx_simd_long c) {
1083 return add(
mul(a, b), c);
// Multiply-add of float packs: a * b + c for each lane.
// Two alternative bodies are visible; the preprocessor lines selecting between
// them are not visible in this excerpt.
1090 ETL_STATIC_INLINE(avx_simd_float) fmadd(avx_simd_float a, avx_simd_float b, avx_simd_float c) {
// Single fused multiply-add intrinsic
1092 return _mm256_fmadd_ps(a.value, b.value, c.value);
// Fallback: separate multiplication and addition
1094 return add(
mul(a, b), c);
// Multiply-add of double packs: a * b + c for each lane.
// Two alternative bodies are visible; the preprocessor lines selecting between
// them are not visible in this excerpt.
1101 ETL_STATIC_INLINE(avx_simd_double) fmadd(avx_simd_double a, avx_simd_double b, avx_simd_double c) {
// Single fused multiply-add intrinsic
1103 return _mm256_fmadd_pd(a.value, b.value, c.value);
// Fallback: separate multiplication and addition
1105 return add(
mul(a, b), c);
1112 template <
typename T>
1113 ETL_STATIC_INLINE(avx_simd_complex_float<T>)
1114 fmadd(avx_simd_complex_float<T> a, avx_simd_complex_float<T> b, avx_simd_complex_float<T> c) {
1115 return add(
mul(a, b), c);
1121 template <
typename T>
1122 ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1123 fmadd(avx_simd_complex_double<T> a, avx_simd_complex_double<T> b, avx_simd_complex_double<T> c) {
1124 return add(
mul(a, b), c);
1132 ETL_STATIC_INLINE(avx_simd_float) div(avx_simd_float lhs, avx_simd_float rhs) {
1133 return _mm256_div_ps(lhs.value, rhs.value);
1139 ETL_STATIC_INLINE(avx_simd_double) div(avx_simd_double lhs, avx_simd_double rhs) {
1140 return _mm256_div_pd(lhs.value, rhs.value);
// Division of two packs of single-precision complex numbers.
// With lhs = a+bi and rhs = c+di (assuming interleaved [re, im] storage — TODO confirm
// for T other than std::complex), the result is
// ((ac + bd) + (bc - ad)i) / (c*c + d*d).
// Fused and non-fused alternatives are both visible for the numerator and for the
// denominator; the preprocessor lines selecting between them are not visible in
// this excerpt.
1146 template <
typename T>
1147 ETL_STATIC_INLINE(avx_simd_complex_float<T>)
1148 div(avx_simd_complex_float<T> lhs, avx_simd_complex_float<T> rhs) {
// ymm0 = [c, c]
1153 __m256 ymm0 = _mm256_moveldup_ps(rhs.value);
// ymm1 = [d, d]
1156 __m256 ymm1 = _mm256_movehdup_ps(rhs.value);
// ymm2 = [b, a]
1159 __m256 ymm2 = _mm256_permute_ps(lhs.value, 0b10110001);
// ymm4 = [bd, ad]
1162 __m256 ymm4 = _mm256_mul_ps(ymm2, ymm1);
// Numerator, fused form: even lanes ac + bd, odd lanes bc - ad
1167 __m256 ymm5 = _mm256_fmsubadd_ps(lhs.value, ymm0, ymm4);
// Numerator, non-fused form: negate ymm4 so that addsub yields the same signs
1169 __m256 t1 = _mm256_mul_ps(lhs.value, ymm0);
1170 __m256 t2 = _mm256_sub_ps(_mm256_set1_ps(0.0), ymm4);
1171 __m256 ymm5 = _mm256_addsub_ps(t1, t2);
// ymm3 = [d*d, d*d]
1175 __m256 ymm3 = _mm256_mul_ps(ymm1, ymm1);
// Denominator, fused form: c*c + d*d
1180 ymm0 = _mm256_fmadd_ps(ymm0, ymm0, ymm3);
// Denominator, non-fused form
1182 __m256 t3 = _mm256_mul_ps(ymm0, ymm0);
1183 ymm0 = _mm256_add_ps(t3, ymm3);
// numerator / denominator, lane by lane
1187 return _mm256_div_ps(ymm5, ymm0);
// Division of two packs of double-precision complex numbers.
// With lhs = a+bi and rhs = c+di (assuming interleaved [re, im] storage — TODO confirm
// for T other than std::complex), the result is
// ((ac + bd) + (bc - ad)i) / (c*c + d*d).
// Fused and non-fused alternatives are both visible for the numerator and for the
// denominator; the preprocessor lines selecting between them are not visible in
// this excerpt.
1193 template <
typename T>
1194 ETL_STATIC_INLINE(avx_simd_complex_double<T>)
1195 div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
// ymm0 = [c, c]
1200 __m256d ymm0 = _mm256_movedup_pd(rhs.value);
// ymm1 = [d, d]
1203 __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);
// ymm2 = [b, a]
1206 __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);
// ymm4 = [bd, ad]
1209 __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);
// Numerator, fused form: even lanes ac + bd, odd lanes bc - ad
1214 __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
// Numerator, non-fused form: negate ymm4 so that addsub yields the same signs
1216 __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
1217 __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
1218 __m256d ymm5 = _mm256_addsub_pd(t1, t2);
// ymm3 = [d*d, d*d]
1222 __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);
// Denominator, fused form: c*c + d*d
1227 ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
// Denominator, non-fused form
1229 __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
1230 ymm0 = _mm256_add_pd(t3, ymm3);
// numerator / denominator, lane by lane
1234 return _mm256_div_pd(ymm5, ymm0);
1242 ETL_STATIC_INLINE(avx_simd_float)
cos(avx_simd_float x) {
1243 return etl::cos256_ps(x.value);
1249 ETL_STATIC_INLINE(avx_simd_float)
sin(avx_simd_float x) {
1250 return etl::sin256_ps(x.value);
1253 #ifndef __INTEL_COMPILER 1260 ETL_STATIC_INLINE(avx_simd_float)
exp(avx_simd_float x) {
1261 return etl::exp256_ps(x.value);
1267 ETL_STATIC_INLINE(avx_simd_double)
exp(avx_simd_double x) {
1268 return etl::exp256_pd(x.value);
1276 ETL_STATIC_INLINE(avx_simd_float)
log(avx_simd_float x) {
1277 return etl::log256_ps(x.value);
1280 #else //__INTEL_COMPILER 1287 ETL_STATIC_INLINE(avx_simd_double)
exp(avx_simd_double x) {
1288 return _mm256_exp_pd(x.value);
1294 ETL_STATIC_INLINE(avx_simd_float)
exp(avx_simd_float x) {
1295 return _mm256_exp_ps(x.value);
1303 ETL_STATIC_INLINE(avx_simd_double)
log(avx_simd_double x) {
1304 return _mm256_log_pd(x.value);
1310 ETL_STATIC_INLINE(avx_simd_float)
log(avx_simd_float x) {
1311 return _mm256_log_ps(x.value);
1314 #endif //__INTEL_COMPILER 1321 ETL_STATIC_INLINE(avx_simd_double)
min(avx_simd_double lhs, avx_simd_double rhs) {
1322 return _mm256_min_pd(lhs.value, rhs.value);
1328 ETL_STATIC_INLINE(avx_simd_float)
min(avx_simd_float lhs, avx_simd_float rhs) {
1329 return _mm256_min_ps(lhs.value, rhs.value);
1337 ETL_STATIC_INLINE(avx_simd_double)
max(avx_simd_double lhs, avx_simd_double rhs) {
1338 return _mm256_max_pd(lhs.value, rhs.value);
1344 ETL_STATIC_INLINE(avx_simd_float)
max(avx_simd_float lhs, avx_simd_float rhs) {
1345 return _mm256_max_ps(lhs.value, rhs.value);
1353 ETL_STATIC_INLINE(
float) hadd(avx_simd_float in) {
1354 const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
1355 const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
1356 const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
1357 return _mm_cvtss_f32(x32);
1365 ETL_STATIC_INLINE(
double) hadd(avx_simd_double in) {
1366 const __m256d t1 = _mm256_hadd_pd(in.value, _mm256_permute2f128_pd(in.value, in.value, 1));
1367 const __m256d t2 = _mm256_hadd_pd(t1, t1);
1368 return _mm_cvtsd_f64(_mm256_castpd256_pd128(t2));
1378 ETL_STATIC_INLINE(int8_t) hadd(avx_simd_byte in) {
1379 return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15] + in[16]
1380 + in[17] + in[18] + in[19] + in[20] + in[21] + in[22] + in[23] + in[24] + in[25] + in[26] + in[27] + in[28] + in[29] + in[30] + in[31];
1388 ETL_STATIC_INLINE(int16_t) hadd(avx_simd_short in) {
1389 return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15];
1397 ETL_STATIC_INLINE(int32_t) hadd(avx_simd_int in) {
1398 return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
1406 ETL_STATIC_INLINE(int64_t) hadd(avx_simd_long in) {
1407 return in[0] + in[1] + in[2] + in[3];
1415 template <
typename T>
1416 ETL_STATIC_INLINE(T)
1417 hadd(avx_simd_complex_float<T> in) {
1418 return in[0] + in[1] + in[2] + in[3];
1426 template <
typename T>
1427 ETL_STATIC_INLINE(T)
1428 hadd(avx_simd_complex_double<T> in) {
1429 return in[0] + in[1];
1438 ETL_OUT_INLINE(avx_simd_byte)
1439 avx_vec::zero<int8_t>() {
1440 return _mm256_setzero_si256();
1447 ETL_OUT_INLINE(avx_simd_short)
1448 avx_vec::zero<int16_t>() {
1449 return _mm256_setzero_si256();
1456 ETL_OUT_INLINE(avx_simd_int)
1457 avx_vec::zero<int32_t>() {
1458 return _mm256_setzero_si256();
1465 ETL_OUT_INLINE(avx_simd_long)
1466 avx_vec::zero<int64_t>() {
1467 return _mm256_setzero_si256();
1475 ETL_OUT_INLINE(avx_simd_float)
1476 avx_vec::zero<float>() {
1477 return _mm256_setzero_ps();
1484 ETL_OUT_INLINE(avx_simd_double)
1485 avx_vec::zero<double>() {
1486 return _mm256_setzero_pd();
// Zeroed pack for etl::complex<float>: all float components set to zero.
// NOTE(review): the return-type line of this specialization is not visible in this excerpt.
1494 avx_vec::zero<
etl::complex<
float>>() {
1495 return _mm256_setzero_ps();
// Zeroed pack for etl::complex<double>: all double components set to zero.
// NOTE(review): the return-type line of this specialization is not visible in this excerpt.
1503 avx_vec::zero<
etl::complex<
double>>() {
1504 return _mm256_setzero_pd();
1511 ETL_OUT_INLINE(avx_simd_complex_float<std::complex<float>>)
1512 avx_vec::zero<std::complex<
float>>() {
1513 return _mm256_setzero_ps();
1520 ETL_OUT_INLINE(avx_simd_complex_double<std::complex<double>>)
1521 avx_vec::zero<std::complex<
double>>() {
1522 return _mm256_setzero_pd();
auto sin(E &&value) -> detail::unary_helper< E, sin_unary_op >
Apply the sine function to each value of the given expression.
Definition: function_expression_builder.hpp:114
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
Complex number implementation.
Definition: complex.hpp:31
auto mul(A &&a, B &&b)
Multiply two matrices together.
Definition: gemm_expr.hpp:442
void minus([[maybe_unused]] size_t n, [[maybe_unused]] float alpha, [[maybe_unused]] float *A, [[maybe_unused]] size_t lda, [[maybe_unused]] float *B, [[maybe_unused]] size_t ldb)
Wrappers for single-precision egblas minus operation.
Definition: minus.hpp:43
auto sqrt(E &&value) -> detail::unary_helper< E, sqrt_unary_op >
Apply square root on each value of the given expression.
Definition: function_expression_builder.hpp:24
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
constexpr bool avx2_enabled
Indicates if AVX2 is available.
Definition: config.hpp:210
auto cos(E &&value) -> detail::unary_helper< E, cos_unary_op >
Apply the cosine function to each value of the given expression.
Definition: function_expression_builder.hpp:104
auto load(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:143
Root namespace for the ETL library.
Definition: adapter.hpp:15
void store(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:176
void stream(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once, using non-temporal store.
Definition: dyn_matrix_view.hpp:165
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
auto min(L &&lhs, R &&rhs)
Create an expression with the min value of lhs or rhs.
Definition: expression_builder.hpp:77
auto exp(E &&value) -> detail::unary_helper< E, exp_unary_op >
Apply exponential on each value of the given expression.
Definition: function_expression_builder.hpp:154
auto log(E &&value) -> detail::unary_helper< E, log_unary_op >
Apply logarithm (base e) on each value of the given expression.
Definition: function_expression_builder.hpp:64