// SSE intrinsic headers, two compatibility macros that map _mm_undefined_ps/_pd
// onto _mm_setzero_ps/_pd, and the 128-bit simd_pack aliases used by this file.
// Each alias fixes the vector mode to vector_mode_t::SSE3, names the element
// type and names the raw register type (__m128, __m128d or __m128i).
17 #include <immintrin.h> 18 #include <emmintrin.h> 19 #include <xmmintrin.h> 29 #define _mm_undefined_ps _mm_setzero_ps 30 #define _mm_undefined_pd _mm_setzero_pd 38 using sse_simd_float = simd_pack<vector_mode_t::SSE3, float, __m128>;
// Pack of double values stored in a __m128d.
43 using sse_simd_double = simd_pack<vector_mode_t::SSE3, double, __m128d>;
// Pack of complex values stored in a __m128.
// NOTE(review): T is not declared on the visible lines; presumably this is an
// alias template whose `template <typename T>` header is missing here - confirm.
49 using sse_simd_complex_float = simd_pack<vector_mode_t::SSE3, T, __m128>;
// Pack of complex values stored in a __m128d (same remark about T as above).
55 using sse_simd_complex_double = simd_pack<vector_mode_t::SSE3, T, __m128d>;
// Integer packs: all four share the __m128i register type and differ only by
// their element type.
60 using sse_simd_byte = simd_pack<vector_mode_t::SSE3, int8_t, __m128i>;
65 using sse_simd_short = simd_pack<vector_mode_t::SSE3, int16_t, __m128i>;
70 using sse_simd_int = simd_pack<vector_mode_t::SSE3, int32_t, __m128i>;
75 using sse_simd_long = simd_pack<vector_mode_t::SSE3, int64_t, __m128i>;
// Fallback traits: the type is reported as not vectorizable, with a size of 1,
// the natural alignment of T, and T itself as the "intrinsic" type.
// NOTE(review): the template header introducing T and the closing brace are not
// visible in this listing.
81 struct sse_intrinsic_traits {
82 static constexpr
bool vectorizable =
false;
83 static constexpr
size_t size = 1;
84 static constexpr
size_t alignment =
alignof(T);
86 using intrinsic_type = T;
// Traits for float: vectorizable, size 4, alignment 16, packed as sse_simd_float.
93 struct sse_intrinsic_traits<float> {
94 static constexpr
bool vectorizable =
true;
95 static constexpr
size_t size = 4;
96 static constexpr
size_t alignment = 16;
98 using intrinsic_type = sse_simd_float;
// Traits for double: vectorizable, size 2, alignment 16, packed as sse_simd_double.
105 struct sse_intrinsic_traits<double> {
106 static constexpr
bool vectorizable =
true;
107 static constexpr
size_t size = 2;
108 static constexpr
size_t alignment = 16;
110 using intrinsic_type = sse_simd_double;
// Traits for std::complex<float>: vectorizable, size 2, alignment 16.
117 struct sse_intrinsic_traits<std::complex<float>> {
118 static constexpr
bool vectorizable =
true;
119 static constexpr
size_t size = 2;
120 static constexpr
size_t alignment = 16;
122 using intrinsic_type = sse_simd_complex_float<std::complex<float>>;
// Traits for std::complex<double>: vectorizable, size 1, alignment 16.
129 struct sse_intrinsic_traits<std::complex<double>> {
130 static constexpr
bool vectorizable =
true;
131 static constexpr
size_t size = 1;
132 static constexpr
size_t alignment = 16;
134 using intrinsic_type = sse_simd_complex_double<std::complex<double>>;
// Traits for etl::complex<float>: same values as the std::complex<float> case,
// but the pack is tagged with the etl::complex element type.
141 struct sse_intrinsic_traits<
etl::complex<float>> {
142 static constexpr
bool vectorizable =
true;
143 static constexpr
size_t size = 2;
144 static constexpr
size_t alignment = 16;
146 using intrinsic_type = sse_simd_complex_float<etl::complex<float>>;
// Traits for etl::complex<double>: same values as the std::complex<double> case,
// but the pack is tagged with the etl::complex element type.
153 struct sse_intrinsic_traits<
etl::complex<double>> {
154 static constexpr
bool vectorizable =
true;
155 static constexpr
size_t size = 1;
156 static constexpr
size_t alignment = 16;
158 using intrinsic_type = sse_simd_complex_double<etl::complex<double>>;
// Traits for int8_t: vectorizable, size 16, alignment 16, packed as sse_simd_byte.
165 struct sse_intrinsic_traits<int8_t> {
166 static constexpr
bool vectorizable =
true;
167 static constexpr
size_t size = 16;
168 static constexpr
size_t alignment = 16;
170 using intrinsic_type = sse_simd_byte;
// Traits for int16_t: vectorizable, size 8, alignment 16, packed as sse_simd_short.
177 struct sse_intrinsic_traits<int16_t> {
178 static constexpr
bool vectorizable =
true;
179 static constexpr
size_t size = 8;
180 static constexpr
size_t alignment = 16;
182 using intrinsic_type = sse_simd_short;
// Traits for int32_t: vectorizable, size 4, alignment 16, packed as sse_simd_int.
189 struct sse_intrinsic_traits<int32_t> {
190 static constexpr
bool vectorizable =
true;
191 static constexpr
size_t size = 4;
192 static constexpr
size_t alignment = 16;
194 using intrinsic_type = sse_simd_int;
// Traits for int64_t: vectorizable, size 2, alignment 16, packed as sse_simd_long.
201 struct sse_intrinsic_traits<int64_t> {
202 static constexpr
bool vectorizable =
true;
203 static constexpr
size_t size = 2;
204 static constexpr
size_t alignment = 16;
206 using intrinsic_type = sse_simd_long;
// Shorthand for the SSE traits of T.
216 template <
typename T>
217 using traits = sse_intrinsic_traits<T>;
// The pack type selected for T, i.e. traits<T>::intrinsic_type.
222 template <
typename T>
223 using vec_type =
typename traits<T>::intrinsic_type;
// Debug helper: converts `value` into the local helper type `test` (which has a
// constructor taking a __m128d) and prints its two array elements to std::cout
// in the form "[a,b]", followed by std::endl.
// NOTE(review): the declaration of `test` and its `array` member is only partly
// visible in this listing - confirm against the original header.
230 template <
typename T>
231 static void debug_d(T value) {
235 test(__m128d vec) : vec(vec) {}
238 test u_value = value;
239 std::cout <<
"[" << u_value.array[0] <<
"," << u_value.array[1] <<
"]" << std::endl;
// Debug helper: converts `value` into the local helper type `test` (which has a
// constructor taking a __m128) and prints its four array elements to std::cout
// in the form "[a,b,c,d]", followed by std::endl.
// NOTE(review): the declaration of `test` and its `array` member is only partly
// visible in this listing - confirm against the original header.
245 template <
typename T>
246 static void debug_s(T value) {
250 test(__m128 vec) : vec(vec) {}
253 test u_value = value;
254 std::cout <<
"[" << u_value.array[0] <<
"," << u_value.array[1] <<
"," << u_value.array[2] <<
"," << u_value.array[3] <<
"]" << std::endl;
262 template <
typename T>
263 static std::string debug_d(T) {
270 template <
typename T>
271 static std::string debug_s(T) {
// storeu overloads: write the 128-bit register held in value.value to `memory`
// with the unaligned store intrinsics (_mm_storeu_si128 / _mm_storeu_ps /
// _mm_storeu_pd), so `memory` does not need 16-byte alignment.

// int8_t: pointer is reinterpreted as __m128i* for the integer store.
281 ETL_STATIC_INLINE(
void)
storeu(int8_t* memory, sse_simd_byte value) {
282 _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int16_t: same integer store as above.
289 ETL_STATIC_INLINE(
void)
storeu(int16_t* memory, sse_simd_short value) {
290 _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int32_t: same integer store as above.
297 ETL_STATIC_INLINE(
void)
storeu(int32_t* memory, sse_simd_int value) {
298 _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int64_t: same integer store as above.
305 ETL_STATIC_INLINE(
void)
storeu(int64_t* memory, sse_simd_long value) {
306 _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
// float: single-precision store.
313 ETL_STATIC_INLINE(
void)
storeu(
float* memory, sse_simd_float value) {
314 _mm_storeu_ps(memory, value.value);
// double: double-precision store.
321 ETL_STATIC_INLINE(
void)
storeu(
double* memory, sse_simd_double value) {
322 _mm_storeu_pd(memory, value.value);
// std::complex<float>: the complex array is written through a float* view.
329 ETL_STATIC_INLINE(
void)
storeu(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
330 _mm_storeu_ps(reinterpret_cast<float*>(memory), value.value);
// std::complex<double>: the complex array is written through a double* view.
337 ETL_STATIC_INLINE(
void)
storeu(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
338 _mm_storeu_pd(reinterpret_cast<double*>(memory), value.value);
346 _mm_storeu_ps(reinterpret_cast<float*>(memory), value.value);
354 _mm_storeu_pd(reinterpret_cast<double*>(memory), value.value);
// store overloads: write the 128-bit register held in value.value to `memory`
// with the aligned store intrinsics (_mm_store_si128 / _mm_store_ps /
// _mm_store_pd). These intrinsics require `memory` to be 16-byte aligned.

// int8_t: pointer is reinterpreted as __m128i* for the integer store.
361 ETL_STATIC_INLINE(
void)
store(int8_t* memory, sse_simd_byte value) {
362 _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int16_t: same integer store as above.
369 ETL_STATIC_INLINE(
void)
store(int16_t* memory, sse_simd_short value) {
370 _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int32_t: same integer store as above.
377 ETL_STATIC_INLINE(
void)
store(int32_t* memory, sse_simd_int value) {
378 _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int64_t: same integer store as above.
385 ETL_STATIC_INLINE(
void)
store(int64_t* memory, sse_simd_long value) {
386 _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
// float: single-precision store.
393 ETL_STATIC_INLINE(
void)
store(
float* memory, sse_simd_float value) {
394 _mm_store_ps(memory, value.value);
// double: double-precision store.
401 ETL_STATIC_INLINE(
void)
store(
double* memory, sse_simd_double value) {
402 _mm_store_pd(memory, value.value);
// std::complex<float>: the complex array is written through a float* view.
409 ETL_STATIC_INLINE(
void)
store(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
410 _mm_store_ps(reinterpret_cast<float*>(memory), value.value);
// std::complex<double>: the complex array is written through a double* view.
417 ETL_STATIC_INLINE(
void)
store(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
418 _mm_store_pd(reinterpret_cast<double*>(memory), value.value);
426 _mm_store_ps(reinterpret_cast<float*>(memory), value.value);
434 _mm_store_pd(reinterpret_cast<double*>(memory), value.value);
// stream overloads: write the 128-bit register held in value.value to `memory`
// with the non-temporal store intrinsics (_mm_stream_si128 / _mm_stream_ps /
// _mm_stream_pd). These intrinsics require `memory` to be 16-byte aligned.

// int8_t: pointer is reinterpreted as __m128i* for the integer store.
441 ETL_STATIC_INLINE(
void)
stream(int8_t* memory, sse_simd_byte value) {
442 _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int16_t: same integer store as above.
449 ETL_STATIC_INLINE(
void)
stream(int16_t* memory, sse_simd_short value) {
450 _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int32_t: same integer store as above.
457 ETL_STATIC_INLINE(
void)
stream(int32_t* memory, sse_simd_int value) {
458 _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
// int64_t: same integer store as above.
465 ETL_STATIC_INLINE(
void)
stream(int64_t* memory, sse_simd_long value) {
466 _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
// float: single-precision store.
473 ETL_STATIC_INLINE(
void)
stream(
float* memory, sse_simd_float value) {
474 _mm_stream_ps(memory, value.value);
// double: double-precision store.
481 ETL_STATIC_INLINE(
void)
stream(
double* memory, sse_simd_double value) {
482 _mm_stream_pd(memory, value.value);
// std::complex<float>: the complex array is written through a float* view.
489 ETL_STATIC_INLINE(
void)
stream(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
490 _mm_stream_ps(reinterpret_cast<float*>(memory), value.value);
// std::complex<double>: the complex array is written through a double* view.
497 ETL_STATIC_INLINE(
void)
stream(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
498 _mm_stream_pd(reinterpret_cast<double*>(memory), value.value);
506 _mm_stream_ps(reinterpret_cast<float*>(memory), value.value);
514 _mm_stream_pd(reinterpret_cast<double*>(memory), value.value);
520 template <
typename T>
521 ETL_TMP_INLINE(
typename sse_intrinsic_traits<T>::intrinsic_type)
// load overloads: read one 128-bit register from `memory` with the aligned load
// intrinsics (_mm_load_si128 / _mm_load_ps / _mm_load_pd) and return it as the
// matching pack type. These intrinsics require `memory` to be 16-byte aligned.

// int8_t: pointer is reinterpreted as const __m128i* for the integer load.
527 ETL_STATIC_INLINE(sse_simd_byte)
load(
const int8_t* memory) {
528 return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
// int16_t: same integer load as above.
534 ETL_STATIC_INLINE(sse_simd_short)
load(
const int16_t* memory) {
535 return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
// int32_t: same integer load as above.
541 ETL_STATIC_INLINE(sse_simd_int)
load(
const int32_t* memory) {
542 return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
// int64_t: same integer load as above.
548 ETL_STATIC_INLINE(sse_simd_long)
load(
const int64_t* memory) {
549 return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
// float: single-precision load.
555 ETL_STATIC_INLINE(sse_simd_float)
load(
const float* memory) {
556 return _mm_load_ps(memory);
// double: double-precision load.
562 ETL_STATIC_INLINE(sse_simd_double)
load(
const double* memory) {
563 return _mm_load_pd(memory);
// std::complex<float>: the complex array is read through a const float* view.
569 ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>)
load(const std::complex<
float>* memory) {
570 return _mm_load_ps(reinterpret_cast<const float*>(memory));
// std::complex<double>: the complex array is read through a const double* view.
576 ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>)
load(const std::complex<
double>* memory) {
577 return _mm_load_pd(reinterpret_cast<const double*>(memory));
584 return _mm_load_ps(reinterpret_cast<const float*>(memory));
591 return _mm_load_pd(reinterpret_cast<const double*>(memory));
// loadu overloads: read one 128-bit register from `memory` with the unaligned
// load intrinsics (_mm_loadu_si128 / _mm_loadu_ps / _mm_loadu_pd) and return it
// as the matching pack type, so `memory` does not need 16-byte alignment.

// int8_t: pointer is reinterpreted as const __m128i* for the integer load.
597 ETL_STATIC_INLINE(sse_simd_byte)
loadu(
const int8_t* memory) {
598 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
// int16_t: same integer load as above.
604 ETL_STATIC_INLINE(sse_simd_short)
loadu(
const int16_t* memory) {
605 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
// int32_t: same integer load as above.
611 ETL_STATIC_INLINE(sse_simd_int)
loadu(
const int32_t* memory) {
612 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
// int64_t: same integer load as above.
618 ETL_STATIC_INLINE(sse_simd_long)
loadu(
const int64_t* memory) {
619 return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
// float: single-precision load.
625 ETL_STATIC_INLINE(sse_simd_float)
loadu(
const float* memory) {
626 return _mm_loadu_ps(memory);
// double: double-precision load.
632 ETL_STATIC_INLINE(sse_simd_double)
loadu(
const double* memory) {
633 return _mm_loadu_pd(memory);
// std::complex<float>: the complex array is read through a const float* view.
639 ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>)
loadu(const std::complex<
float>* memory) {
640 return _mm_loadu_ps(reinterpret_cast<const float*>(memory));
// std::complex<double>: the complex array is read through a const double* view.
646 ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>)
loadu(const std::complex<
double>* memory) {
647 return _mm_loadu_pd(reinterpret_cast<const double*>(memory));
654 return _mm_loadu_ps(reinterpret_cast<const float*>(memory));
661 return _mm_loadu_pd(reinterpret_cast<const double*>(memory));
// set overloads for scalar types: return a pack in which every element equals
// `value`, using the _mm_set1_* broadcast intrinsic of the matching width.

// int8_t: broadcast to all 8-bit elements.
667 ETL_STATIC_INLINE(sse_simd_byte)
set(int8_t value) {
668 return _mm_set1_epi8(value);
// int16_t: broadcast to all 16-bit elements.
674 ETL_STATIC_INLINE(sse_simd_short)
set(int16_t value) {
675 return _mm_set1_epi16(value);
// int32_t: broadcast to all 32-bit elements.
681 ETL_STATIC_INLINE(sse_simd_int)
set(int32_t value) {
682 return _mm_set1_epi32(value);
// int64_t: broadcast to both 64-bit elements.
688 ETL_STATIC_INLINE(sse_simd_long)
set(int64_t value) {
689 return _mm_set1_epi64x(value);
// double: broadcast to both double elements.
695 ETL_STATIC_INLINE(sse_simd_double)
set(
double value) {
696 return _mm_set1_pd(value);
// float: broadcast to all four float elements.
702 ETL_STATIC_INLINE(sse_simd_float)
set(
float value) {
703 return _mm_set1_ps(value);
// round_up (float): rounds every element of x toward positive infinity via
// _mm_round_ps; _MM_FROUND_NO_EXC suppresses floating-point exceptions.
709 ETL_STATIC_INLINE(sse_simd_float) round_up(sse_simd_float x) {
710 return _mm_round_ps(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
// round_up (double): same rounding mode, applied with _mm_round_pd.
716 ETL_STATIC_INLINE(sse_simd_double) round_up(sse_simd_double x) {
717 return _mm_round_pd(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
// set (std::complex<float>): builds a two-element temporary array holding
// `value` twice.
// NOTE(review): the statement that turns `tmp` into the returned pack is not
// visible in this listing - confirm against the original header.
723 ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>) set(std::complex<
float> value) {
724 std::complex<float> tmp[]{value, value};
// set (std::complex<double>): builds a one-element temporary array holding
// `value`; the return statement is likewise not visible here.
731 ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>) set(std::complex<
double> value) {
732 std::complex<double> tmp[]{value};
// add overloads: element-wise lhs + rhs, using the add intrinsic that matches
// the element width of the pack.

// 8-bit integer elements.
757 ETL_STATIC_INLINE(sse_simd_byte) add(sse_simd_byte lhs, sse_simd_byte rhs) {
758 return _mm_add_epi8(lhs.value, rhs.value);
// 16-bit integer elements.
764 ETL_STATIC_INLINE(sse_simd_short) add(sse_simd_short lhs, sse_simd_short rhs) {
765 return _mm_add_epi16(lhs.value, rhs.value);
// 32-bit integer elements.
771 ETL_STATIC_INLINE(sse_simd_int) add(sse_simd_int lhs, sse_simd_int rhs) {
772 return _mm_add_epi32(lhs.value, rhs.value);
// 64-bit integer elements.
778 ETL_STATIC_INLINE(sse_simd_long) add(sse_simd_long lhs, sse_simd_long rhs) {
779 return _mm_add_epi64(lhs.value, rhs.value);
// Single-precision elements.
785 ETL_STATIC_INLINE(sse_simd_float) add(sse_simd_float lhs, sse_simd_float rhs) {
786 return _mm_add_ps(lhs.value, rhs.value);
// Double-precision elements.
792 ETL_STATIC_INLINE(sse_simd_double) add(sse_simd_double lhs, sse_simd_double rhs) {
793 return _mm_add_pd(lhs.value, rhs.value);
// Complex single precision: the same float addition is applied to every
// element of the register, for any element tag T.
799 template <
typename T>
800 ETL_STATIC_INLINE(sse_simd_complex_float<T>)
801 add(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
802 return _mm_add_ps(lhs.value, rhs.value);
// Complex double precision: the same double addition is applied to both
// elements of the register.
808 template <
typename T>
809 ETL_STATIC_INLINE(sse_simd_complex_double<T>)
810 add(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
811 return _mm_add_pd(lhs.value, rhs.value);
// sub overloads: element-wise lhs - rhs, using the subtract intrinsic that
// matches the element width of the pack.

// 8-bit integer elements.
819 ETL_STATIC_INLINE(sse_simd_byte) sub(sse_simd_byte lhs, sse_simd_byte rhs) {
820 return _mm_sub_epi8(lhs.value, rhs.value);
// 16-bit integer elements.
826 ETL_STATIC_INLINE(sse_simd_short) sub(sse_simd_short lhs, sse_simd_short rhs) {
827 return _mm_sub_epi16(lhs.value, rhs.value);
// 32-bit integer elements.
833 ETL_STATIC_INLINE(sse_simd_int) sub(sse_simd_int lhs, sse_simd_int rhs) {
834 return _mm_sub_epi32(lhs.value, rhs.value);
// 64-bit integer elements.
840 ETL_STATIC_INLINE(sse_simd_long) sub(sse_simd_long lhs, sse_simd_long rhs) {
841 return _mm_sub_epi64(lhs.value, rhs.value);
// Single-precision elements.
847 ETL_STATIC_INLINE(sse_simd_float) sub(sse_simd_float lhs, sse_simd_float rhs) {
848 return _mm_sub_ps(lhs.value, rhs.value);
// Double-precision elements.
854 ETL_STATIC_INLINE(sse_simd_double) sub(sse_simd_double lhs, sse_simd_double rhs) {
855 return _mm_sub_pd(lhs.value, rhs.value);
// Complex single precision: the same float subtraction is applied to every
// element of the register, for any element tag T.
861 template <
typename T>
862 ETL_STATIC_INLINE(sse_simd_complex_float<T>)
863 sub(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
864 return _mm_sub_ps(lhs.value, rhs.value);
// Complex double precision: the same double subtraction is applied to both
// elements of the register.
870 template <
typename T>
871 ETL_STATIC_INLINE(sse_simd_complex_double<T>)
872 sub(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
873 return _mm_sub_pd(lhs.value, rhs.value);
// sqrt (float): element-wise square root of x via _mm_sqrt_ps.
882 ETL_STATIC_INLINE(sse_simd_float)
sqrt(sse_simd_float x) {
883 return _mm_sqrt_ps(x.value);
// sqrt (double): element-wise square root of x via _mm_sqrt_pd.
890 ETL_STATIC_INLINE(sse_simd_double)
sqrt(sse_simd_double x) {
891 return _mm_sqrt_pd(x.value);
// minus (float): negates every element by XOR-ing it with -0.f, whose only set
// bit is the sign bit, so only the sign of each element is flipped.
902 ETL_STATIC_INLINE(sse_simd_float)
minus(sse_simd_float x) {
903 return _mm_xor_ps(x.value, _mm_set1_ps(-0.f));
// minus (double): same sign-bit flip, using a broadcast -0. and _mm_xor_pd.
910 ETL_STATIC_INLINE(sse_simd_double)
minus(sse_simd_double x) {
911 return _mm_xor_pd(x.value, _mm_set1_pd(-0.));
919 ETL_STATIC_INLINE(sse_simd_byte)
mul(sse_simd_byte lhs, sse_simd_byte rhs) {
920 __m128i even = _mm_mullo_epi16(lhs.value, rhs.value);
921 __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(lhs.value, 8), _mm_srli_epi16(rhs.value, 8));
922 return _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_srli_epi16(_mm_slli_epi16(even, 8), 8));
// mul (int16_t): element-wise product, keeping the low 16 bits of each result
// (_mm_mullo_epi16).
928 ETL_STATIC_INLINE(sse_simd_short)
mul(sse_simd_short lhs, sse_simd_short rhs) {
929 return _mm_mullo_epi16(lhs.value, rhs.value);
// mul (int32_t): element-wise product, keeping the low 32 bits of each result
// (_mm_mullo_epi32).
935 ETL_STATIC_INLINE(sse_simd_int)
mul(sse_simd_int lhs, sse_simd_int rhs) {
936 return _mm_mullo_epi32(lhs.value, rhs.value);
// mul (int64_t): no intrinsic is used here; each of the two elements is read
// through operator[], multiplied as a scalar, written to `result`, and the pack
// is rebuilt with loadu.
// NOTE(review): the declaration of `result` is not visible in this listing
// (presumably a two-element int64_t array) - confirm against the original header.
942 ETL_STATIC_INLINE(sse_simd_long)
mul(sse_simd_long lhs, sse_simd_long rhs) {
944 result[0] = lhs[0] * rhs[0];
945 result[1] = lhs[1] * rhs[1];
946 return loadu(&result[0]);
// mul (float): element-wise product via _mm_mul_ps.
952 ETL_STATIC_INLINE(sse_simd_float)
mul(sse_simd_float lhs, sse_simd_float rhs) {
953 return _mm_mul_ps(lhs.value, rhs.value);
// mul (double): element-wise product via _mm_mul_pd.
959 ETL_STATIC_INLINE(sse_simd_double)
mul(sse_simd_double lhs, sse_simd_double rhs) {
960 return _mm_mul_pd(lhs.value, rhs.value);
966 template <
typename T>
967 ETL_STATIC_INLINE(sse_simd_complex_float<T>)
968 mul(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
973 __m128 ymm1 = _mm_moveldup_ps(rhs.value);
976 __m128 ymm2 = _mm_mul_ps(lhs.value, ymm1);
979 __m128 ymm3 = _mm_shuffle_ps(lhs.value, lhs.value, _MM_SHUFFLE(2, 3, 0, 1));
982 ymm1 = _mm_movehdup_ps(rhs.value);
985 __m128 ymm4 = _mm_mul_ps(ymm3, ymm1);
988 return _mm_addsub_ps(ymm2, ymm4);
994 template <
typename T>
995 ETL_STATIC_INLINE(sse_simd_complex_double<T>)
996 mul(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
1001 __m128d ymm1 = _mm_movedup_pd(rhs.value);
1004 __m128d ymm2 = _mm_mul_pd(lhs.value, ymm1);
1007 ymm1 = _mm_shuffle_pd(lhs.value, lhs.value, _MM_SHUFFLE2(0, 1));
1010 __m128d ymm3 = _mm_shuffle_pd(rhs.value, rhs.value, _MM_SHUFFLE2(1, 1));
1013 __m128d ymm4 = _mm_mul_pd(ymm1, ymm3);
1016 return _mm_addsub_pd(ymm2, ymm4);
// fmadd overloads for integer packs: compute (a * b) + c by composing the mul
// and add overloads for the same pack type; no fused instruction is used.

// 8-bit integer elements.
1024 ETL_STATIC_INLINE(sse_simd_byte) fmadd(sse_simd_byte a, sse_simd_byte b, sse_simd_byte c) {
1025 return add(
mul(a, b), c);
// 16-bit integer elements.
1031 ETL_STATIC_INLINE(sse_simd_short) fmadd(sse_simd_short a, sse_simd_short b, sse_simd_short c) {
1032 return add(
mul(a, b), c);
// 32-bit integer elements.
1038 ETL_STATIC_INLINE(sse_simd_int) fmadd(sse_simd_int a, sse_simd_int b, sse_simd_int c) {
1039 return add(
mul(a, b), c);
// 64-bit integer elements.
1045 ETL_STATIC_INLINE(sse_simd_long) fmadd(sse_simd_long a, sse_simd_long b, sse_simd_long c) {
1046 return add(
mul(a, b), c);
// fmadd (float): computes (a * b) + c, either with the fused _mm_fmadd_ps
// intrinsic or by composing mul and add.
// NOTE(review): the embedded listing numbers skip 1053, 1055 and 1057;
// presumably preprocessor lines selecting between the two returns are missing
// from this listing - confirm against the original header.
1052 ETL_STATIC_INLINE(sse_simd_float) fmadd(sse_simd_float a, sse_simd_float b, sse_simd_float c) {
1054 return _mm_fmadd_ps(a.value, b.value, c.value);
1056 return add(
mul(a, b), c);
// fmadd (double): same structure with _mm_fmadd_pd; the same remark about
// missing lines (1064, 1066, 1068) applies.
1063 ETL_STATIC_INLINE(sse_simd_double) fmadd(sse_simd_double a, sse_simd_double b, sse_simd_double c) {
1065 return _mm_fmadd_pd(a.value, b.value, c.value);
1067 return add(
mul(a, b), c);
// fmadd (complex float): computes (a * b) + c by composing the complex mul and
// add overloads for the same pack type.
1074 template <
typename T>
1075 ETL_STATIC_INLINE(sse_simd_complex_float<T>)
1076 fmadd(sse_simd_complex_float<T> a, sse_simd_complex_float<T> b, sse_simd_complex_float<T> c) {
1077 return add(
mul(a, b), c);
// fmadd (complex double): same composition for the double-precision pack.
1083 template <
typename T>
1084 ETL_STATIC_INLINE(sse_simd_complex_double<T>)
1085 fmadd(sse_simd_complex_double<T> a, sse_simd_complex_double<T> b, sse_simd_complex_double<T> c) {
1086 return add(
mul(a, b), c);
// div (float): element-wise lhs / rhs via _mm_div_ps.
1094 ETL_STATIC_INLINE(sse_simd_float) div(sse_simd_float lhs, sse_simd_float rhs) {
1095 return _mm_div_ps(lhs.value, rhs.value);
// div (double): element-wise lhs / rhs via _mm_div_pd.
1101 ETL_STATIC_INLINE(sse_simd_double) div(sse_simd_double lhs, sse_simd_double rhs) {
1102 return _mm_div_pd(lhs.value, rhs.value);
1108 template <
typename T>
1109 ETL_STATIC_INLINE(sse_simd_complex_float<T>)
1110 div(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
1115 __m128 ymm0 = _mm_moveldup_ps(rhs.value);
1118 __m128 ymm1 = _mm_movehdup_ps(rhs.value);
1121 __m128 ymm2 = _mm_mul_ps(lhs.value, ymm0);
1124 __m128 ymm3 = _mm_shuffle_ps(lhs.value, lhs.value, _MM_SHUFFLE(2, 3, 0, 1));
1127 __m128 ymm4 = _mm_mul_ps(ymm3, ymm1);
1130 ymm3 = _mm_sub_ps(_mm_set1_ps(0.0), ymm4);
1131 ymm4 = _mm_addsub_ps(ymm2, ymm3);
1134 ymm2 = _mm_mul_ps(ymm0, ymm0);
1137 ymm3 = _mm_mul_ps(ymm1, ymm1);
1140 ymm0 = _mm_add_ps(ymm2, ymm3);
1143 return _mm_div_ps(ymm4, ymm0);
1149 template <
typename T>
1150 ETL_STATIC_INLINE(sse_simd_complex_double<T>)
1151 div(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
1156 __m128d ymm0 = _mm_movedup_pd(rhs.value);
1159 __m128d ymm1 = _mm_shuffle_pd(rhs.value, rhs.value, _MM_SHUFFLE2(1, 1));
1162 __m128d ymm2 = _mm_mul_pd(lhs.value, ymm0);
1165 __m128d ymm3 = _mm_shuffle_pd(lhs.value, lhs.value, _MM_SHUFFLE2(0, 1));
1168 __m128d ymm4 = _mm_mul_pd(ymm3, ymm1);
1171 ymm3 = _mm_sub_pd(_mm_set1_pd(0.0), ymm4);
1172 ymm4 = _mm_addsub_pd(ymm2, ymm3);
1175 ymm2 = _mm_mul_pd(ymm0, ymm0);
1178 ymm3 = _mm_mul_pd(ymm1, ymm1);
1181 ymm0 = _mm_add_pd(ymm2, ymm3);
1184 return _mm_div_pd(ymm4, ymm0);
// cos (float): forwards the raw register to etl::cos_ps and returns its result.
1192 ETL_STATIC_INLINE(sse_simd_float)
cos(sse_simd_float x) {
1193 return etl::cos_ps(x.value);
// sin (float): forwards the raw register to etl::sin_ps and returns its result.
1201 ETL_STATIC_INLINE(sse_simd_float)
sin(sse_simd_float x) {
1202 return etl::sin_ps(x.value);
// exp / log overloads, selected by compiler.
// When __INTEL_COMPILER is not defined, the etl:: helper functions are used
// (exp_ps, exp_pd, log_ps); no log overload for sse_simd_double is visible in
// this branch.
1208 #ifndef __INTEL_COMPILER 1213 ETL_STATIC_INLINE(sse_simd_float)
exp(sse_simd_float x) {
1214 return etl::exp_ps(x.value);
// exp (double) through etl::exp_pd.
1220 ETL_STATIC_INLINE(sse_simd_double)
exp(sse_simd_double x) {
1221 return etl::exp_pd(x.value);
// log (float) through etl::log_ps.
1227 ETL_STATIC_INLINE(sse_simd_float)
log(sse_simd_float x) {
1228 return etl::log_ps(x.value);
// When __INTEL_COMPILER is defined, the _mm_exp_* / _mm_log_* intrinsics are
// called directly, for both float and double packs.
1231 #else //__INTEL_COMPILER 1238 ETL_STATIC_INLINE(sse_simd_double)
exp(sse_simd_double x) {
1239 return _mm_exp_pd(x.value);
// exp (float) through _mm_exp_ps.
1245 ETL_STATIC_INLINE(sse_simd_float)
exp(sse_simd_float x) {
1246 return _mm_exp_ps(x.value);
// log (double) through _mm_log_pd.
1254 ETL_STATIC_INLINE(sse_simd_double)
log(sse_simd_double x) {
1255 return _mm_log_pd(x.value);
// log (float) through _mm_log_ps.
1261 ETL_STATIC_INLINE(sse_simd_float)
log(sse_simd_float x) {
1262 return _mm_log_ps(x.value);
// End of the compiler-specific exp/log section, followed by the min / max
// overloads. min (double): element-wise minimum of lhs and rhs via _mm_min_pd.
1265 #endif //__INTEL_COMPILER 1272 ETL_STATIC_INLINE(sse_simd_double)
min(sse_simd_double lhs, sse_simd_double rhs) {
1273 return _mm_min_pd(lhs.value, rhs.value);
// min (float): element-wise minimum via _mm_min_ps.
1279 ETL_STATIC_INLINE(sse_simd_float)
min(sse_simd_float lhs, sse_simd_float rhs) {
1280 return _mm_min_ps(lhs.value, rhs.value);
// max (double): element-wise maximum via _mm_max_pd.
1288 ETL_STATIC_INLINE(sse_simd_double)
max(sse_simd_double lhs, sse_simd_double rhs) {
1289 return _mm_max_pd(lhs.value, rhs.value);
// max (float): element-wise maximum via _mm_max_ps.
1295 ETL_STATIC_INLINE(sse_simd_float)
max(sse_simd_float lhs, sse_simd_float rhs) {
1296 return _mm_max_ps(lhs.value, rhs.value);
1304 ETL_STATIC_INLINE(
float) hadd(sse_simd_float in) {
1305 __m128 shuf = _mm_movehdup_ps(in.value);
1306 __m128 sums = _mm_add_ps(in.value, shuf);
1307 shuf = _mm_movehl_ps(shuf, sums);
1308 sums = _mm_add_ss(sums, shuf);
1309 return _mm_cvtss_f32(sums);
1317 ETL_STATIC_INLINE(
double) hadd(sse_simd_double in) {
1318 __m128 undef = _mm_undefined_ps();
1319 __m128 shuftmp = _mm_movehl_ps(undef, _mm_castpd_ps(in.value));
1320 __m128d shuf = _mm_castps_pd(shuftmp);
1321 return _mm_cvtsd_f64(_mm_add_sd(in.value, shuf));
// hadd overloads for integer packs: no intrinsic is used; every element is read
// through operator[] and the elements are added as scalars.

// int8_t: sum of the 16 elements.
// NOTE(review): the sum is converted to the int8_t return type, so totals
// outside the int8_t range cannot be represented in the result.
1331 ETL_STATIC_INLINE(int8_t) hadd(sse_simd_byte in) {
1332 return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15];
// int16_t: sum of the 8 elements (same remark about the return type width).
1340 ETL_STATIC_INLINE(int16_t) hadd(sse_simd_short in) {
1341 return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
// int32_t: sum of the 4 elements.
1349 ETL_STATIC_INLINE(int32_t) hadd(sse_simd_int in) {
1350 return in[0] + in[1] + in[2] + in[3];
// int64_t: sum of the 2 elements.
1358 ETL_STATIC_INLINE(int64_t) hadd(sse_simd_long in) {
1359 return in[0] + in[1];
// hadd (complex float): returns the sum of the two elements read through
// operator[], as a value of the element type T.
1367 template <
typename T>
1368 ETL_STATIC_INLINE(T)
1369 hadd(sse_simd_complex_float<T> in) {
1370 return in[0] + in[1];
1378 template <
typename T>
1379 ETL_STATIC_INLINE(T)
1380 hadd(sse_simd_complex_double<T> in) {
// sse_vec::zero<T>() definitions: each one returns an all-zero register of the
// kind that backs the pack for T (_mm_setzero_si128 for the integer packs,
// _mm_setzero_ps for float-based packs, _mm_setzero_pd for double-based packs).

// int8_t
1389 ETL_OUT_INLINE(sse_simd_byte)
1390 sse_vec::zero<int8_t>() {
1391 return _mm_setzero_si128();
// int16_t
1398 ETL_OUT_INLINE(sse_simd_short)
1399 sse_vec::zero<int16_t>() {
1400 return _mm_setzero_si128();
// int32_t
1407 ETL_OUT_INLINE(sse_simd_int)
1408 sse_vec::zero<int32_t>() {
1409 return _mm_setzero_si128();
// int64_t
1416 ETL_OUT_INLINE(sse_simd_long)
1417 sse_vec::zero<int64_t>() {
1418 return _mm_setzero_si128();
// float
1425 ETL_OUT_INLINE(sse_simd_float)
1426 sse_vec::zero<float>() {
1427 return _mm_setzero_ps();
// double
1434 ETL_OUT_INLINE(sse_simd_double)
1435 sse_vec::zero<double>() {
1436 return _mm_setzero_pd();
// etl::complex<float>
// NOTE(review): the return-type line of this definition and of the next one is
// not visible in this listing - confirm against the original header.
1444 sse_vec::zero<
etl::complex<
float>>() {
1445 return _mm_setzero_ps();
// etl::complex<double>
1453 sse_vec::zero<
etl::complex<
double>>() {
1454 return _mm_setzero_pd();
// std::complex<float>
1461 ETL_OUT_INLINE(sse_simd_complex_float<std::complex<float>>)
1462 sse_vec::zero<std::complex<
float>>() {
1463 return _mm_setzero_ps();
// std::complex<double>
1470 ETL_OUT_INLINE(sse_simd_complex_double<std::complex<double>>)
1471 sse_vec::zero<std::complex<
double>>() {
1472 return _mm_setzero_pd();
auto sin(E &&value) -> detail::unary_helper< E, sin_unary_op >
Apply sine to each value of the given expression.
Definition: function_expression_builder.hpp:114
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
Complex number implementation.
Definition: complex.hpp:31
auto mul(A &&a, B &&b)
Multiply two matrices together.
Definition: gemm_expr.hpp:442
void minus([[maybe_unused]] size_t n, [[maybe_unused]] float alpha, [[maybe_unused]] float *A, [[maybe_unused]] size_t lda, [[maybe_unused]] float *B, [[maybe_unused]] size_t ldb)
Wrappers for single-precision egblas minus operation.
Definition: minus.hpp:43
auto sqrt(E &&value) -> detail::unary_helper< E, sqrt_unary_op >
Apply square root on each value of the given expression.
Definition: function_expression_builder.hpp:24
Contains SSE functions for exp and log.
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
auto cos(E &&value) -> detail::unary_helper< E, cos_unary_op >
Apply cosine to each value of the given expression.
Definition: function_expression_builder.hpp:104
auto load(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:143
Root namespace for the ETL library.
Definition: adapter.hpp:15
void store(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:176
void stream(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once, using non-temporal store.
Definition: dyn_matrix_view.hpp:165
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
auto min(L &&lhs, R &&rhs)
Create an expression with the min value of lhs or rhs.
Definition: expression_builder.hpp:77
auto exp(E &&value) -> detail::unary_helper< E, exp_unary_op >
Apply exponential on each value of the given expression.
Definition: function_expression_builder.hpp:154
auto log(E &&value) -> detail::unary_helper< E, log_unary_op >
Apply logarithm (base e) on each value of the given expression.
Definition: function_expression_builder.hpp:64