Expression Templates Library (ETL)
sse_vectorization.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 #ifdef __SSE3__
16 
17 #include <immintrin.h>
18 #include <emmintrin.h>
19 #include <xmmintrin.h>
20 
21 #include "etl/inline.hpp"
22 #include "etl/sse_exp.hpp"
23 
24 #ifdef VECT_DEBUG
25 #include <iostream>
26 #endif
27 
// On clang, the "undefined" vector intrinsics are mapped onto the zeroing ones.
#if defined(__clang__)
#define _mm_undefined_pd _mm_setzero_pd
#define _mm_undefined_ps _mm_setzero_ps
#endif
32 
33 namespace etl {
34 
38 using sse_simd_float = simd_pack<vector_mode_t::SSE3, float, __m128>;
39 
43 using sse_simd_double = simd_pack<vector_mode_t::SSE3, double, __m128d>;
44 
48 template <typename T>
49 using sse_simd_complex_float = simd_pack<vector_mode_t::SSE3, T, __m128>;
50 
54 template <typename T>
55 using sse_simd_complex_double = simd_pack<vector_mode_t::SSE3, T, __m128d>;
56 
60 using sse_simd_byte = simd_pack<vector_mode_t::SSE3, int8_t, __m128i>;
61 
65 using sse_simd_short = simd_pack<vector_mode_t::SSE3, int16_t, __m128i>;
66 
70 using sse_simd_int = simd_pack<vector_mode_t::SSE3, int32_t, __m128i>;
71 
75 using sse_simd_long = simd_pack<vector_mode_t::SSE3, int64_t, __m128i>;
76 
/*!
 * \brief Fallback SSE traits for types that have no dedicated specialization.
 *
 * Such a type is reported as not vectorizable: a "vector" holds a single
 * element, uses the natural alignment of T and is represented by T itself.
 *
 * \tparam T The element type
 */
template <typename T>
struct sse_intrinsic_traits {
    using intrinsic_type = T; ///< The type standing in for a vector of T

    static constexpr size_t alignment  = alignof(T); ///< Required alignment, in bytes
    static constexpr size_t size       = 1;          ///< Number of elements per vector
    static constexpr bool vectorizable = false;      ///< T cannot be vectorized with SSE
};
88 
92 template <>
93 struct sse_intrinsic_traits<float> {
94  static constexpr bool vectorizable = true;
95  static constexpr size_t size = 4;
96  static constexpr size_t alignment = 16;
97 
98  using intrinsic_type = sse_simd_float;
99 };
100 
104 template <>
105 struct sse_intrinsic_traits<double> {
106  static constexpr bool vectorizable = true;
107  static constexpr size_t size = 2;
108  static constexpr size_t alignment = 16;
109 
110  using intrinsic_type = sse_simd_double;
111 };
112 
116 template <>
117 struct sse_intrinsic_traits<std::complex<float>> {
118  static constexpr bool vectorizable = true;
119  static constexpr size_t size = 2;
120  static constexpr size_t alignment = 16;
121 
122  using intrinsic_type = sse_simd_complex_float<std::complex<float>>;
123 };
124 
128 template <>
129 struct sse_intrinsic_traits<std::complex<double>> {
130  static constexpr bool vectorizable = true;
131  static constexpr size_t size = 1;
132  static constexpr size_t alignment = 16;
133 
134  using intrinsic_type = sse_simd_complex_double<std::complex<double>>;
135 };
136 
140 template <>
141 struct sse_intrinsic_traits<etl::complex<float>> {
142  static constexpr bool vectorizable = true;
143  static constexpr size_t size = 2;
144  static constexpr size_t alignment = 16;
145 
146  using intrinsic_type = sse_simd_complex_float<etl::complex<float>>;
147 };
148 
152 template <>
153 struct sse_intrinsic_traits<etl::complex<double>> {
154  static constexpr bool vectorizable = true;
155  static constexpr size_t size = 1;
156  static constexpr size_t alignment = 16;
157 
158  using intrinsic_type = sse_simd_complex_double<etl::complex<double>>;
159 };
160 
164 template <>
165 struct sse_intrinsic_traits<int8_t> {
166  static constexpr bool vectorizable = true;
167  static constexpr size_t size = 16;
168  static constexpr size_t alignment = 16;
169 
170  using intrinsic_type = sse_simd_byte;
171 };
172 
176 template <>
177 struct sse_intrinsic_traits<int16_t> {
178  static constexpr bool vectorizable = true;
179  static constexpr size_t size = 8;
180  static constexpr size_t alignment = 16;
181 
182  using intrinsic_type = sse_simd_short;
183 };
184 
188 template <>
189 struct sse_intrinsic_traits<int32_t> {
190  static constexpr bool vectorizable = true;
191  static constexpr size_t size = 4;
192  static constexpr size_t alignment = 16;
193 
194  using intrinsic_type = sse_simd_int;
195 };
196 
200 template <>
201 struct sse_intrinsic_traits<int64_t> {
202  static constexpr bool vectorizable = true;
203  static constexpr size_t size = 2;
204  static constexpr size_t alignment = 16;
205 
206  using intrinsic_type = sse_simd_long;
207 };
208 
212 struct sse_vec {
216  template <typename T>
217  using traits = sse_intrinsic_traits<T>;
218 
222  template <typename T>
223  using vec_type = typename traits<T>::intrinsic_type;
224 
// The top of this header includes <iostream> under VECT_DEBUG while this
// section used to test VEC_DEBUG only: accept both spellings so that either
// macro enables the printing helpers.
#if defined(VEC_DEBUG) || defined(VECT_DEBUG)

    /*!
     * \brief Print the two double lanes of the given vector on std::cout
     * \param value The vector to print (must convert to __m128d)
     */
    template <typename T>
    static void debug_d(T value) {
        // Spill the lanes with an unaligned store instead of reading the
        // inactive member of a union, which is not valid C++
        double lanes[2];
        _mm_storeu_pd(lanes, value);

        std::cout << "[" << lanes[0] << "," << lanes[1] << "]" << std::endl;
    }

    /*!
     * \brief Print the four float lanes of the given vector on std::cout
     * \param value The vector to print (must convert to __m128)
     */
    template <typename T>
    static void debug_s(T value) {
        // Same approach as debug_d: store the lanes, then print them
        float lanes[4];
        _mm_storeu_ps(lanes, value);

        std::cout << "[" << lanes[0] << "," << lanes[1] << "," << lanes[2] << "," << lanes[3] << "]" << std::endl;
    }

#else

    /*!
     * \brief No-op stand-in for debug_d when debugging is disabled
     * \return an empty string
     */
    template <typename T>
    static std::string debug_d(T) {
        return "";
    }

    /*!
     * \brief No-op stand-in for debug_s when debugging is disabled
     * \return an empty string
     */
    template <typename T>
    static std::string debug_s(T) {
        return "";
    }

#endif
276 
281  ETL_STATIC_INLINE(void) storeu(int8_t* memory, sse_simd_byte value) {
282  _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
283  }
284 
289  ETL_STATIC_INLINE(void) storeu(int16_t* memory, sse_simd_short value) {
290  _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
291  }
292 
297  ETL_STATIC_INLINE(void) storeu(int32_t* memory, sse_simd_int value) {
298  _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
299  }
300 
305  ETL_STATIC_INLINE(void) storeu(int64_t* memory, sse_simd_long value) {
306  _mm_storeu_si128(reinterpret_cast<__m128i*>(memory), value.value);
307  }
308 
313  ETL_STATIC_INLINE(void) storeu(float* memory, sse_simd_float value) {
314  _mm_storeu_ps(memory, value.value);
315  }
316 
321  ETL_STATIC_INLINE(void) storeu(double* memory, sse_simd_double value) {
322  _mm_storeu_pd(memory, value.value);
323  }
324 
329  ETL_STATIC_INLINE(void) storeu(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
330  _mm_storeu_ps(reinterpret_cast<float*>(memory), value.value);
331  }
332 
337  ETL_STATIC_INLINE(void) storeu(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
338  _mm_storeu_pd(reinterpret_cast<double*>(memory), value.value);
339  }
340 
345  ETL_STATIC_INLINE(void) storeu(etl::complex<float>* memory, sse_simd_complex_float<etl::complex<float>> value) {
346  _mm_storeu_ps(reinterpret_cast<float*>(memory), value.value);
347  }
348 
353  ETL_STATIC_INLINE(void) storeu(etl::complex<double>* memory, sse_simd_complex_double<etl::complex<double>> value) {
354  _mm_storeu_pd(reinterpret_cast<double*>(memory), value.value);
355  }
356 
361  ETL_STATIC_INLINE(void) store(int8_t* memory, sse_simd_byte value) {
362  _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
363  }
364 
369  ETL_STATIC_INLINE(void) store(int16_t* memory, sse_simd_short value) {
370  _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
371  }
372 
377  ETL_STATIC_INLINE(void) store(int32_t* memory, sse_simd_int value) {
378  _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
379  }
380 
385  ETL_STATIC_INLINE(void) store(int64_t* memory, sse_simd_long value) {
386  _mm_store_si128(reinterpret_cast<__m128i*>(memory), value.value);
387  }
388 
393  ETL_STATIC_INLINE(void) store(float* memory, sse_simd_float value) {
394  _mm_store_ps(memory, value.value);
395  }
396 
401  ETL_STATIC_INLINE(void) store(double* memory, sse_simd_double value) {
402  _mm_store_pd(memory, value.value);
403  }
404 
409  ETL_STATIC_INLINE(void) store(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
410  _mm_store_ps(reinterpret_cast<float*>(memory), value.value);
411  }
412 
417  ETL_STATIC_INLINE(void) store(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
418  _mm_store_pd(reinterpret_cast<double*>(memory), value.value);
419  }
420 
425  ETL_STATIC_INLINE(void) store(etl::complex<float>* memory, sse_simd_complex_float<etl::complex<float>> value) {
426  _mm_store_ps(reinterpret_cast<float*>(memory), value.value);
427  }
428 
433  ETL_STATIC_INLINE(void) store(etl::complex<double>* memory, sse_simd_complex_double<etl::complex<double>> value) {
434  _mm_store_pd(reinterpret_cast<double*>(memory), value.value);
435  }
436 
441  ETL_STATIC_INLINE(void) stream(int8_t* memory, sse_simd_byte value) {
442  _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
443  }
444 
449  ETL_STATIC_INLINE(void) stream(int16_t* memory, sse_simd_short value) {
450  _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
451  }
452 
457  ETL_STATIC_INLINE(void) stream(int32_t* memory, sse_simd_int value) {
458  _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
459  }
460 
465  ETL_STATIC_INLINE(void) stream(int64_t* memory, sse_simd_long value) {
466  _mm_stream_si128(reinterpret_cast<__m128i*>(memory), value.value);
467  }
468 
473  ETL_STATIC_INLINE(void) stream(float* memory, sse_simd_float value) {
474  _mm_stream_ps(memory, value.value);
475  }
476 
481  ETL_STATIC_INLINE(void) stream(double* memory, sse_simd_double value) {
482  _mm_stream_pd(memory, value.value);
483  }
484 
489  ETL_STATIC_INLINE(void) stream(std::complex<float>* memory, sse_simd_complex_float<std::complex<float>> value) {
490  _mm_stream_ps(reinterpret_cast<float*>(memory), value.value);
491  }
492 
497  ETL_STATIC_INLINE(void) stream(std::complex<double>* memory, sse_simd_complex_double<std::complex<double>> value) {
498  _mm_stream_pd(reinterpret_cast<double*>(memory), value.value);
499  }
500 
505  ETL_STATIC_INLINE(void) stream(etl::complex<float>* memory, sse_simd_complex_float<etl::complex<float>> value) {
506  _mm_stream_ps(reinterpret_cast<float*>(memory), value.value);
507  }
508 
513  ETL_STATIC_INLINE(void) stream(etl::complex<double>* memory, sse_simd_complex_double<etl::complex<double>> value) {
514  _mm_stream_pd(reinterpret_cast<double*>(memory), value.value);
515  }
516 
520  template <typename T>
521  ETL_TMP_INLINE(typename sse_intrinsic_traits<T>::intrinsic_type)
522  zero();
523 
527  ETL_STATIC_INLINE(sse_simd_byte) load(const int8_t* memory) {
528  return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
529  }
530 
534  ETL_STATIC_INLINE(sse_simd_short) load(const int16_t* memory) {
535  return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
536  }
537 
541  ETL_STATIC_INLINE(sse_simd_int) load(const int32_t* memory) {
542  return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
543  }
544 
548  ETL_STATIC_INLINE(sse_simd_long) load(const int64_t* memory) {
549  return _mm_load_si128(reinterpret_cast<const __m128i*>(memory));
550  }
551 
555  ETL_STATIC_INLINE(sse_simd_float) load(const float* memory) {
556  return _mm_load_ps(memory);
557  }
558 
562  ETL_STATIC_INLINE(sse_simd_double) load(const double* memory) {
563  return _mm_load_pd(memory);
564  }
565 
569  ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>) load(const std::complex<float>* memory) {
570  return _mm_load_ps(reinterpret_cast<const float*>(memory));
571  }
572 
576  ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>) load(const std::complex<double>* memory) {
577  return _mm_load_pd(reinterpret_cast<const double*>(memory));
578  }
579 
583  ETL_STATIC_INLINE(sse_simd_complex_float<etl::complex<float>>) load(const etl::complex<float>* memory) {
584  return _mm_load_ps(reinterpret_cast<const float*>(memory));
585  }
586 
590  ETL_STATIC_INLINE(sse_simd_complex_double<etl::complex<double>>) load(const etl::complex<double>* memory) {
591  return _mm_load_pd(reinterpret_cast<const double*>(memory));
592  }
593 
597  ETL_STATIC_INLINE(sse_simd_byte) loadu(const int8_t* memory) {
598  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
599  }
600 
604  ETL_STATIC_INLINE(sse_simd_short) loadu(const int16_t* memory) {
605  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
606  }
607 
611  ETL_STATIC_INLINE(sse_simd_int) loadu(const int32_t* memory) {
612  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
613  }
614 
618  ETL_STATIC_INLINE(sse_simd_long) loadu(const int64_t* memory) {
619  return _mm_loadu_si128(reinterpret_cast<const __m128i*>(memory));
620  }
621 
625  ETL_STATIC_INLINE(sse_simd_float) loadu(const float* memory) {
626  return _mm_loadu_ps(memory);
627  }
628 
632  ETL_STATIC_INLINE(sse_simd_double) loadu(const double* memory) {
633  return _mm_loadu_pd(memory);
634  }
635 
639  ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>) loadu(const std::complex<float>* memory) {
640  return _mm_loadu_ps(reinterpret_cast<const float*>(memory));
641  }
642 
646  ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>) loadu(const std::complex<double>* memory) {
647  return _mm_loadu_pd(reinterpret_cast<const double*>(memory));
648  }
649 
653  ETL_STATIC_INLINE(sse_simd_complex_float<etl::complex<float>>) loadu(const etl::complex<float>* memory) {
654  return _mm_loadu_ps(reinterpret_cast<const float*>(memory));
655  }
656 
660  ETL_STATIC_INLINE(sse_simd_complex_double<etl::complex<double>>) loadu(const etl::complex<double>* memory) {
661  return _mm_loadu_pd(reinterpret_cast<const double*>(memory));
662  }
663 
667  ETL_STATIC_INLINE(sse_simd_byte) set(int8_t value) {
668  return _mm_set1_epi8(value);
669  }
670 
674  ETL_STATIC_INLINE(sse_simd_short) set(int16_t value) {
675  return _mm_set1_epi16(value);
676  }
677 
681  ETL_STATIC_INLINE(sse_simd_int) set(int32_t value) {
682  return _mm_set1_epi32(value);
683  }
684 
688  ETL_STATIC_INLINE(sse_simd_long) set(int64_t value) {
689  return _mm_set1_epi64x(value);
690  }
691 
695  ETL_STATIC_INLINE(sse_simd_double) set(double value) {
696  return _mm_set1_pd(value);
697  }
698 
702  ETL_STATIC_INLINE(sse_simd_float) set(float value) {
703  return _mm_set1_ps(value);
704  }
705 
709  ETL_STATIC_INLINE(sse_simd_float) round_up(sse_simd_float x) {
710  return _mm_round_ps(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
711  }
712 
716  ETL_STATIC_INLINE(sse_simd_double) round_up(sse_simd_double x) {
717  return _mm_round_pd(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
718  }
719 
723  ETL_STATIC_INLINE(sse_simd_complex_float<std::complex<float>>) set(std::complex<float> value) {
724  std::complex<float> tmp[]{value, value};
725  return loadu(tmp);
726  }
727 
731  ETL_STATIC_INLINE(sse_simd_complex_double<std::complex<double>>) set(std::complex<double> value) {
732  std::complex<double> tmp[]{value};
733  return loadu(tmp);
734  }
735 
739  ETL_STATIC_INLINE(sse_simd_complex_float<etl::complex<float>>) set(etl::complex<float> value) {
740  etl::complex<float> tmp[]{value, value};
741  return loadu(tmp);
742  }
743 
747  ETL_STATIC_INLINE(sse_simd_complex_double<etl::complex<double>>) set(etl::complex<double> value) {
748  etl::complex<double> tmp[]{value};
749  return loadu(tmp);
750  }
751 
752  // Addition
753 
757  ETL_STATIC_INLINE(sse_simd_byte) add(sse_simd_byte lhs, sse_simd_byte rhs) {
758  return _mm_add_epi8(lhs.value, rhs.value);
759  }
760 
764  ETL_STATIC_INLINE(sse_simd_short) add(sse_simd_short lhs, sse_simd_short rhs) {
765  return _mm_add_epi16(lhs.value, rhs.value);
766  }
767 
771  ETL_STATIC_INLINE(sse_simd_int) add(sse_simd_int lhs, sse_simd_int rhs) {
772  return _mm_add_epi32(lhs.value, rhs.value);
773  }
774 
778  ETL_STATIC_INLINE(sse_simd_long) add(sse_simd_long lhs, sse_simd_long rhs) {
779  return _mm_add_epi64(lhs.value, rhs.value);
780  }
781 
785  ETL_STATIC_INLINE(sse_simd_float) add(sse_simd_float lhs, sse_simd_float rhs) {
786  return _mm_add_ps(lhs.value, rhs.value);
787  }
788 
792  ETL_STATIC_INLINE(sse_simd_double) add(sse_simd_double lhs, sse_simd_double rhs) {
793  return _mm_add_pd(lhs.value, rhs.value);
794  }
795 
799  template <typename T>
800  ETL_STATIC_INLINE(sse_simd_complex_float<T>)
801  add(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
802  return _mm_add_ps(lhs.value, rhs.value);
803  }
804 
808  template <typename T>
809  ETL_STATIC_INLINE(sse_simd_complex_double<T>)
810  add(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
811  return _mm_add_pd(lhs.value, rhs.value);
812  }
813 
814  // Subtraction
815 
819  ETL_STATIC_INLINE(sse_simd_byte) sub(sse_simd_byte lhs, sse_simd_byte rhs) {
820  return _mm_sub_epi8(lhs.value, rhs.value);
821  }
822 
826  ETL_STATIC_INLINE(sse_simd_short) sub(sse_simd_short lhs, sse_simd_short rhs) {
827  return _mm_sub_epi16(lhs.value, rhs.value);
828  }
829 
833  ETL_STATIC_INLINE(sse_simd_int) sub(sse_simd_int lhs, sse_simd_int rhs) {
834  return _mm_sub_epi32(lhs.value, rhs.value);
835  }
836 
840  ETL_STATIC_INLINE(sse_simd_long) sub(sse_simd_long lhs, sse_simd_long rhs) {
841  return _mm_sub_epi64(lhs.value, rhs.value);
842  }
843 
847  ETL_STATIC_INLINE(sse_simd_float) sub(sse_simd_float lhs, sse_simd_float rhs) {
848  return _mm_sub_ps(lhs.value, rhs.value);
849  }
850 
854  ETL_STATIC_INLINE(sse_simd_double) sub(sse_simd_double lhs, sse_simd_double rhs) {
855  return _mm_sub_pd(lhs.value, rhs.value);
856  }
857 
861  template <typename T>
862  ETL_STATIC_INLINE(sse_simd_complex_float<T>)
863  sub(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
864  return _mm_sub_ps(lhs.value, rhs.value);
865  }
866 
870  template <typename T>
871  ETL_STATIC_INLINE(sse_simd_complex_double<T>)
872  sub(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
873  return _mm_sub_pd(lhs.value, rhs.value);
874  }
875 
876  // Square Root
877 
882  ETL_STATIC_INLINE(sse_simd_float) sqrt(sse_simd_float x) {
883  return _mm_sqrt_ps(x.value);
884  }
885 
890  ETL_STATIC_INLINE(sse_simd_double) sqrt(sse_simd_double x) {
891  return _mm_sqrt_pd(x.value);
892  }
893 
894  // Negation
895 
896  // TODO negation epi32
897 
902  ETL_STATIC_INLINE(sse_simd_float) minus(sse_simd_float x) {
903  return _mm_xor_ps(x.value, _mm_set1_ps(-0.f));
904  }
905 
910  ETL_STATIC_INLINE(sse_simd_double) minus(sse_simd_double x) {
911  return _mm_xor_pd(x.value, _mm_set1_pd(-0.));
912  }
913 
914  // Multiplication
915 
919  ETL_STATIC_INLINE(sse_simd_byte) mul(sse_simd_byte lhs, sse_simd_byte rhs) {
920  __m128i even = _mm_mullo_epi16(lhs.value, rhs.value);
921  __m128i odd = _mm_mullo_epi16(_mm_srli_epi16(lhs.value, 8), _mm_srli_epi16(rhs.value, 8));
922  return _mm_or_si128(_mm_slli_epi16(odd, 8), _mm_srli_epi16(_mm_slli_epi16(even, 8), 8));
923  }
924 
928  ETL_STATIC_INLINE(sse_simd_short) mul(sse_simd_short lhs, sse_simd_short rhs) {
929  return _mm_mullo_epi16(lhs.value, rhs.value);
930  }
931 
935  ETL_STATIC_INLINE(sse_simd_int) mul(sse_simd_int lhs, sse_simd_int rhs) {
936  return _mm_mullo_epi32(lhs.value, rhs.value);
937  }
938 
942  ETL_STATIC_INLINE(sse_simd_long) mul(sse_simd_long lhs, sse_simd_long rhs) {
943  int64_t result[2];
944  result[0] = lhs[0] * rhs[0];
945  result[1] = lhs[1] * rhs[1];
946  return loadu(&result[0]);
947  }
948 
952  ETL_STATIC_INLINE(sse_simd_float) mul(sse_simd_float lhs, sse_simd_float rhs) {
953  return _mm_mul_ps(lhs.value, rhs.value);
954  }
955 
959  ETL_STATIC_INLINE(sse_simd_double) mul(sse_simd_double lhs, sse_simd_double rhs) {
960  return _mm_mul_pd(lhs.value, rhs.value);
961  }
962 
966  template <typename T>
967  ETL_STATIC_INLINE(sse_simd_complex_float<T>)
968  mul(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
969  //lhs = [x1.real, x1.img, x2.real, x2.img]
970  //rhs = [y1.real, y1.img, y2.real, y2.img]
971 
972  //ymm1 = [y1.real, y1.real, y2.real, y2.real]
973  __m128 ymm1 = _mm_moveldup_ps(rhs.value);
974 
975  //ymm2 = lhs * ymm1
976  __m128 ymm2 = _mm_mul_ps(lhs.value, ymm1);
977 
978  //ymm3 = [x1.img, x1.real, x2.img, x2.real]
979  __m128 ymm3 = _mm_shuffle_ps(lhs.value, lhs.value, _MM_SHUFFLE(2, 3, 0, 1));
980 
981  //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
982  ymm1 = _mm_movehdup_ps(rhs.value);
983 
984  //ymm4 = ymm3 * ymm1
985  __m128 ymm4 = _mm_mul_ps(ymm3, ymm1);
986 
987  //result = [ymm2 -+ ymm4];
988  return _mm_addsub_ps(ymm2, ymm4);
989  }
990 
994  template <typename T>
995  ETL_STATIC_INLINE(sse_simd_complex_double<T>)
996  mul(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
997  //lhs = [x.real, x.img]
998  //rhs = [y.real, y.img]
999 
1000  //ymm1 = [y.real, y.real]
1001  __m128d ymm1 = _mm_movedup_pd(rhs.value);
1002 
1003  //ymm2 = [x.real * y.real, x.img * y.real]
1004  __m128d ymm2 = _mm_mul_pd(lhs.value, ymm1);
1005 
1006  //ymm1 = [x.img, x.real]
1007  ymm1 = _mm_shuffle_pd(lhs.value, lhs.value, _MM_SHUFFLE2(0, 1));
1008 
1009  //ymm3 = [y.img, y.img]
1010  __m128d ymm3 = _mm_shuffle_pd(rhs.value, rhs.value, _MM_SHUFFLE2(1, 1));
1011 
1012  //ymm4 = [x.img * y.img, x.real * y.img]
1013  __m128d ymm4 = _mm_mul_pd(ymm1, ymm3);
1014 
1015  //result = [x.real * y.real - x.img * y.img, x.img * y.real - x.real * y.img]
1016  return _mm_addsub_pd(ymm2, ymm4);
1017  }
1018 
1019  // Fused-Multiply-Add (FMA)
1020 
1024  ETL_STATIC_INLINE(sse_simd_byte) fmadd(sse_simd_byte a, sse_simd_byte b, sse_simd_byte c) {
1025  return add(mul(a, b), c);
1026  }
1027 
1031  ETL_STATIC_INLINE(sse_simd_short) fmadd(sse_simd_short a, sse_simd_short b, sse_simd_short c) {
1032  return add(mul(a, b), c);
1033  }
1034 
1038  ETL_STATIC_INLINE(sse_simd_int) fmadd(sse_simd_int a, sse_simd_int b, sse_simd_int c) {
1039  return add(mul(a, b), c);
1040  }
1041 
1045  ETL_STATIC_INLINE(sse_simd_long) fmadd(sse_simd_long a, sse_simd_long b, sse_simd_long c) {
1046  return add(mul(a, b), c);
1047  }
1048 
1052  ETL_STATIC_INLINE(sse_simd_float) fmadd(sse_simd_float a, sse_simd_float b, sse_simd_float c) {
1053 #ifdef __FMA__
1054  return _mm_fmadd_ps(a.value, b.value, c.value);
1055 #else
1056  return add(mul(a, b), c);
1057 #endif
1058  }
1059 
1063  ETL_STATIC_INLINE(sse_simd_double) fmadd(sse_simd_double a, sse_simd_double b, sse_simd_double c) {
1064 #ifdef __FMA__
1065  return _mm_fmadd_pd(a.value, b.value, c.value);
1066 #else
1067  return add(mul(a, b), c);
1068 #endif
1069  }
1070 
1074  template <typename T>
1075  ETL_STATIC_INLINE(sse_simd_complex_float<T>)
1076  fmadd(sse_simd_complex_float<T> a, sse_simd_complex_float<T> b, sse_simd_complex_float<T> c) {
1077  return add(mul(a, b), c);
1078  }
1079 
1083  template <typename T>
1084  ETL_STATIC_INLINE(sse_simd_complex_double<T>)
1085  fmadd(sse_simd_complex_double<T> a, sse_simd_complex_double<T> b, sse_simd_complex_double<T> c) {
1086  return add(mul(a, b), c);
1087  }
1088 
1089  // Division
1090 
1094  ETL_STATIC_INLINE(sse_simd_float) div(sse_simd_float lhs, sse_simd_float rhs) {
1095  return _mm_div_ps(lhs.value, rhs.value);
1096  }
1097 
1101  ETL_STATIC_INLINE(sse_simd_double) div(sse_simd_double lhs, sse_simd_double rhs) {
1102  return _mm_div_pd(lhs.value, rhs.value);
1103  }
1104 
1108  template <typename T>
1109  ETL_STATIC_INLINE(sse_simd_complex_float<T>)
1110  div(sse_simd_complex_float<T> lhs, sse_simd_complex_float<T> rhs) {
1111  //lhs = [x1.real, x1.img, x2.real, x2.img]
1112  //rhs = [y1.real, y1.img, y2.real, y2.img]
1113 
1114  //ymm0 = [y1.real, y1.real, y2.real, y2.real]
1115  __m128 ymm0 = _mm_moveldup_ps(rhs.value);
1116 
1117  //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
1118  __m128 ymm1 = _mm_movehdup_ps(rhs.value);
1119 
1120  //ymm2 = [x.real * y.real, x.img * y.real, ...]
1121  __m128 ymm2 = _mm_mul_ps(lhs.value, ymm0);
1122 
1123  //ymm3 = [x1.img, x1.real, x2.img, x2.real]
1124  __m128 ymm3 = _mm_shuffle_ps(lhs.value, lhs.value, _MM_SHUFFLE(2, 3, 0, 1));
1125 
1126  //ymm4 = [x.img * y.img, x.real * y.img, ...]
1127  __m128 ymm4 = _mm_mul_ps(ymm3, ymm1);
1128 
1129  //ymm4 = subadd(ymm2, ymm4)
1130  ymm3 = _mm_sub_ps(_mm_set1_ps(0.0), ymm4);
1131  ymm4 = _mm_addsub_ps(ymm2, ymm3);
1132 
1133  //ymm2 = [y.real^2, y.real^2]
1134  ymm2 = _mm_mul_ps(ymm0, ymm0);
1135 
1136  //ymm3 = [y.imag^2, y.imag^2]
1137  ymm3 = _mm_mul_ps(ymm1, ymm1);
1138 
1139  //ymm0 = [y.real^2 + y.imag^2, y.real^2 + y.imag^2]
1140  ymm0 = _mm_add_ps(ymm2, ymm3);
1141 
1142  //result = ymm4 / ymm0
1143  return _mm_div_ps(ymm4, ymm0);
1144  }
1145 
1149  template <typename T>
1150  ETL_STATIC_INLINE(sse_simd_complex_double<T>)
1151  div(sse_simd_complex_double<T> lhs, sse_simd_complex_double<T> rhs) {
1152  //lhs = [x.real, x.img]
1153  //rhs = [y.real, y.img]
1154 
1155  //ymm0 = [y.real, y.real]
1156  __m128d ymm0 = _mm_movedup_pd(rhs.value);
1157 
1158  //ymm1 = [y.img, y.img]
1159  __m128d ymm1 = _mm_shuffle_pd(rhs.value, rhs.value, _MM_SHUFFLE2(1, 1));
1160 
1161  //ymm2 = [x.real * y.real, x.img * y.real]
1162  __m128d ymm2 = _mm_mul_pd(lhs.value, ymm0);
1163 
1164  //ymm3 = [x.img, x.real]
1165  __m128d ymm3 = _mm_shuffle_pd(lhs.value, lhs.value, _MM_SHUFFLE2(0, 1));
1166 
1167  //ymm4 = [x.img * y.img, x.real * y.img]
1168  __m128d ymm4 = _mm_mul_pd(ymm3, ymm1);
1169 
1170  //ymm4 = subadd(ymm2, ymm4)
1171  ymm3 = _mm_sub_pd(_mm_set1_pd(0.0), ymm4);
1172  ymm4 = _mm_addsub_pd(ymm2, ymm3);
1173 
1174  //ymm2 = [y.real^2, y.real^2]
1175  ymm2 = _mm_mul_pd(ymm0, ymm0);
1176 
1177  //ymm3 = [y.imag^2, y.imag^2]
1178  ymm3 = _mm_mul_pd(ymm1, ymm1);
1179 
1180  //ymm0 = [y.real^2 + y.imag^2, y.real^2 + y.imag^2]
1181  ymm0 = _mm_add_pd(ymm2, ymm3);
1182 
1183  //result = ymm4 / ymm0
1184  return _mm_div_pd(ymm4, ymm0);
1185  }
1186 
1187  // Cosinus
1188 
1192  ETL_STATIC_INLINE(sse_simd_float) cos(sse_simd_float x) {
1193  return etl::cos_ps(x.value);
1194  }
1195 
1196  // Sinus
1197 
1201  ETL_STATIC_INLINE(sse_simd_float) sin(sse_simd_float x) {
1202  return etl::sin_ps(x.value);
1203  }
1204 
1205  //The Intel C++ Compiler (icc) has more intrinsics.
1206  //ETL uses them when compiled with icc
1207 
1208 #ifndef __INTEL_COMPILER
1209 
1213  ETL_STATIC_INLINE(sse_simd_float) exp(sse_simd_float x) {
1214  return etl::exp_ps(x.value);
1215  }
1216 
1220  ETL_STATIC_INLINE(sse_simd_double) exp(sse_simd_double x) {
1221  return etl::exp_pd(x.value);
1222  }
1223 
1227  ETL_STATIC_INLINE(sse_simd_float) log(sse_simd_float x) {
1228  return etl::log_ps(x.value);
1229  }
1230 
1231 #else //__INTEL_COMPILER
1232 
1233  //Exponential
1234 
1238  ETL_STATIC_INLINE(sse_simd_double) exp(sse_simd_double x) {
1239  return _mm_exp_pd(x.value);
1240  }
1241 
1245  ETL_STATIC_INLINE(sse_simd_float) exp(sse_simd_float x) {
1246  return _mm_exp_ps(x.value);
1247  }
1248 
1249  //Logarithm
1250 
1254  ETL_STATIC_INLINE(sse_simd_double) log(sse_simd_double x) {
1255  return _mm_log_pd(x.value);
1256  }
1257 
1261  ETL_STATIC_INLINE(sse_simd_float) log(sse_simd_float x) {
1262  return _mm_log_ps(x.value);
1263  }
1264 
1265 #endif //__INTEL_COMPILER
1266 
1267  //Min
1268 
1272  ETL_STATIC_INLINE(sse_simd_double) min(sse_simd_double lhs, sse_simd_double rhs) {
1273  return _mm_min_pd(lhs.value, rhs.value);
1274  }
1275 
1279  ETL_STATIC_INLINE(sse_simd_float) min(sse_simd_float lhs, sse_simd_float rhs) {
1280  return _mm_min_ps(lhs.value, rhs.value);
1281  }
1282 
1283  //Max
1284 
1288  ETL_STATIC_INLINE(sse_simd_double) max(sse_simd_double lhs, sse_simd_double rhs) {
1289  return _mm_max_pd(lhs.value, rhs.value);
1290  }
1291 
1295  ETL_STATIC_INLINE(sse_simd_float) max(sse_simd_float lhs, sse_simd_float rhs) {
1296  return _mm_max_ps(lhs.value, rhs.value);
1297  }
1298 
1304  ETL_STATIC_INLINE(float) hadd(sse_simd_float in) {
1305  __m128 shuf = _mm_movehdup_ps(in.value);
1306  __m128 sums = _mm_add_ps(in.value, shuf);
1307  shuf = _mm_movehl_ps(shuf, sums);
1308  sums = _mm_add_ss(sums, shuf);
1309  return _mm_cvtss_f32(sums);
1310  }
1311 
1317  ETL_STATIC_INLINE(double) hadd(sse_simd_double in) {
1318  __m128 undef = _mm_undefined_ps();
1319  __m128 shuftmp = _mm_movehl_ps(undef, _mm_castpd_ps(in.value));
1320  __m128d shuf = _mm_castps_pd(shuftmp);
1321  return _mm_cvtsd_f64(_mm_add_sd(in.value, shuf));
1322  }
1323 
1324  //TODO Vectorize the following functions
1325 
1331  ETL_STATIC_INLINE(int8_t) hadd(sse_simd_byte in) {
1332  return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7] + in[8] + in[9] + in[10] + in[11] + in[12] + in[13] + in[14] + in[15];
1333  }
1334 
1340  ETL_STATIC_INLINE(int16_t) hadd(sse_simd_short in) {
1341  return in[0] + in[1] + in[2] + in[3] + in[4] + in[5] + in[6] + in[7];
1342  }
1343 
1349  ETL_STATIC_INLINE(int32_t) hadd(sse_simd_int in) {
1350  return in[0] + in[1] + in[2] + in[3];
1351  }
1352 
1358  ETL_STATIC_INLINE(int64_t) hadd(sse_simd_long in) {
1359  return in[0] + in[1];
1360  }
1361 
1367  template <typename T>
1368  ETL_STATIC_INLINE(T)
1369  hadd(sse_simd_complex_float<T> in) {
1370  return in[0] + in[1];
1371  }
1372 
1378  template <typename T>
1379  ETL_STATIC_INLINE(T)
1380  hadd(sse_simd_complex_double<T> in) {
1381  return in[0];
1382  }
1383 };
1384 
1388 template <>
1389 ETL_OUT_INLINE(sse_simd_byte)
1390 sse_vec::zero<int8_t>() {
1391  return _mm_setzero_si128();
1392 }
1393 
1397 template <>
1398 ETL_OUT_INLINE(sse_simd_short)
1399 sse_vec::zero<int16_t>() {
1400  return _mm_setzero_si128();
1401 }
1402 
1406 template <>
1407 ETL_OUT_INLINE(sse_simd_int)
1408 sse_vec::zero<int32_t>() {
1409  return _mm_setzero_si128();
1410 }
1411 
1415 template <>
1416 ETL_OUT_INLINE(sse_simd_long)
1417 sse_vec::zero<int64_t>() {
1418  return _mm_setzero_si128();
1419 }
1420 
1424 template <>
1425 ETL_OUT_INLINE(sse_simd_float)
1426 sse_vec::zero<float>() {
1427  return _mm_setzero_ps();
1428 }
1429 
1433 template <>
1434 ETL_OUT_INLINE(sse_simd_double)
1435 sse_vec::zero<double>() {
1436  return _mm_setzero_pd();
1437 }
1438 
1442 template <>
1443 ETL_OUT_INLINE(sse_simd_complex_float<etl::complex<float>>)
1444 sse_vec::zero<etl::complex<float>>() {
1445  return _mm_setzero_ps();
1446 }
1447 
1451 template <>
1452 ETL_OUT_INLINE(sse_simd_complex_double<etl::complex<double>>)
1453 sse_vec::zero<etl::complex<double>>() {
1454  return _mm_setzero_pd();
1455 }
1456 
1460 template <>
1461 ETL_OUT_INLINE(sse_simd_complex_float<std::complex<float>>)
1462 sse_vec::zero<std::complex<float>>() {
1463  return _mm_setzero_ps();
1464 }
1465 
1469 template <>
1470 ETL_OUT_INLINE(sse_simd_complex_double<std::complex<double>>)
1471 sse_vec::zero<std::complex<double>>() {
1472  return _mm_setzero_pd();
1473 }
1474 
1475 } //end of namespace etl
1476 
1477 #endif //__SSE3__
auto sin(E &&value) -> detail::unary_helper< E, sin_unary_op >
Apply sinus on each value of the given expression.
Definition: function_expression_builder.hpp:114
auto max(L &&lhs, R &&rhs)
Create an expression with the max value of lhs or rhs.
Definition: expression_builder.hpp:65
Complex number implementation.
Definition: complex.hpp:31
auto mul(A &&a, B &&b)
Multiply two matrices together.
Definition: gemm_expr.hpp:442
void minus([[maybe_unused]] size_t n, [[maybe_unused]] float alpha, [[maybe_unused]] float *A, [[maybe_unused]] size_t lda, [[maybe_unused]] float *B, [[maybe_unused]] size_t ldb)
Wrappers for single-precision egblas minus operation.
Definition: minus.hpp:43
auto sqrt(E &&value) -> detail::unary_helper< E, sqrt_unary_op >
Apply square root on each value of the given expression.
Definition: function_expression_builder.hpp:24
Contains SSE functions for exp and log.
typename V::template vec_type< value_type > vec_type
The vectorization type for V.
Definition: dyn_matrix_view.hpp:43
auto cos(E &&value) -> detail::unary_helper< E, cos_unary_op >
Apply cosinus on each value of the given expression.
Definition: function_expression_builder.hpp:104
auto load(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:143
Root namespace for the ETL library.
Definition: adapter.hpp:15
void store(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:176
void stream(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once, using non-temporal store.
Definition: dyn_matrix_view.hpp:165
void storeu(vec_type< V > in, size_t i) noexcept
Store several elements in the matrix at once.
Definition: dyn_matrix_view.hpp:187
auto loadu(size_t x) const noexcept
Load several elements of the expression at once.
Definition: dyn_matrix_view.hpp:154
auto min(L &&lhs, R &&rhs)
Create an expression with the min value of lhs or rhs.
Definition: expression_builder.hpp:77
auto exp(E &&value) -> detail::unary_helper< E, exp_unary_op >
Apply exponential on each value of the given expression.
Definition: function_expression_builder.hpp:154
Inlining macros.
auto log(E &&value) -> detail::unary_helper< E, log_unary_op >
Apply logarithm (base e) on each value of the given expression.
Definition: function_expression_builder.hpp:64