GraphicsAPI_2020C
matrix.h
Go to the documentation of this file.
1 
4 #pragma once
5 
6 #include "geometric.h"
7 
8 #if GLM_ARCH & GLM_ARCH_SSE2_BIT
9 
10 GLM_FUNC_QUALIFIER void glm_mat4_matrixCompMult(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
11 {
12  out[0] = _mm_mul_ps(in1[0], in2[0]);
13  out[1] = _mm_mul_ps(in1[1], in2[1]);
14  out[2] = _mm_mul_ps(in1[2], in2[2]);
15  out[3] = _mm_mul_ps(in1[3], in2[3]);
16 }
17 
18 GLM_FUNC_QUALIFIER void glm_mat4_add(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
19 {
20  out[0] = _mm_add_ps(in1[0], in2[0]);
21  out[1] = _mm_add_ps(in1[1], in2[1]);
22  out[2] = _mm_add_ps(in1[2], in2[2]);
23  out[3] = _mm_add_ps(in1[3], in2[3]);
24 }
25 
26 GLM_FUNC_QUALIFIER void glm_mat4_sub(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
27 {
28  out[0] = _mm_sub_ps(in1[0], in2[0]);
29  out[1] = _mm_sub_ps(in1[1], in2[1]);
30  out[2] = _mm_sub_ps(in1[2], in2[2]);
31  out[3] = _mm_sub_ps(in1[3], in2[3]);
32 }
33 
34 GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_mul_vec4(glm_vec4 const m[4], glm_vec4 v)
35 {
36  __m128 v0 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
37  __m128 v1 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
38  __m128 v2 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
39  __m128 v3 = _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
40 
41  __m128 m0 = _mm_mul_ps(m[0], v0);
42  __m128 m1 = _mm_mul_ps(m[1], v1);
43  __m128 m2 = _mm_mul_ps(m[2], v2);
44  __m128 m3 = _mm_mul_ps(m[3], v3);
45 
46  __m128 a0 = _mm_add_ps(m0, m1);
47  __m128 a1 = _mm_add_ps(m2, m3);
48  __m128 a2 = _mm_add_ps(a0, a1);
49 
50  return a2;
51 }
52 
53 GLM_FUNC_QUALIFIER __m128 glm_vec4_mul_mat4(glm_vec4 v, glm_vec4 const m[4])
54 {
55  __m128 i0 = m[0];
56  __m128 i1 = m[1];
57  __m128 i2 = m[2];
58  __m128 i3 = m[3];
59 
60  __m128 m0 = _mm_mul_ps(v, i0);
61  __m128 m1 = _mm_mul_ps(v, i1);
62  __m128 m2 = _mm_mul_ps(v, i2);
63  __m128 m3 = _mm_mul_ps(v, i3);
64 
65  __m128 u0 = _mm_unpacklo_ps(m0, m1);
66  __m128 u1 = _mm_unpackhi_ps(m0, m1);
67  __m128 a0 = _mm_add_ps(u0, u1);
68 
69  __m128 u2 = _mm_unpacklo_ps(m2, m3);
70  __m128 u3 = _mm_unpackhi_ps(m2, m3);
71  __m128 a1 = _mm_add_ps(u2, u3);
72 
73  __m128 f0 = _mm_movelh_ps(a0, a1);
74  __m128 f1 = _mm_movehl_ps(a1, a0);
75  __m128 f2 = _mm_add_ps(f0, f1);
76 
77  return f2;
78 }
79 
80 GLM_FUNC_QUALIFIER void glm_mat4_mul(glm_vec4 const in1[4], glm_vec4 const in2[4], glm_vec4 out[4])
81 {
82  {
83  __m128 e0 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(0, 0, 0, 0));
84  __m128 e1 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(1, 1, 1, 1));
85  __m128 e2 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(2, 2, 2, 2));
86  __m128 e3 = _mm_shuffle_ps(in2[0], in2[0], _MM_SHUFFLE(3, 3, 3, 3));
87 
88  __m128 m0 = _mm_mul_ps(in1[0], e0);
89  __m128 m1 = _mm_mul_ps(in1[1], e1);
90  __m128 m2 = _mm_mul_ps(in1[2], e2);
91  __m128 m3 = _mm_mul_ps(in1[3], e3);
92 
93  __m128 a0 = _mm_add_ps(m0, m1);
94  __m128 a1 = _mm_add_ps(m2, m3);
95  __m128 a2 = _mm_add_ps(a0, a1);
96 
97  out[0] = a2;
98  }
99 
100  {
101  __m128 e0 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(0, 0, 0, 0));
102  __m128 e1 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(1, 1, 1, 1));
103  __m128 e2 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(2, 2, 2, 2));
104  __m128 e3 = _mm_shuffle_ps(in2[1], in2[1], _MM_SHUFFLE(3, 3, 3, 3));
105 
106  __m128 m0 = _mm_mul_ps(in1[0], e0);
107  __m128 m1 = _mm_mul_ps(in1[1], e1);
108  __m128 m2 = _mm_mul_ps(in1[2], e2);
109  __m128 m3 = _mm_mul_ps(in1[3], e3);
110 
111  __m128 a0 = _mm_add_ps(m0, m1);
112  __m128 a1 = _mm_add_ps(m2, m3);
113  __m128 a2 = _mm_add_ps(a0, a1);
114 
115  out[1] = a2;
116  }
117 
118  {
119  __m128 e0 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(0, 0, 0, 0));
120  __m128 e1 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(1, 1, 1, 1));
121  __m128 e2 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(2, 2, 2, 2));
122  __m128 e3 = _mm_shuffle_ps(in2[2], in2[2], _MM_SHUFFLE(3, 3, 3, 3));
123 
124  __m128 m0 = _mm_mul_ps(in1[0], e0);
125  __m128 m1 = _mm_mul_ps(in1[1], e1);
126  __m128 m2 = _mm_mul_ps(in1[2], e2);
127  __m128 m3 = _mm_mul_ps(in1[3], e3);
128 
129  __m128 a0 = _mm_add_ps(m0, m1);
130  __m128 a1 = _mm_add_ps(m2, m3);
131  __m128 a2 = _mm_add_ps(a0, a1);
132 
133  out[2] = a2;
134  }
135 
136  {
137  //(__m128&)_mm_shuffle_epi32(__m128i&)in2[0], _MM_SHUFFLE(3, 3, 3, 3))
138  __m128 e0 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(0, 0, 0, 0));
139  __m128 e1 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(1, 1, 1, 1));
140  __m128 e2 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(2, 2, 2, 2));
141  __m128 e3 = _mm_shuffle_ps(in2[3], in2[3], _MM_SHUFFLE(3, 3, 3, 3));
142 
143  __m128 m0 = _mm_mul_ps(in1[0], e0);
144  __m128 m1 = _mm_mul_ps(in1[1], e1);
145  __m128 m2 = _mm_mul_ps(in1[2], e2);
146  __m128 m3 = _mm_mul_ps(in1[3], e3);
147 
148  __m128 a0 = _mm_add_ps(m0, m1);
149  __m128 a1 = _mm_add_ps(m2, m3);
150  __m128 a2 = _mm_add_ps(a0, a1);
151 
152  out[3] = a2;
153  }
154 }
155 
156 GLM_FUNC_QUALIFIER void glm_mat4_transpose(glm_vec4 const in[4], glm_vec4 out[4])
157 {
158  __m128 tmp0 = _mm_shuffle_ps(in[0], in[1], 0x44);
159  __m128 tmp2 = _mm_shuffle_ps(in[0], in[1], 0xEE);
160  __m128 tmp1 = _mm_shuffle_ps(in[2], in[3], 0x44);
161  __m128 tmp3 = _mm_shuffle_ps(in[2], in[3], 0xEE);
162 
163  out[0] = _mm_shuffle_ps(tmp0, tmp1, 0x88);
164  out[1] = _mm_shuffle_ps(tmp0, tmp1, 0xDD);
165  out[2] = _mm_shuffle_ps(tmp2, tmp3, 0x88);
166  out[3] = _mm_shuffle_ps(tmp2, tmp3, 0xDD);
167 }
168 
// Determinant of a 4x4 column-major matrix ("highp" variant).
// Builds the full set of 2x2 sub-determinants (Fac0..Fac5), assembles the
// first row of the cofactor matrix (Inv0..Inv3, lane 0 of each), and dots it
// with column 0 of the input — i.e. a cofactor expansion along the first
// column. Returns the result of glm_vec4_dot(in[0], Row2); presumably the
// determinant splatted across all four lanes — confirm against geometric.h.
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_highp(glm_vec4 const in[4])
{
	// Fac0..Fac5 hold the shared 2x2 sub-determinants ("SubFactors") used by
	// every cofactor column. Each block gathers the needed scalars from
	// columns 1..3 with shuffles, then computes a*d - b*c in all lanes.
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Alternating cofactor sign masks (note _mm_set_ps lists lanes w,z,y,x).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// Vec0..Vec3: row r of the matrix, with element [1][r] in lane 0 and
	// element [0][r] replicated in lanes 1..3.
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Gather lane 0 of each cofactor column into one vector (first row of
	// the cofactor matrix, i.e. first column of the adjugate transposed).
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	// valType Determinant = m[0][0] * Inverse[0][0]
	// + m[0][1] * Inverse[1][0]
	// + m[0][2] * Inverse[2][0]
	// + m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	return Det0;
}
382 
// Determinant of a 4x4 column-major matrix ("lowp" variant).
// Same cofactor-expansion math as glm_mat4_determinant, but every in-register
// permute is expressed as _mm_shuffle_epi32 on the bit pattern (via
// cast ps->si128->ps), which maps to PSHUFD instead of SHUFPS.
// NOTE(review): the integer-domain shuffle of float data may incur a
// bypass-delay penalty on some microarchitectures — presumably the intended
// "lowp" trade-off; confirm against the project's performance notes.
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant_lowp(glm_vec4 const m[4])
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(

	// SubE lanes hold the first four 2x2 sub-determinants, SubF the last two:
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 Swp3A = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(3, 2, 3, 3)));
	__m128 Swp3B = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(0, 1, 1, 2)));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows
	__m128 Swp2C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[2]), _MM_SHUFFLE(0, 0, 1, 2)));
	__m128 Swp3C = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[3]), _MM_SHUFFLE(1, 2, 0, 0)));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	// movehl folds the upper pair onto the lower pair before subtracting.
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	// Cofactors of the first column, expanded along m[1]:
	//vec<4, T, Q> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));

	__m128 SubFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubE), _MM_SHUFFLE(2, 1, 0, 0)));
	__m128 SwpFacA = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(0, 0, 0, 1)));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpB), _MM_SHUFFLE(3, 1, 1, 0)));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(1, 1, 2, 2)));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(SubTmpC), _MM_SHUFFLE(3, 3, 2, 0)));
	__m128 SwpFacC = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(m[1]), _MM_SHUFFLE(2, 3, 3, 3)));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply alternating cofactor signs (+, -, +, - in lane order via setr).
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];

	return glm_vec4_dot(m[0], DetCof);
}
445 
// Determinant of a 4x4 column-major matrix (default precision path).
// Identical cofactor-expansion dataflow to glm_mat4_determinant_lowp, but all
// permutes stay in the floating-point domain via _mm_shuffle_ps (SHUFPS),
// avoiding the ps<->si128 casts of the lowp variant.
GLM_FUNC_QUALIFIER glm_vec4 glm_mat4_determinant(glm_vec4 const m[4])
{
	// _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(add)

	// SubE lanes hold the first four 2x2 sub-determinants, SubF the last two:
	//T SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
	//T SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
	//T SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
	//T SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
	//T SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
	//T SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];

	// First 2 columns
	__m128 Swp2A = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 Swp3A = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 MulA = _mm_mul_ps(Swp2A, Swp3A);

	// Second 2 columns
	__m128 Swp2B = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(3, 2, 3, 3));
	__m128 Swp3B = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(0, 1, 1, 2));
	__m128 MulB = _mm_mul_ps(Swp2B, Swp3B);

	// Columns subtraction
	__m128 SubE = _mm_sub_ps(MulA, MulB);

	// Last 2 rows
	__m128 Swp2C = _mm_shuffle_ps(m[2], m[2], _MM_SHUFFLE(0, 0, 1, 2));
	__m128 Swp3C = _mm_shuffle_ps(m[3], m[3], _MM_SHUFFLE(1, 2, 0, 0));
	__m128 MulC = _mm_mul_ps(Swp2C, Swp3C);
	// movehl folds the upper pair onto the lower pair before subtracting.
	__m128 SubF = _mm_sub_ps(_mm_movehl_ps(MulC, MulC), MulC);

	// Cofactors of the first column, expanded along m[1]:
	//vec<4, T, Q> DetCof(
	//	+ (m[1][1] * SubFactor00 - m[1][2] * SubFactor01 + m[1][3] * SubFactor02),
	//	- (m[1][0] * SubFactor00 - m[1][2] * SubFactor03 + m[1][3] * SubFactor04),
	//	+ (m[1][0] * SubFactor01 - m[1][1] * SubFactor03 + m[1][3] * SubFactor05),
	//	- (m[1][0] * SubFactor02 - m[1][1] * SubFactor04 + m[1][2] * SubFactor05));

	__m128 SubFacA = _mm_shuffle_ps(SubE, SubE, _MM_SHUFFLE(2, 1, 0, 0));
	__m128 SwpFacA = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(0, 0, 0, 1));
	__m128 MulFacA = _mm_mul_ps(SwpFacA, SubFacA);

	__m128 SubTmpB = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(0, 0, 3, 1));
	__m128 SubFacB = _mm_shuffle_ps(SubTmpB, SubTmpB, _MM_SHUFFLE(3, 1, 1, 0));//SubF[0], SubE[3], SubE[3], SubE[1];
	__m128 SwpFacB = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(1, 1, 2, 2));
	__m128 MulFacB = _mm_mul_ps(SwpFacB, SubFacB);

	__m128 SubRes = _mm_sub_ps(MulFacA, MulFacB);

	__m128 SubTmpC = _mm_shuffle_ps(SubE, SubF, _MM_SHUFFLE(1, 0, 2, 2));
	__m128 SubFacC = _mm_shuffle_ps(SubTmpC, SubTmpC, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 SwpFacC = _mm_shuffle_ps(m[1], m[1], _MM_SHUFFLE(2, 3, 3, 3));
	__m128 MulFacC = _mm_mul_ps(SwpFacC, SubFacC);

	__m128 AddRes = _mm_add_ps(SubRes, MulFacC);
	// Apply alternating cofactor signs (+, -, +, - in lane order via setr).
	__m128 DetCof = _mm_mul_ps(AddRes, _mm_setr_ps( 1.0f,-1.0f, 1.0f,-1.0f));

	//return m[0][0] * DetCof[0]
	//	 + m[0][1] * DetCof[1]
	//	 + m[0][2] * DetCof[2]
	//	 + m[0][3] * DetCof[3];

	return glm_vec4_dot(m[0], DetCof);
}
508 
// Inverse of a 4x4 column-major matrix via the adjugate (cofactor) method:
// out = adj(in) / det(in). Fac0..Fac5 are the shared 2x2 sub-determinants,
// Inv0..Inv3 the signed cofactor columns, and the determinant is the dot of
// column 0 with the gathered first cofactor row. Division uses a full
// _mm_div_ps reciprocal (the faster, less accurate _mm_rcp_ps path is left
// commented out). Behavior for a singular matrix (det == 0) is a division
// by zero producing inf/nan lanes — callers must ensure invertibility.
GLM_FUNC_QUALIFIER void glm_mat4_inverse(glm_vec4 const in[4], glm_vec4 out[4])
{
	__m128 Fac0;
	{
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		// valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		// valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		// valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		// valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		// valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		// valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		// valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		// valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		// valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		// valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		// valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		// valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Alternating cofactor sign masks (note _mm_set_ps lists lanes w,z,y,x).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// Vec0..Vec3: row r of the matrix, with element [1][r] in lane 0 and
	// element [0][r] replicated in lanes 1..3.
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Gather lane 0 of each cofactor column (first row of the cofactor
	// matrix) so the determinant can be formed as a single dot product.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	// valType Determinant = m[0][0] * Inverse[0][0]
	// + m[0][1] * Inverse[1][0]
	// + m[0][2] * Inverse[2][0]
	// + m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	// Exact reciprocal via divide; the approximate rcp alternative is kept
	// below for reference.
	__m128 Rcp0 = _mm_div_ps(_mm_set1_ps(1.0f), Det0);
	//__m128 Rcp0 = _mm_rcp_ps(Det0);

	// Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
729 
// Inverse of a 4x4 column-major matrix via cofactor (adjugate) expansion,
// vectorized with SSE shuffles. "lowp" variant: the final division by the
// determinant uses _mm_rcp_ps, an approximate reciprocal (roughly 12 bits of
// precision), trading accuracy for speed versus the full-precision inverse
// above which uses _mm_div_ps.
// NOTE(review): a singular input (determinant == 0) yields inf/nan lanes
// from the reciprocal — callers must guarantee an invertible matrix.
GLM_FUNC_QUALIFIER void glm_mat4_inverse_lowp(glm_vec4 const in[4], glm_vec4 out[4])
{
	// Fac0..Fac5: the six distinct 2x2 sub-determinants formed from rows 1..3,
	// each block computing four of them at once in a single vector.
	__m128 Fac0;
	{
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor00 = m[2][2] * m[3][3] - m[3][2] * m[2][3];
		//	valType SubFactor06 = m[1][2] * m[3][3] - m[3][2] * m[1][3];
		//	valType SubFactor13 = m[1][2] * m[2][3] - m[2][2] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac0 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac1;
	{
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor01 = m[2][1] * m[3][3] - m[3][1] * m[2][3];
		//	valType SubFactor07 = m[1][1] * m[3][3] - m[3][1] * m[1][3];
		//	valType SubFactor14 = m[1][1] * m[2][3] - m[2][1] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac1 = _mm_sub_ps(Mul00, Mul01);
	}


	__m128 Fac2;
	{
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor02 = m[2][1] * m[3][2] - m[3][1] * m[2][2];
		//	valType SubFactor08 = m[1][1] * m[3][2] - m[3][1] * m[1][2];
		//	valType SubFactor15 = m[1][1] * m[2][2] - m[2][1] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac2 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac3;
	{
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor03 = m[2][0] * m[3][3] - m[3][0] * m[2][3];
		//	valType SubFactor09 = m[1][0] * m[3][3] - m[3][0] * m[1][3];
		//	valType SubFactor16 = m[1][0] * m[2][3] - m[2][0] * m[1][3];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(3, 3, 3, 3));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac3 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac4;
	{
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor04 = m[2][0] * m[3][2] - m[3][0] * m[2][2];
		//	valType SubFactor10 = m[1][0] * m[3][2] - m[3][0] * m[1][2];
		//	valType SubFactor17 = m[1][0] * m[2][2] - m[2][0] * m[1][2];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(2, 2, 2, 2));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac4 = _mm_sub_ps(Mul00, Mul01);
	}

	__m128 Fac5;
	{
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor05 = m[2][0] * m[3][1] - m[3][0] * m[2][1];
		//	valType SubFactor12 = m[1][0] * m[3][1] - m[3][0] * m[1][1];
		//	valType SubFactor18 = m[1][0] * m[2][1] - m[2][0] * m[1][1];

		__m128 Swp0a = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Swp0b = _mm_shuffle_ps(in[3], in[2], _MM_SHUFFLE(0, 0, 0, 0));

		__m128 Swp00 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
		__m128 Swp03 = _mm_shuffle_ps(in[2], in[1], _MM_SHUFFLE(1, 1, 1, 1));

		__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
		__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
		Fac5 = _mm_sub_ps(Mul00, Mul01);
	}

	// Alternating cofactor sign masks (checkerboard pattern of the adjugate).
	__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
	__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

	// Vec0..Vec3: for each matrix component k, broadcast {m[1][k], m[0][k], m[0][k], m[0][k]}.
	// m[1][0]
	// m[0][0]
	// m[0][0]
	// m[0][0]
	__m128 Temp0 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][1]
	// m[0][1]
	// m[0][1]
	// m[0][1]
	__m128 Temp1 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(1, 1, 1, 1));
	__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][2]
	// m[0][2]
	// m[0][2]
	// m[0][2]
	__m128 Temp2 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(2, 2, 2, 2));
	__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

	// m[1][3]
	// m[0][3]
	// m[0][3]
	// m[0][3]
	__m128 Temp3 = _mm_shuffle_ps(in[1], in[0], _MM_SHUFFLE(3, 3, 3, 3));
	__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

	// Inv0..Inv3: the four columns of the adjugate, each built from a signed
	// 3-term cofactor expansion (Vec * Fac products).
	// col0
	// + (Vec1[0] * Fac0[0] - Vec2[0] * Fac1[0] + Vec3[0] * Fac2[0]),
	// - (Vec1[1] * Fac0[1] - Vec2[1] * Fac1[1] + Vec3[1] * Fac2[1]),
	// + (Vec1[2] * Fac0[2] - Vec2[2] * Fac1[2] + Vec3[2] * Fac2[2]),
	// - (Vec1[3] * Fac0[3] - Vec2[3] * Fac1[3] + Vec3[3] * Fac2[3]),
	__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
	__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
	__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
	__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
	__m128 Add00 = _mm_add_ps(Sub00, Mul02);
	__m128 Inv0 = _mm_mul_ps(SignB, Add00);

	// col1
	// - (Vec0[0] * Fac0[0] - Vec2[0] * Fac3[0] + Vec3[0] * Fac4[0]),
	// + (Vec0[0] * Fac0[1] - Vec2[1] * Fac3[1] + Vec3[1] * Fac4[1]),
	// - (Vec0[0] * Fac0[2] - Vec2[2] * Fac3[2] + Vec3[2] * Fac4[2]),
	// + (Vec0[0] * Fac0[3] - Vec2[3] * Fac3[3] + Vec3[3] * Fac4[3]),
	__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
	__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
	__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
	__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
	__m128 Add01 = _mm_add_ps(Sub01, Mul05);
	__m128 Inv1 = _mm_mul_ps(SignA, Add01);

	// col2
	// + (Vec0[0] * Fac1[0] - Vec1[0] * Fac3[0] + Vec3[0] * Fac5[0]),
	// - (Vec0[0] * Fac1[1] - Vec1[1] * Fac3[1] + Vec3[1] * Fac5[1]),
	// + (Vec0[0] * Fac1[2] - Vec1[2] * Fac3[2] + Vec3[2] * Fac5[2]),
	// - (Vec0[0] * Fac1[3] - Vec1[3] * Fac3[3] + Vec3[3] * Fac5[3]),
	__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
	__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
	__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
	__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
	__m128 Add02 = _mm_add_ps(Sub02, Mul08);
	__m128 Inv2 = _mm_mul_ps(SignB, Add02);

	// col3
	// - (Vec1[0] * Fac2[0] - Vec1[0] * Fac4[0] + Vec2[0] * Fac5[0]),
	// + (Vec1[0] * Fac2[1] - Vec1[1] * Fac4[1] + Vec2[1] * Fac5[1]),
	// - (Vec1[0] * Fac2[2] - Vec1[2] * Fac4[2] + Vec2[2] * Fac5[2]),
	// + (Vec1[0] * Fac2[3] - Vec1[3] * Fac4[3] + Vec2[3] * Fac5[3]));
	__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
	__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
	__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
	__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
	__m128 Add03 = _mm_add_ps(Sub03, Mul11);
	__m128 Inv3 = _mm_mul_ps(SignA, Add03);

	// Gather lane 0 of each adjugate column into Row2 = {Inv0[0], Inv1[0], Inv2[0], Inv3[0]},
	// i.e. the first row of the adjugate, needed for the determinant.
	__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

	//	valType Determinant = m[0][0] * Inverse[0][0]
	//						+ m[0][1] * Inverse[1][0]
	//						+ m[0][2] * Inverse[2][0]
	//						+ m[0][3] * Inverse[3][0];
	__m128 Det0 = glm_vec4_dot(in[0], Row2);
	// Approximate reciprocal of the determinant -- this is the "lowp" trade-off.
	__m128 Rcp0 = _mm_rcp_ps(Det0);
	//__m128 Rcp0 = _mm_div_ps(one, Det0);
	//	Inverse /= Determinant;
	out[0] = _mm_mul_ps(Inv0, Rcp0);
	out[1] = _mm_mul_ps(Inv1, Rcp0);
	out[2] = _mm_mul_ps(Inv2, Rcp0);
	out[3] = _mm_mul_ps(Inv3, Rcp0);
}
949 /*
950 GLM_FUNC_QUALIFIER void glm_mat4_rotate(__m128 const in[4], float Angle, float const v[3], __m128 out[4])
951 {
952  float a = glm::radians(Angle);
953  float c = cos(a);
954  float s = sin(a);
955 
956  glm::vec4 AxisA(v[0], v[1], v[2], float(0));
957  __m128 AxisB = _mm_set_ps(AxisA.w, AxisA.z, AxisA.y, AxisA.x);
958  __m128 AxisC = detail::sse_nrm_ps(AxisB);
959 
960  __m128 Cos0 = _mm_set_ss(c);
961  __m128 CosA = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(0, 0, 0, 0));
962  __m128 Sin0 = _mm_set_ss(s);
963  __m128 SinA = _mm_shuffle_ps(Sin0, Sin0, _MM_SHUFFLE(0, 0, 0, 0));
964 
965  // vec<3, T, Q> temp = (valType(1) - c) * axis;
966  __m128 Temp0 = _mm_sub_ps(one, CosA);
967  __m128 Temp1 = _mm_mul_ps(Temp0, AxisC);
968 
969  //Rotate[0][0] = c + temp[0] * axis[0];
970  //Rotate[0][1] = 0 + temp[0] * axis[1] + s * axis[2];
971  //Rotate[0][2] = 0 + temp[0] * axis[2] - s * axis[1];
972  __m128 Axis0 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(0, 0, 0, 0));
973  __m128 TmpA0 = _mm_mul_ps(Axis0, AxisC);
974  __m128 CosA0 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 1, 0));
975  __m128 TmpA1 = _mm_add_ps(CosA0, TmpA0);
976  __m128 SinA0 = SinA;//_mm_set_ps(0.0f, s, -s, 0.0f);
977  __m128 TmpA2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 1, 2, 3));
978  __m128 TmpA3 = _mm_mul_ps(SinA0, TmpA2);
979  __m128 TmpA4 = _mm_add_ps(TmpA1, TmpA3);
980 
981  //Rotate[1][0] = 0 + temp[1] * axis[0] - s * axis[2];
982  //Rotate[1][1] = c + temp[1] * axis[1];
983  //Rotate[1][2] = 0 + temp[1] * axis[2] + s * axis[0];
984  __m128 Axis1 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(1, 1, 1, 1));
985  __m128 TmpB0 = _mm_mul_ps(Axis1, AxisC);
986  __m128 CosA1 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 1, 0, 1));
987  __m128 TmpB1 = _mm_add_ps(CosA1, TmpB0);
988  __m128 SinB0 = SinA;//_mm_set_ps(-s, 0.0f, s, 0.0f);
989  __m128 TmpB2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 0, 3, 2));
990  __m128 TmpB3 = _mm_mul_ps(SinA0, TmpB2);
991  __m128 TmpB4 = _mm_add_ps(TmpB1, TmpB3);
992 
993  //Rotate[2][0] = 0 + temp[2] * axis[0] + s * axis[1];
994  //Rotate[2][1] = 0 + temp[2] * axis[1] - s * axis[0];
995  //Rotate[2][2] = c + temp[2] * axis[2];
996  __m128 Axis2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(2, 2, 2, 2));
997  __m128 TmpC0 = _mm_mul_ps(Axis2, AxisC);
998  __m128 CosA2 = _mm_shuffle_ps(Cos0, Cos0, _MM_SHUFFLE(1, 0, 1, 1));
999  __m128 TmpC1 = _mm_add_ps(CosA2, TmpC0);
1000  __m128 SinC0 = SinA;//_mm_set_ps(s, -s, 0.0f, 0.0f);
1001  __m128 TmpC2 = _mm_shuffle_ps(AxisC, AxisC, _MM_SHUFFLE(3, 3, 0, 1));
1002  __m128 TmpC3 = _mm_mul_ps(SinA0, TmpC2);
1003  __m128 TmpC4 = _mm_add_ps(TmpC1, TmpC3);
1004 
1005  __m128 Result[4];
1006  Result[0] = TmpA4;
1007  Result[1] = TmpB4;
1008  Result[2] = TmpC4;
1009  Result[3] = _mm_set_ps(1, 0, 0, 0);
1010 
1011  //mat<4, 4, valType> Result;
1012  //Result[0] = m[0] * Rotate[0][0] + m[1] * Rotate[0][1] + m[2] * Rotate[0][2];
1013  //Result[1] = m[0] * Rotate[1][0] + m[1] * Rotate[1][1] + m[2] * Rotate[1][2];
1014  //Result[2] = m[0] * Rotate[2][0] + m[1] * Rotate[2][1] + m[2] * Rotate[2][2];
1015  //Result[3] = m[3];
1016  //return Result;
1017  sse_mul_ps(in, Result, out);
1018 }
1019 */
1020 GLM_FUNC_QUALIFIER void glm_mat4_outerProduct(__m128 const& c, __m128 const& r, __m128 out[4])
1021 {
1022  out[0] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(0, 0, 0, 0)));
1023  out[1] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 1, 1, 1)));
1024  out[2] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 2, 2, 2)));
1025  out[3] = _mm_mul_ps(c, _mm_shuffle_ps(r, r, _MM_SHUFFLE(3, 3, 3, 3)));
1026 }
1027 
1028 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT