    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
    element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
    element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
    __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
    outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

    // outer_inner_vec now holds {outer.real, outer.imag, inner.real, inner.imag}
    return outer_inner_vec;
}
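
// Multiplies two packed complex values (one per 128-bit lane): for x = a + b*i and
// u = c + d*i, x*u = (a*c - b*d) + (a*d + b*c)*i. vec3 holds {a*c, b*d}, vec4 holds
// {a*d, -b*c}, so _mm256_hsub_pd(vec3, vec4) yields the real and imaginary parts;
// the final permute gathers the two real parts into the lower lane and the two
// imaginary parts into the upper lane.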
inline __m256d complex_mult_AVX(__m256d input_vec, __m256d unitary_row_vec, __m256d neg) {

    __m256d vec3 = _mm256_mul_pd(input_vec, unitary_row_vec);
    __m256d unitary_row_switched = _mm256_permute_pd(unitary_row_vec, 0x5);
    unitary_row_switched = _mm256_mul_pd(unitary_row_switched, neg);
    __m256d vec4 = _mm256_mul_pd(input_vec, unitary_row_switched);
    __m256d result_vec = _mm256_hsub_pd(vec3, vec4);
    result_vec = _mm256_permute4x64_pd(result_vec, 0b11011000);

    return result_vec;
}
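
// 2-qubit gate kernel applied element-wise to a state vector (one complex amplitude per
// row of input). index_step_inner/index_step_outer are the strides of the two involved
// qubits; every iteration of the innermost loop updates the four amplitudes whose indices
// differ only in those two qubit bits.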
    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;

    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)) {

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)) {

            for (int idx = 0; idx < index_step_inner; idx++) {

                int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

                double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
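
                // Example: with inner_qbit = 1 and outer_qbit = 3 the strides are 2 and 8,
                // so for current_idx = current_idx_inner = idx = 0 the four amplitudes
                // addressed are 0, 2, 8 and 10.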
                double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
                double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
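
                // Pack the amplitude at the "outer" index into the lower 128-bit lane and the
                // amplitude at the "inner" index into the upper lane, as {re, im} pairs.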
                __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
                __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
                element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
                __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
                outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

                __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec, 0b11011000);
                __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
                element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec, 0b11011000);
                __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec, element_inner_pair_vec, 0b0000);
                outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec, 0b11011000);
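
                // Each mult_idx processes one row of the 4x4 kernel: the two complex_mult_AVX
                // calls multiply the row elements with the four gathered amplitudes, and the
                // two horizontal adds reduce them to the row's dot product (real part in
                // result[0], imaginary part in result[2]).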
                for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                    double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                    double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                    __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                    __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                    __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                    __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                    __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                    result_vec = _mm256_hadd_pd(result_vec, result_vec);
                    double* result = (double*)&result_vec;
                    results[mult_idx * 2] = result[0];
                    results[mult_idx * 2 + 1] = result[2];
                input[current_idx_outer_loc].real = results[0];
                input[current_idx_outer_loc].imag = results[1];
                input[current_idx_inner_loc].real = results[2];
                input[current_idx_inner_loc].imag = results[3];
                input[current_idx_outer_pair_loc].real = results[4];
                input[current_idx_outer_pair_loc].imag = results[5];
                input[current_idx_inner_pair_loc].real = results[6];
                input[current_idx_inner_pair_loc].imag = results[7];
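
            // Alternative inner loop: two adjacent amplitudes are handled per iteration
            // (each load picks up amplitudes idx and idx+1, so it needs index_step_inner >= 2).
            // The kernel elements are broadcast to both lanes and the four transformed rows
            // are written back with _mm256_storeu_pd.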
            for (int idx = 0; idx < index_step_inner; idx = idx + 2) {

                int current_idx_outer_loc = current_idx + current_idx_inner + idx;

                double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                double* element_inner = element_outer + 2 * index_step_inner;

                double* element_outer_pair = element_outer + 2 * index_step_outer;
                double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

                double* indices[4] = {element_outer, element_inner, element_outer_pair, element_inner_pair};

                __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

                __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

                __m256d result_vecs[4];
                for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                    double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                    double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                    __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);
                    unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec, 0b11011000);
                    __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b0000);
                    __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b1111);
                    unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec, 0b11011000);
                    unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec, 0b11011000);

                    __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);
                    unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec, 0b11011000);
                    __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b0000);
                    __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b1111);
                    unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec, 0b11011000);
                    unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec, 0b11011000);
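
                    // unitary_col_k_vec now holds kernel element (mult_idx, k) replicated in
                    // both 128-bit lanes, ready to multiply a pair of adjacent amplitudes.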
                    __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                    __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                    unitary_col_0_switched = _mm256_mul_pd(unitary_col_0_switched, neg);
                    __m256d outer_vec_4 = _mm256_mul_pd(element_outer_vec, unitary_col_0_switched);
                    __m256d result_outer_vec = _mm256_hsub_pd(outer_vec_3, outer_vec_4);

                    __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                    __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                    unitary_col_1_switched = _mm256_mul_pd(unitary_col_1_switched, neg);
                    __m256d inner_vec_4 = _mm256_mul_pd(element_inner_vec, unitary_col_1_switched);
                    __m256d result_inner_vec = _mm256_hsub_pd(inner_vec_3, inner_vec_4);

                    __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                    __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                    unitary_col_2_switched = _mm256_mul_pd(unitary_col_2_switched, neg);
                    __m256d outer_pair_vec_4 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_switched);
                    __m256d result_outer_pair_vec = _mm256_hsub_pd(outer_pair_vec_3, outer_pair_vec_4);

                    __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                    __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                    unitary_col_3_switched = _mm256_mul_pd(unitary_col_3_switched, neg);
                    __m256d inner_pair_vec_4 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_switched);
                    __m256d result_inner_pair_vec = _mm256_hsub_pd(inner_pair_vec_3, inner_pair_vec_4);

                    __m256d result_vec = _mm256_add_pd(result_outer_vec, result_inner_vec);
                    result_vec = _mm256_add_pd(result_vec, result_outer_pair_vec);
                    result_vec = _mm256_add_pd(result_vec, result_inner_pair_vec);
                    result_vecs[mult_idx] = result_vec;
                for (int row_idx = 0; row_idx < 4; row_idx++) {
                    _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
                }

        current_idx = current_idx + (index_step_outer << 1);
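
// Variant for multi-column (matrix) input: the same four basis-row indices are selected,
// and the 4x4 kernel is applied independently to every column of those rows; the column
// loop is parallelised with OpenMP.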
    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;

    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)) {

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)) {

            for (int idx = 0; idx < index_step_inner; idx++) {

                int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

                int row_offset_outer = current_idx_outer_loc * input.cols;
                int row_offset_inner = current_idx_inner_loc * input.cols;
                int row_offset_outer_pair = current_idx_outer_pair_loc * input.cols;
                int row_offset_inner_pair = current_idx_inner_pair_loc * input.cols;

                #pragma omp parallel for
                for (int col_idx = 0; col_idx < input.cols; col_idx++) {

                    int current_idx_outer = row_offset_outer + col_idx;
                    int current_idx_inner = row_offset_inner + col_idx;
                    int current_idx_outer_pair = row_offset_outer_pair + col_idx;
                    int current_idx_inner_pair = row_offset_inner_pair + col_idx;
                    double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};

                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer;
                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner;
                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair;
                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair;

                    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                    element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
                    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
                    element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
                    __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
                    outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

                    __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                    element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec, 0b11011000);
                    __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
                    element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec, 0b11011000);
                    __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec, element_inner_pair_vec, 0b0000);
                    outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec, 0b11011000);
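
                    // Same row-times-amplitude reduction as in the state-vector kernel,
                    // applied to this column's four elements.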
                    for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                        double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                        double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                        __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                        result_vec = _mm256_hadd_pd(result_vec, result_vec);
                        double* result = (double*)&result_vec;
                        results[mult_idx * 2] = result[0];
                        results[mult_idx * 2 + 1] = result[2];
                    input[current_idx_outer].real = results[0];
                    input[current_idx_outer].imag = results[1];
                    input[current_idx_inner].real = results[2];
                    input[current_idx_inner].imag = results[3];
                    input[current_idx_outer_pair].real = results[4];
                    input[current_idx_outer_pair].imag = results[5];
                    input[current_idx_inner_pair].real = results[6];
                    input[current_idx_inner_pair].imag = results[7];

        current_idx = current_idx + (index_step_outer << 1);
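
// TBB-parallelised 2-qubit kernel: the pair-block loop and the inner-block loop are turned
// into nested tbb::parallel_for ranges; the grain sizes decide how finely those ranges are
// split into tasks.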
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int inner_qbit = involved_qbits[0];
    int outer_qbit = involved_qbits[1];

    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;

    int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
    int parallel_inner_cycles = index_step_outer / (index_step_inner << 1);
    int outer_grain_size;
    if ( index_step_outer <= 2 ) {
        outer_grain_size = 64;
    }
    else if ( index_step_outer <= 4 ) {
        outer_grain_size = 32;
    }
    else if ( index_step_outer <= 8 ) {
        outer_grain_size = 16;
    }
    else if ( index_step_outer <= 16 ) {
        outer_grain_size = 8;
    }
    else {
        outer_grain_size = 2;
    }

    outer_grain_size = 6000000;
    int inner_grain_size = 64000000;
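
    // Note: with grain sizes this large the blocked ranges are typically never split,
    // so each parallel_for usually runs its whole range as a single task.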
    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin() * (index_step_outer << 1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_inner = r.begin() * (index_step_inner << 1);

                for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {
                    for (int idx = 0; idx < index_step_inner; ++idx) {

                        int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                        int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                        int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                        int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
                        double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};

                        double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                        double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
                        double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                        double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                        __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);

                        __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                        __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

                        for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                            double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                            double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                            __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                            __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                            __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                            __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                            __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                            result_vec = _mm256_hadd_pd(result_vec, result_vec);
                            double* result = (double*)&result_vec;
                            results[mult_idx * 2] = result[0];
                            results[mult_idx * 2 + 1] = result[2];
                        input[current_idx_outer_loc].real = results[0];
                        input[current_idx_outer_loc].imag = results[1];
                        input[current_idx_inner_loc].real = results[2];
                        input[current_idx_inner_loc].imag = results[3];
                        input[current_idx_outer_pair_loc].real = results[4];
                        input[current_idx_outer_pair_loc].imag = results[5];
                        input[current_idx_inner_pair_loc].real = results[6];
                        input[current_idx_inner_pair_loc].imag = results[7];
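
                    // Alternative inner loop: two adjacent amplitudes per iteration (requires
                    // index_step_inner >= 2), with broadcast kernel elements and vectorised
                    // write-back, as in the serial kernel above.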
                    for (int alt_idx = 0; alt_idx < index_step_inner / 2; ++alt_idx) {

                        int idx = alt_idx * 2;  // two amplitudes are processed per iteration
                        int current_idx_outer_loc = current_idx + current_idx_inner + idx;

                        double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                        double* element_inner = element_outer + 2 * index_step_inner;

                        double* element_outer_pair = element_outer + 2 * index_step_outer;
                        double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

                        double* indices[4] = {element_outer, element_inner, element_outer_pair, element_inner_pair};
                        __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                        __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

                        __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                        __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

                        __m256d result_vecs[4];

                        for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                            double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                            double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                            __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);
                            unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec, 0b11011000);
                            __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b0000);
                            __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b1111);
                            unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec, 0b11011000);
                            unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec, 0b11011000);

                            __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);
                            unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec, 0b11011000);
                            __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b0000);
                            __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b1111);
                            unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec, 0b11011000);
                            unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec, 0b11011000);
                            __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                            __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                            unitary_col_0_switched = _mm256_mul_pd(unitary_col_0_switched, neg);
                            __m256d outer_vec_4 = _mm256_mul_pd(element_outer_vec, unitary_col_0_switched);
                            __m256d result_outer_vec = _mm256_hsub_pd(outer_vec_3, outer_vec_4);

                            __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                            __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                            unitary_col_1_switched = _mm256_mul_pd(unitary_col_1_switched, neg);
                            __m256d inner_vec_4 = _mm256_mul_pd(element_inner_vec, unitary_col_1_switched);
                            __m256d result_inner_vec = _mm256_hsub_pd(inner_vec_3, inner_vec_4);

                            __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                            __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                            unitary_col_2_switched = _mm256_mul_pd(unitary_col_2_switched, neg);
                            __m256d outer_pair_vec_4 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_switched);
                            __m256d result_outer_pair_vec = _mm256_hsub_pd(outer_pair_vec_3, outer_pair_vec_4);

                            __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                            __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                            unitary_col_3_switched = _mm256_mul_pd(unitary_col_3_switched, neg);
                            __m256d inner_pair_vec_4 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_switched);
                            __m256d result_inner_pair_vec = _mm256_hsub_pd(inner_pair_vec_3, inner_pair_vec_4);

                            __m256d result_vec = _mm256_add_pd(result_outer_vec, result_inner_vec);
                            result_vec = _mm256_add_pd(result_vec, result_outer_pair_vec);
                            result_vec = _mm256_add_pd(result_vec, result_inner_pair_vec);
                            result_vecs[mult_idx] = result_vec;
                        for (int row_idx = 0; row_idx < 4; row_idx++) {
                            _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
                        }

                current_idx_inner = current_idx_inner + (index_step_inner << 1);

            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
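
// TBB-parallelised 3-qubit kernel: the state is decomposed along the outer, middle and
// inner qubit strides, each level getting its own tbb::parallel_for; every innermost
// iteration gathers 8 amplitudes, applies the 8x8 kernel row by row and writes the
// results back in place.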
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int index_step_inner = 1 << involved_qbits[0];
    int index_step_middle = 1 << involved_qbits[1];
    int index_step_outer = 1 << involved_qbits[2];

    int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
    int parallel_inner_cycles = index_step_middle / (index_step_inner << 1);
    int parallel_middle_cycles = index_step_outer / (index_step_middle << 1);
    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin() * (index_step_outer << 1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle_cycles, middle_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_middle = r.begin() * (index_step_middle << 1);

                for (int middle_rdx = r.begin(); middle_rdx < r.end(); middle_rdx++) {

                    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                        int current_idx_inner = r.begin() * (index_step_inner << 1);

                        for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {

                            tbb::parallel_for(tbb::blocked_range<int>(0, index_step_inner, 64), [&](tbb::blocked_range<int> r) {

                                for (int idx = r.begin(); idx < r.end(); ++idx) {
                                    int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
                                    int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                                    int current_idx_outer_loc = current_idx_loc;
                                    int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                    int current_idx_middle_loc = current_idx_loc + index_step_middle;
                                    int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                                    int current_idx_outer_pair_loc = current_idx_pair_loc;
                                    int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                    int current_idx_middle_pair_loc = current_idx_pair_loc + index_step_middle;
                                    int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;
                                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                    double* element_middle = (double*)input.get_data() + 2 * current_idx_middle_loc;
                                    double* element_middle_inner = (double*)input.get_data() + 2 * current_idx_middle_inner_loc;

                                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                    double* element_middle_pair = (double*)input.get_data() + 2 * current_idx_middle_pair_loc;
                                    double* element_middle_inner_pair = (double*)input.get_data() + 2 * current_idx_middle_inner_pair_loc;
                                    __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                    __m256d middle_inner_vec = get_AVX_vector(element_middle, element_middle_inner);
                                    __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                    __m256d middle_inner_pair_vec = get_AVX_vector(element_middle_pair, element_middle_inner_pair);
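
                                    // Each of the 8 output amplitudes is the dot product of one
                                    // kernel row with the 8 gathered amplitudes; the four partial
                                    // complex products are reduced by a tree of _mm256_hadd_pd
                                    // operations.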
                                    QGD_Complex16 results[8];

                                    for (int mult_idx = 0; mult_idx < 8; mult_idx++) {

                                        double* unitary_row_01 = (double*)unitary.get_data() + 16 * mult_idx;
                                        double* unitary_row_23 = (double*)unitary.get_data() + 16 * mult_idx + 4;
                                        double* unitary_row_45 = (double*)unitary.get_data() + 16 * mult_idx + 8;
                                        double* unitary_row_67 = (double*)unitary.get_data() + 16 * mult_idx + 12;

                                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
                                        __m256d unitary_row_45_vec = _mm256_loadu_pd(unitary_row_45);
                                        __m256d unitary_row_67_vec = _mm256_loadu_pd(unitary_row_67);
                                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                                        __m256d result_upper_middle_vec = complex_mult_AVX(middle_inner_vec, unitary_row_23_vec, neg);
                                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_45_vec, neg);
                                        __m256d result_lower_middle_vec = complex_mult_AVX(middle_inner_pair_vec, unitary_row_67_vec, neg);

                                        __m256d result1_vec = _mm256_hadd_pd(result_upper_vec, result_upper_middle_vec);
                                        __m256d result2_vec = _mm256_hadd_pd(result_lower_vec, result_lower_middle_vec);
                                        __m256d result_vec = _mm256_hadd_pd(result1_vec, result2_vec);
                                        result_vec = _mm256_hadd_pd(result_vec, result_vec);
                                        double* result = (double*)&result_vec;
                                        results[mult_idx].real = result[0];
                                        results[mult_idx].imag = result[2];
                                    input[current_idx_outer_loc] = results[0];
                                    input[current_idx_inner_loc] = results[1];
                                    input[current_idx_middle_loc] = results[2];
                                    input[current_idx_middle_inner_loc] = results[3];
                                    input[current_idx_outer_pair_loc] = results[4];
                                    input[current_idx_inner_pair_loc] = results[5];
                                    input[current_idx_middle_pair_loc] = results[6];
                                    input[current_idx_middle_inner_pair_loc] = results[7];

                            current_idx_inner = current_idx_inner + (index_step_inner << 1);

                    current_idx_middle = current_idx_middle + (index_step_middle << 1);

            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
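
// TBB-parallelised 4-qubit kernel: the same scheme with one more decomposition level;
// each innermost iteration gathers 16 amplitudes and applies the 16x16 kernel row by row.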
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int index_step_inner = 1 << involved_qbits[0];
    int index_step_middle1 = 1 << involved_qbits[1];
    int index_step_middle2 = 1 << involved_qbits[2];
    int index_step_outer = 1 << involved_qbits[3];

    int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
    int parallel_inner_cycles = index_step_middle1 / (index_step_inner << 1);
    int parallel_middle1_cycles = index_step_middle2 / (index_step_middle1 << 1);
    int parallel_middle2_cycles = index_step_outer / (index_step_middle2 << 1);
    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin() * (index_step_outer << 1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle2_cycles, middle2_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_middle2 = r.begin() * (index_step_middle2 << 1);

                for (int middle2_rdx = r.begin(); middle2_rdx < r.end(); middle2_rdx++) {

                    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle1_cycles, middle1_grain_size), [&](tbb::blocked_range<int> r) {

                        int current_idx_middle1 = r.begin() * (index_step_middle1 << 1);

                        for (int middle1_rdx = r.begin(); middle1_rdx < r.end(); middle1_rdx++) {

                            tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                                int current_idx_inner = r.begin() * (index_step_inner << 1);

                                for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {

                                    tbb::parallel_for(tbb::blocked_range<int>(0, index_step_inner, 64), [&](tbb::blocked_range<int> r) {

                                        for (int idx = r.begin(); idx < r.end(); ++idx) {
                                            int current_idx_loc = current_idx + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;
                                            int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;

                                            int current_idx_outer_loc = current_idx_loc;
                                            int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                            int current_idx_middle1_loc = current_idx_loc + index_step_middle1;
                                            int current_idx_middle1_inner_loc = current_idx_loc + index_step_middle1 + index_step_inner;

                                            int current_idx_middle2_loc = current_idx_loc + index_step_middle2;
                                            int current_idx_middle2_inner_loc = current_idx_loc + index_step_middle2 + index_step_inner;

                                            int current_idx_middle12_loc = current_idx_loc + index_step_middle1 + index_step_middle2;
                                            int current_idx_middle12_inner_loc = current_idx_loc + index_step_middle1 + index_step_middle2 + index_step_inner;

                                            int current_idx_outer_pair_loc = current_idx_pair_loc;
                                            int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                            int current_idx_middle1_pair_loc = current_idx_pair_loc + index_step_middle1;
                                            int current_idx_middle1_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_inner;

                                            int current_idx_middle2_pair_loc = current_idx_pair_loc + index_step_middle2;
                                            int current_idx_middle2_inner_pair_loc = current_idx_pair_loc + index_step_middle2 + index_step_inner;

                                            int current_idx_middle12_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2;
                                            int current_idx_middle12_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2 + index_step_inner;
                                            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                            double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                            double* element_middle1 = (double*)input.get_data() + 2 * current_idx_middle1_loc;
                                            double* element_middle1_inner = (double*)input.get_data() + 2 * current_idx_middle1_inner_loc;

                                            double* element_middle2 = (double*)input.get_data() + 2 * current_idx_middle2_loc;
                                            double* element_middle2_inner = (double*)input.get_data() + 2 * current_idx_middle2_inner_loc;

                                            double* element_middle12 = (double*)input.get_data() + 2 * current_idx_middle12_loc;
                                            double* element_middle12_inner = (double*)input.get_data() + 2 * current_idx_middle12_inner_loc;

                                            double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                            double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                            double* element_middle1_pair = (double*)input.get_data() + 2 * current_idx_middle1_pair_loc;
                                            double* element_middle1_inner_pair = (double*)input.get_data() + 2 * current_idx_middle1_inner_pair_loc;

                                            double* element_middle2_pair = (double*)input.get_data() + 2 * current_idx_middle2_pair_loc;
                                            double* element_middle2_inner_pair = (double*)input.get_data() + 2 * current_idx_middle2_inner_pair_loc;

                                            double* element_middle12_pair = (double*)input.get_data() + 2 * current_idx_middle12_pair_loc;
                                            double* element_middle12_inner_pair = (double*)input.get_data() + 2 * current_idx_middle12_inner_pair_loc;
                                            __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                            __m256d middle1_inner_vec = get_AVX_vector(element_middle1, element_middle1_inner);
                                            __m256d middle2_inner_vec = get_AVX_vector(element_middle2, element_middle2_inner);
                                            __m256d middle12_inner_vec = get_AVX_vector(element_middle12, element_middle12_inner);
                                            __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                            __m256d middle1_inner_pair_vec = get_AVX_vector(element_middle1_pair, element_middle1_inner_pair);
                                            __m256d middle2_inner_pair_vec = get_AVX_vector(element_middle2_pair, element_middle2_inner_pair);
                                            __m256d middle12_inner_pair_vec = get_AVX_vector(element_middle12_pair, element_middle12_inner_pair);
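
                                            // Row-by-row application of the 16x16 kernel: eight
                                            // complex_mult_AVX calls per row, reduced by a tree of
                                            // _mm256_hadd_pd operations.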
                                            QGD_Complex16 results[16];

                                            for (int mult_idx = 0; mult_idx < 16; mult_idx++) {

                                                double* unitary_row_1 = (double*)unitary.get_data() + 32 * mult_idx;
                                                double* unitary_row_2 = (double*)unitary.get_data() + 32 * mult_idx + 4;
                                                double* unitary_row_3 = (double*)unitary.get_data() + 32 * mult_idx + 8;
                                                double* unitary_row_4 = (double*)unitary.get_data() + 32 * mult_idx + 12;
                                                double* unitary_row_5 = (double*)unitary.get_data() + 32 * mult_idx + 16;
                                                double* unitary_row_6 = (double*)unitary.get_data() + 32 * mult_idx + 20;
                                                double* unitary_row_7 = (double*)unitary.get_data() + 32 * mult_idx + 24;
                                                double* unitary_row_8 = (double*)unitary.get_data() + 32 * mult_idx + 28;

                                                __m256d unitary_row_1_vec = _mm256_loadu_pd(unitary_row_1);
                                                __m256d unitary_row_2_vec = _mm256_loadu_pd(unitary_row_2);
                                                __m256d unitary_row_3_vec = _mm256_loadu_pd(unitary_row_3);
                                                __m256d unitary_row_4_vec = _mm256_loadu_pd(unitary_row_4);
                                                __m256d unitary_row_5_vec = _mm256_loadu_pd(unitary_row_5);
                                                __m256d unitary_row_6_vec = _mm256_loadu_pd(unitary_row_6);
                                                __m256d unitary_row_7_vec = _mm256_loadu_pd(unitary_row_7);
                                                __m256d unitary_row_8_vec = _mm256_loadu_pd(unitary_row_8);
                                                __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_1_vec, neg);
                                                __m256d result_upper_middle1_vec = complex_mult_AVX(middle1_inner_vec, unitary_row_2_vec, neg);
                                                __m256d result_upper_middle2_vec = complex_mult_AVX(middle2_inner_vec, unitary_row_3_vec, neg);
                                                __m256d result_upper_middle12_vec = complex_mult_AVX(middle12_inner_vec, unitary_row_4_vec, neg);
                                                __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_5_vec, neg);
                                                __m256d result_lower_middle1_vec = complex_mult_AVX(middle1_inner_pair_vec, unitary_row_6_vec, neg);
                                                __m256d result_lower_middle2_vec = complex_mult_AVX(middle2_inner_pair_vec, unitary_row_7_vec, neg);
                                                __m256d result_lower_middle12_vec = complex_mult_AVX(middle12_inner_pair_vec, unitary_row_8_vec, neg);

                                                __m256d result1_vec = _mm256_hadd_pd(result_upper_vec, result_upper_middle1_vec);
                                                __m256d result2_vec = _mm256_hadd_pd(result_upper_middle2_vec, result_upper_middle12_vec);
                                                __m256d result3_vec = _mm256_hadd_pd(result_lower_vec, result_lower_middle1_vec);
                                                __m256d result4_vec = _mm256_hadd_pd(result_lower_middle2_vec, result_lower_middle12_vec);
                                                __m256d result5_vec = _mm256_hadd_pd(result1_vec, result2_vec);
                                                __m256d result6_vec = _mm256_hadd_pd(result3_vec, result4_vec);
                                                __m256d result_vec = _mm256_hadd_pd(result5_vec, result6_vec);
                                                result_vec = _mm256_hadd_pd(result_vec, result_vec);
                                                double* result = (double*)&result_vec;
                                                results[mult_idx].real = result[0];
                                                results[mult_idx].imag = result[2];
                                            input[current_idx_outer_loc] = results[0];
                                            input[current_idx_inner_loc] = results[1];
                                            input[current_idx_middle1_loc] = results[2];
                                            input[current_idx_middle1_inner_loc] = results[3];
                                            input[current_idx_middle2_loc] = results[4];
                                            input[current_idx_middle2_inner_loc] = results[5];
                                            input[current_idx_middle12_loc] = results[6];
                                            input[current_idx_middle12_inner_loc] = results[7];
                                            input[current_idx_outer_pair_loc] = results[8];
                                            input[current_idx_inner_pair_loc] = results[9];
                                            input[current_idx_middle1_pair_loc] = results[10];
                                            input[current_idx_middle1_inner_pair_loc] = results[11];
                                            input[current_idx_middle2_pair_loc] = results[12];
                                            input[current_idx_middle2_inner_pair_loc] = results[13];
                                            input[current_idx_middle12_pair_loc] = results[14];
                                            input[current_idx_middle12_inner_pair_loc] = results[15];

                                    current_idx_inner = current_idx_inner + (index_step_inner << 1);

                            current_idx_middle1 = current_idx_middle1 + (index_step_middle1 << 1);

                    current_idx_middle2 = current_idx_middle2 + (index_step_middle2 << 1);

            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
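
// Controlled kernel applied to the rows of a matrix: the elements of the two single-qubit
// unitaries u3_1qbit1 and u3_1qbit2 are broadcast once, then each (row, pair-row) couple is
// transformed column-wise, with real and imaginary parts kept in separate registers so the
// complex products can use FMA instructions.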
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);

    __m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
    __m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
    __m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
    __m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
    __m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
    __m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
    __m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
    __m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);
    int parallel_outer_cycles = matrix_size / (index_step_target << 1);
    int outer_grain_size;
    if ( index_step_target <= 2 ) {
        outer_grain_size = 32;
    }
    else if ( index_step_target <= 4 ) {
        outer_grain_size = 16;
    }
    else if ( index_step_target <= 8 ) {
        outer_grain_size = 8;
    }
    else if ( index_step_target <= 16 ) {
        outer_grain_size = 4;
    }
    else {
        outer_grain_size = 2;
    }
    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin() * (index_step_target << 1);
        int current_idx_pair = index_step_target + r.begin() * (index_step_target << 1);

        for (int rdx = r.begin(); rdx < r.end(); rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0, index_step_target, 32), [&](tbb::blocked_range<int> r) {

                for (int idx = r.begin(); idx < r.end(); ++idx) {

                    int current_idx_loc = current_idx + idx;
                    int current_idx_pair_loc = current_idx_pair + idx;

                    int row_offset = current_idx_loc * input.stride;
                    int row_offset_pair = current_idx_pair_loc * input.stride;
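
                    // Rows whose control bit is set are transformed with u3_1qbit1; the second,
                    // analogous block below applies u3_1qbit2 to the remaining rows.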
                    if ((current_idx_loc >> control_qbit) & 1) {

                        double* element = (double*)input.get_data() + 2 * row_offset;
                        double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {
                            __m256d element_vec = _mm256_load_pd(element + col_idx);
                            __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                            __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                            element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                            element_vec = tmp;  // element_vec: real parts, element_vec2: imaginary parts

                            __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                            __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                            tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                            element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                            element_pair_vec = tmp;

                            __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                            vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                            __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                            vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                            vec3 = _mm256_add_pd(vec3, vec4);
                            __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                            vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                            __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                            vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                            vec5 = _mm256_add_pd(vec5, vec6);

                            tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                            vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                            vec3 = tmp;  // re-interleave {re, im} before storing

                            _mm256_store_pd(element + col_idx, vec3);
                            _mm256_store_pd(element + col_idx + 4, vec5);
                            __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                            vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                            __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                            vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                            vec7 = _mm256_add_pd(vec7, vec8);
                            __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                            vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                            __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                            vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                            vec9 = _mm256_add_pd(vec9, vec10);

                            tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                            vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                            vec7 = tmp;

                            _mm256_store_pd(element_pair + col_idx, vec7);
                            _mm256_store_pd(element_pair + col_idx + 4, vec9);
                        int remainder = input.cols % 4;
                        if (remainder != 0) {

                            // scalar tail for the columns not covered by the vectorised loop
                            for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {
                                int index = row_offset + col_idx;
                                int index_pair = row_offset_pair + col_idx;

                                QGD_Complex16 element = input[index];
                                QGD_Complex16 element_pair = input[index_pair];

                                QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                                QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                                input[index].real = tmp1.real + tmp2.real;
                                input[index].imag = tmp1.imag + tmp2.imag;

                                tmp1 = mult(u3_1qbit1[2], element);
                                tmp2 = mult(u3_1qbit1[3], element_pair);

                                input[index_pair].real = tmp1.real + tmp2.real;
                                input[index_pair].imag = tmp1.imag + tmp2.imag;
                        double* element = (double*)input.get_data() + 2 * row_offset;
                        double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                            __m256d element_vec = _mm256_load_pd(element + col_idx);
                            __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                            __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                            element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                            element_vec = tmp;

                            __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                            __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                            tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                            element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                            element_pair_vec = tmp;

                            __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                            vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                            __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                            vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                            vec3 = _mm256_add_pd(vec3, vec4);
                            __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                            vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                            __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                            vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                            vec5 = _mm256_add_pd(vec5, vec6);

                            tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                            vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                            vec3 = tmp;

                            _mm256_store_pd(element + col_idx, vec3);
                            _mm256_store_pd(element + col_idx + 4, vec5);
                            __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                            vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                            __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                            vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                            vec7 = _mm256_add_pd(vec7, vec8);
                            __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                            vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                            __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                            vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                            vec9 = _mm256_add_pd(vec9, vec10);

                            tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                            vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                            vec7 = tmp;

                            _mm256_store_pd(element_pair + col_idx, vec7);
                            _mm256_store_pd(element_pair + col_idx + 4, vec9);
                        int remainder = input.cols % 4;
                        if (remainder != 0) {

                            // scalar tail for the columns not covered by the vectorised loop
                            for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {
                                int index = row_offset + col_idx;
                                int index_pair = row_offset_pair + col_idx;

                                QGD_Complex16 element = input[index];
                                QGD_Complex16 element_pair = input[index_pair];

                                QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                                QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                                input[index].real = tmp1.real + tmp2.real;
                                input[index].imag = tmp1.imag + tmp2.imag;

                                tmp1 = mult(u3_1qbit2[2], element);
                                tmp2 = mult(u3_1qbit2[3], element_pair);

                                input[index_pair].real = tmp1.real + tmp2.real;
                                input[index_pair].imag = tmp1.imag + tmp2.imag;

            current_idx = current_idx + (index_step_target << 1);
            current_idx_pair = current_idx_pair + (index_step_target << 1);

// Referenced types and helpers:
//   QGD_Complex16                        complex number type of the SQUANDER package (double real, double imag)
//   scalar* get_data() const             pointer to the stored data of the complex-array container
//   int rows, int cols                   number of rows / columns of the array
//   int stride                           column stride of the array: a row is stored as a_0, a_1, ... a_{cols-1}
//                                        followed by stride-cols zeros
//   QGD_Complex16 mult(QGD_Complex16& a, QGD_Complex16& b)   product of two complex scalars