__m256d element_outer_vec = _mm256_loadu_pd(element_outer);
element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
__m256d element_inner_vec = _mm256_loadu_pd(element_inner);
element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
__m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

return outer_inner_vec;
}
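// Illustrative note (not part of the original source): the sequence above packs the
// complex value at element_outer into the lower 128 bits and the complex value at
// element_inner into the upper 128 bits of one __m256d, i.e. the returned lanes are
// [outer.real, outer.imag, inner.real, inner.imag]. A sketch of an equivalent packing
// with plain 128-bit loads (hypothetical helper name, shown only for comparison):
//
//   static inline __m256d pack_two_complex(const double* outer, const double* inner) {
//       __m256d lo = _mm256_castpd128_pd256(_mm_loadu_pd(outer));   // [outer.re, outer.im, -, -]
//       return _mm256_insertf128_pd(lo, _mm_loadu_pd(inner), 1);    // [outer.re, outer.im, inner.re, inner.im]
//   }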
inline __m256d complex_mult_AVX(__m256d input_vec, __m256d unitary_row_vec, __m256d neg) {

    // element-wise products of the interleaved (re, im) operands
    __m256d vec3 = _mm256_mul_pd(input_vec, unitary_row_vec);

    // swap the real and imaginary parts of each unitary entry within its 128-bit lane,
    // then flip the sign of the lane that now holds the real part
    __m256d unitary_row_switched = _mm256_permute_pd(unitary_row_vec, 0x5);
    unitary_row_switched = _mm256_mul_pd(unitary_row_switched, neg);
    __m256d vec4 = _mm256_mul_pd(input_vec, unitary_row_switched);

    // horizontal subtraction assembles the real and imaginary parts of both complex products
    __m256d result_vec = _mm256_hsub_pd(vec3, vec4);
    result_vec = _mm256_permute4x64_pd(result_vec, 0b11011000);

    return result_vec;
}
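// Illustrative note (assuming the lane layout produced above: input_vec holds two complex
// amplitudes [a.re, a.im, b.re, b.im] and unitary_row_vec holds two unitary entries
// [u0.re, u0.im, u1.re, u1.im]): complex_mult_AVX returns the two complex products with
// their real parts in the lower 128 bits and their imaginary parts in the upper 128 bits,
// i.e. [Re(a*u0), Re(b*u1), Im(a*u0), Im(b*u1)]. The scalar arithmetic carried out per
// product is ordinary complex multiplication (hypothetical helper, shown for reference):
//
//   // c = a * u for interleaved (re, im) doubles
//   static inline void complex_mult_scalar(const double a[2], const double u[2], double c[2]) {
//       c[0] = a[0] * u[0] - a[1] * u[1];   // real part      (the _mm256_hsub_pd lanes built from vec3)
//       c[1] = a[0] * u[1] + a[1] * u[0];   // imaginary part (the _mm256_hsub_pd lanes built from vec4)
//   }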
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

__m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)) {

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)) {

        for (int idx = 0; idx < index_step_inner; idx++) {

            int current_idx_outer_loc = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

            double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
            double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
            double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
            double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
            __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
            element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
            __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
            element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
            __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
            outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

            __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
            element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec, 0b11011000);
            __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
            element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec, 0b11011000);
            __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec, element_inner_pair_vec, 0b0000);
            outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec, 0b11011000);
            for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                result_vec = _mm256_hadd_pd(result_vec, result_vec);
                double* result = (double*)&result_vec;
                results[mult_idx * 2] = result[0];
                results[mult_idx * 2 + 1] = result[2];
            }
            input[current_idx_outer_loc].real = results[0];
            input[current_idx_outer_loc].imag = results[1];
            input[current_idx_inner_loc].real = results[2];
            input[current_idx_inner_loc].imag = results[3];
            input[current_idx_outer_pair_loc].real = results[4];
            input[current_idx_outer_pair_loc].imag = results[5];
            input[current_idx_inner_pair_loc].real = results[6];
            input[current_idx_inner_pair_loc].imag = results[7];
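// Illustrative note (not part of the original source): the mult_idx loop above is a 4x4
// complex matrix-vector product. The four amplitudes addressed by current_idx_outer_loc,
// current_idx_inner_loc, current_idx_outer_pair_loc and current_idx_inner_pair_loc differ
// only in the bits at positions inner_qbit and outer_qbit, and for each row of
// two_qbit_unitary the pair of _mm256_hadd_pd calls collapses the four partial complex
// products so that the real part of the row result lands in lane 0 and the imaginary part
// in lane 2. A scalar reference of the same update (U, psi and new_psi are hypothetical
// names used only for this sketch):
//
//   // new_psi[r] = sum_c U[r][c] * psi[c]   with psi = (outer, inner, outer_pair, inner_pair)
//   for (int r = 0; r < 4; r++) {
//       QGD_Complex16 acc = {0.0, 0.0};
//       for (int c = 0; c < 4; c++) {
//           acc.real += U[r][c].real * psi[c].real - U[r][c].imag * psi[c].imag;
//           acc.imag += U[r][c].real * psi[c].imag + U[r][c].imag * psi[c].real;
//       }
//       new_psi[r] = acc;
//   }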
        for (int idx = 0; idx < index_step_inner; idx = idx + 2) {

            int current_idx_outer_loc = current_idx + current_idx_inner + idx;

            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
            double* element_inner = element_outer + 2 * index_step_inner;

            double* element_outer_pair = element_outer + 2 * index_step_outer;
            double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

            double* indices[4] = {element_outer, element_inner, element_outer_pair, element_inner_pair};

            __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
            __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

            __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
            __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

            __m256d result_vecs[4];
            for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);
                unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec, 0b11011000);
                __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b0000);
                __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b1111);
                unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec, 0b11011000);
                unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec, 0b11011000);

                __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);
                unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec, 0b11011000);
                __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b0000);
                __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b1111);
                unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec, 0b11011000);
                unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec, 0b11011000);
                __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                unitary_col_0_switched = _mm256_mul_pd(unitary_col_0_switched, neg);
                __m256d outer_vec_4 = _mm256_mul_pd(element_outer_vec, unitary_col_0_switched);
                __m256d result_outer_vec = _mm256_hsub_pd(outer_vec_3, outer_vec_4);

                __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                unitary_col_1_switched = _mm256_mul_pd(unitary_col_1_switched, neg);
                __m256d inner_vec_4 = _mm256_mul_pd(element_inner_vec, unitary_col_1_switched);
                __m256d result_inner_vec = _mm256_hsub_pd(inner_vec_3, inner_vec_4);

                __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                unitary_col_2_switched = _mm256_mul_pd(unitary_col_2_switched, neg);
                __m256d outer_pair_vec_4 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_switched);
                __m256d result_outer_pair_vec = _mm256_hsub_pd(outer_pair_vec_3, outer_pair_vec_4);

                __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                unitary_col_3_switched = _mm256_mul_pd(unitary_col_3_switched, neg);
                __m256d inner_pair_vec_4 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_switched);
                __m256d result_inner_pair_vec = _mm256_hsub_pd(inner_pair_vec_3, inner_pair_vec_4);

                __m256d result_vec = _mm256_add_pd(result_outer_vec, result_inner_vec);
                result_vec = _mm256_add_pd(result_vec, result_outer_pair_vec);
                result_vec = _mm256_add_pd(result_vec, result_inner_pair_vec);
                result_vecs[mult_idx] = result_vec;
            }
            for (int row_idx = 0; row_idx < 4; row_idx++) {
                _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
            }

    current_idx = current_idx + (index_step_outer << 1);
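// Illustrative note (not part of the original source): this branch is taken when two
// neighbouring amplitudes can be processed per register. Each entry of the current
// unitary row is duplicated into both 128-bit halves of a __m256d, so one mul/hsub pair
// yields entry * amplitude for two adjacent state-vector elements at once; the four
// contributions are then summed with _mm256_add_pd and written back in place with
// _mm256_storeu_pd. A shorter way to build such a duplicated entry (sketch only,
// assuming u points to an interleaved {re, im} pair) would be:
//
//   __m256d u_dup = _mm256_broadcast_pd((const __m128d*)u);   // [u.re, u.im, u.re, u.im]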
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

__m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)) {

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)) {

        for (int idx = 0; idx < index_step_inner; idx++) {

            int current_idx_outer_loc = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

            int row_offset_outer = current_idx_outer_loc * input.cols;
            int row_offset_inner = current_idx_inner_loc * input.cols;
            int row_offset_outer_pair = current_idx_outer_pair_loc * input.cols;
            int row_offset_inner_pair = current_idx_inner_pair_loc * input.cols;

            #pragma omp parallel for
            for (int col_idx = 0; col_idx < input.cols; col_idx++) {

                int current_idx_outer = row_offset_outer + col_idx;
                int current_idx_inner = row_offset_inner + col_idx;
                int current_idx_outer_pair = row_offset_outer_pair + col_idx;
                int current_idx_inner_pair = row_offset_inner_pair + col_idx;

                double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
                double* element_outer = (double*)input.get_data() + 2 * current_idx_outer;
                double* element_inner = (double*)input.get_data() + 2 * current_idx_inner;
                double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair;
                double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair;

                __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                element_outer_vec = _mm256_permute4x64_pd(element_outer_vec, 0b11011000);
                __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
                element_inner_vec = _mm256_permute4x64_pd(element_inner_vec, 0b11011000);
                __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec, element_inner_vec, 0b0000);
                outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec, 0b11011000);

                __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec, 0b11011000);
                __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
                element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec, 0b11011000);
                __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec, element_inner_pair_vec, 0b0000);
                outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec, 0b11011000);
                for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                    double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                    double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                    __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                    __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                    __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                    __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                    __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                    result_vec = _mm256_hadd_pd(result_vec, result_vec);
                    double* result = (double*)&result_vec;
                    results[mult_idx * 2] = result[0];
                    results[mult_idx * 2 + 1] = result[2];
                }

                input[current_idx_outer].real = results[0];
                input[current_idx_outer].imag = results[1];
                input[current_idx_inner].real = results[2];
                input[current_idx_inner].imag = results[3];
                input[current_idx_outer_pair].real = results[4];
                input[current_idx_outer_pair].imag = results[5];
                input[current_idx_inner_pair].real = results[6];
                input[current_idx_inner_pair].imag = results[7];
    current_idx = current_idx + (index_step_outer << 1);
__m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

int inner_qbit = involved_qbits[0];
int outer_qbit = involved_qbits[1];

int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
int parallel_inner_cycles = index_step_outer / (index_step_inner << 1);

int outer_grain_size;
if ( index_step_outer <= 2 ) {
    outer_grain_size = 64;
}
else if ( index_step_outer <= 4 ) {
    outer_grain_size = 32;
}
else if ( index_step_outer <= 8 ) {
    outer_grain_size = 16;
}
else if ( index_step_outer <= 16 ) {
    outer_grain_size = 8;
}
else {
    outer_grain_size = 2;
}

outer_grain_size = 6000000;
int inner_grain_size = 64000000;
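// Note (reading of the code as written, not part of the original source): the grain-size
// ladder above is immediately overridden by the hard-coded assignments (6000000 and
// 64000000), so the blocked ranges below are not subdivided in practice and each lambda
// receives its full range as one chunk. For reference, a minimal tbb::parallel_for usage
// pattern over such a range looks like this (sketch only):
//
//   #include <tbb/parallel_for.h>
//   #include <tbb/blocked_range.h>
//
//   tbb::parallel_for(tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size),
//       [&](const tbb::blocked_range<int>& r) {
//           for (int i = r.begin(); i != r.end(); ++i) {
//               // one outer cycle: a block of 2*index_step_outer consecutive amplitudes
//           }
//       });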
tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

    int current_idx = r.begin() * (index_step_outer << 1);
    int current_idx_pair_outer = current_idx + index_step_outer;

    for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

        tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

            int current_idx_inner = r.begin() * (index_step_inner << 1);

            for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {

                for (int idx = 0; idx < index_step_inner; ++idx) {

                    int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                    int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                    int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                    int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

                    double results[8] = {0., 0., 0., 0., 0., 0., 0., 0.};
                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                    __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                    __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);

                    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
                    for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                        double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                        double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_23_vec, neg);

                        __m256d result_vec = _mm256_hadd_pd(result_upper_vec, result_lower_vec);
                        result_vec = _mm256_hadd_pd(result_vec, result_vec);
                        double* result = (double*)&result_vec;
                        results[mult_idx * 2] = result[0];
                        results[mult_idx * 2 + 1] = result[2];
                    }

                    input[current_idx_outer_loc].real = results[0];
                    input[current_idx_outer_loc].imag = results[1];
                    input[current_idx_inner_loc].real = results[2];
                    input[current_idx_inner_loc].imag = results[3];
                    input[current_idx_outer_pair_loc].real = results[4];
                    input[current_idx_outer_pair_loc].imag = results[5];
                    input[current_idx_inner_pair_loc].real = results[6];
                    input[current_idx_inner_pair_loc].imag = results[7];
                for (int alt_idx = 0; alt_idx < index_step_inner / 2; ++alt_idx) {

                    int idx = alt_idx * 2;

                    int current_idx_outer_loc = current_idx + current_idx_inner + idx;

                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                    double* element_inner = element_outer + 2 * index_step_inner;

                    double* element_outer_pair = element_outer + 2 * index_step_outer;
                    double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

                    double* indices[4] = {element_outer, element_inner, element_outer_pair, element_inner_pair};

                    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

                    __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                    __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

                    __m256d result_vecs[4];
                    for (int mult_idx = 0; mult_idx < 4; mult_idx++) {

                        double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx;
                        double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8 * mult_idx + 4;

                        __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);
                        unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec, 0b11011000);
                        __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b0000);
                        __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec, unitary_col_01_vec, 0b1111);
                        unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec, 0b11011000);
                        unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec, 0b11011000);

                        __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);
                        unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec, 0b11011000);
                        __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b0000);
                        __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec, unitary_col_23_vec, 0b1111);
                        unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec, 0b11011000);
                        unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec, 0b11011000);
                        __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                        __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                        unitary_col_0_switched = _mm256_mul_pd(unitary_col_0_switched, neg);
                        __m256d outer_vec_4 = _mm256_mul_pd(element_outer_vec, unitary_col_0_switched);
                        __m256d result_outer_vec = _mm256_hsub_pd(outer_vec_3, outer_vec_4);

                        __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                        __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                        unitary_col_1_switched = _mm256_mul_pd(unitary_col_1_switched, neg);
                        __m256d inner_vec_4 = _mm256_mul_pd(element_inner_vec, unitary_col_1_switched);
                        __m256d result_inner_vec = _mm256_hsub_pd(inner_vec_3, inner_vec_4);

                        __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                        __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                        unitary_col_2_switched = _mm256_mul_pd(unitary_col_2_switched, neg);
                        __m256d outer_pair_vec_4 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_switched);
                        __m256d result_outer_pair_vec = _mm256_hsub_pd(outer_pair_vec_3, outer_pair_vec_4);

                        __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                        __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                        unitary_col_3_switched = _mm256_mul_pd(unitary_col_3_switched, neg);
                        __m256d inner_pair_vec_4 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_switched);
                        __m256d result_inner_pair_vec = _mm256_hsub_pd(inner_pair_vec_3, inner_pair_vec_4);

                        __m256d result_vec = _mm256_add_pd(result_outer_vec, result_inner_vec);
                        result_vec = _mm256_add_pd(result_vec, result_outer_pair_vec);
                        result_vec = _mm256_add_pd(result_vec, result_inner_pair_vec);
                        result_vecs[mult_idx] = result_vec;
                    }
                    for (int row_idx = 0; row_idx < 4; row_idx++) {
                        _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
                    }

                current_idx_inner = current_idx_inner + (index_step_inner << 1);
    current_idx = current_idx + (index_step_outer << 1);
    current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
__m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

int index_step_inner = 1 << involved_qbits[0];
int index_step_middle = 1 << involved_qbits[1];
int index_step_outer = 1 << involved_qbits[2];

int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
int parallel_inner_cycles = index_step_middle / (index_step_inner << 1);
int parallel_middle_cycles = index_step_outer / (index_step_middle << 1);
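// Worked example (values chosen only for illustration, not part of the original source):
// for a 6-qubit state (matrix_size = 64) and involved_qbits = {1, 3, 5}, the step sizes
// are index_step_inner = 2, index_step_middle = 8 and index_step_outer = 32, giving
//
//   parallel_outer_cycles  = 64 / 64 = 1
//   parallel_middle_cycles = 32 / 16 = 2
//   parallel_inner_cycles  =  8 /  4 = 2
//
// i.e. one outer block containing 2 x 2 middle/inner blocks of paired amplitudes.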
tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

    int current_idx = r.begin() * (index_step_outer << 1);
    int current_idx_pair_outer = current_idx + index_step_outer;

    for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

        tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle_cycles, middle_grain_size), [&](tbb::blocked_range<int> r) {

            int current_idx_middle = r.begin() * (index_step_middle << 1);

            for (int middle_rdx = r.begin(); middle_rdx < r.end(); middle_rdx++) {

                tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                    int current_idx_inner = r.begin() * (index_step_inner << 1);

                    for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {

                        tbb::parallel_for(tbb::blocked_range<int>(0, index_step_inner, 64), [&](tbb::blocked_range<int> r) {

                            for (int idx = r.begin(); idx < r.end(); ++idx) {

                                int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
                                int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                                int current_idx_outer_loc = current_idx_loc;
                                int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                int current_idx_middle_loc = current_idx_loc + index_step_middle;
                                int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                                int current_idx_outer_pair_loc = current_idx_pair_loc;
                                int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                int current_idx_middle_pair_loc = current_idx_pair_loc + index_step_middle;
                                int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;
                                double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                double* element_middle = (double*)input.get_data() + 2 * current_idx_middle_loc;
                                double* element_middle_inner = (double*)input.get_data() + 2 * current_idx_middle_inner_loc;

                                double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                double* element_middle_pair = (double*)input.get_data() + 2 * current_idx_middle_pair_loc;
                                double* element_middle_inner_pair = (double*)input.get_data() + 2 * current_idx_middle_inner_pair_loc;

                                __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                __m256d middle_inner_vec = get_AVX_vector(element_middle, element_middle_inner);
                                __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                __m256d middle_inner_pair_vec = get_AVX_vector(element_middle_pair, element_middle_inner_pair);
                                QGD_Complex16 results[8];

                                for (int mult_idx = 0; mult_idx < 8; mult_idx++) {

                                    double* unitary_row_01 = (double*)unitary.get_data() + 16 * mult_idx;
                                    double* unitary_row_23 = (double*)unitary.get_data() + 16 * mult_idx + 4;
                                    double* unitary_row_45 = (double*)unitary.get_data() + 16 * mult_idx + 8;
                                    double* unitary_row_67 = (double*)unitary.get_data() + 16 * mult_idx + 12;

                                    __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                                    __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
                                    __m256d unitary_row_45_vec = _mm256_loadu_pd(unitary_row_45);
                                    __m256d unitary_row_67_vec = _mm256_loadu_pd(unitary_row_67);

                                    __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_01_vec, neg);
                                    __m256d result_upper_middle_vec = complex_mult_AVX(middle_inner_vec, unitary_row_23_vec, neg);
                                    __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_45_vec, neg);
                                    __m256d result_lower_middle_vec = complex_mult_AVX(middle_inner_pair_vec, unitary_row_67_vec, neg);

                                    __m256d result1_vec = _mm256_hadd_pd(result_upper_vec, result_upper_middle_vec);
                                    __m256d result2_vec = _mm256_hadd_pd(result_lower_vec, result_lower_middle_vec);
                                    __m256d result_vec = _mm256_hadd_pd(result1_vec, result2_vec);
                                    result_vec = _mm256_hadd_pd(result_vec, result_vec);
                                    double* result = (double*)&result_vec;
                                    results[mult_idx].real = result[0];
                                    results[mult_idx].imag = result[2];
                                }
                                input[current_idx_outer_loc] = results[0];
                                input[current_idx_inner_loc] = results[1];
                                input[current_idx_middle_loc] = results[2];
                                input[current_idx_middle_inner_loc] = results[3];
                                input[current_idx_outer_pair_loc] = results[4];
                                input[current_idx_inner_pair_loc] = results[5];
                                input[current_idx_middle_pair_loc] = results[6];
                                input[current_idx_middle_inner_pair_loc] = results[7];
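// Illustrative note (not part of the original source): the 8x8 case uses the same
// reduction idea as the 4x4 kernel, only one level deeper. Each unitary row is loaded as
// four __m256d vectors, each complex_mult_AVX call contributes two complex products
// (real parts in the lower half, imaginary parts in the upper half), and the hadd tree
// folds them together. Recall that
//
//   _mm256_hadd_pd(a, b) = [a0 + a1, b0 + b1, a2 + a3, b2 + b3]
//
// so after the final _mm256_hadd_pd the full row sum has its real part in lane 0 and its
// imaginary part in lane 2, which is exactly what is copied into results[mult_idx].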
                current_idx_inner = current_idx_inner + (index_step_inner << 1);

            current_idx_middle = current_idx_middle + (index_step_middle << 1);

    current_idx = current_idx + (index_step_outer << 1);
    current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
__m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

int index_step_inner = 1 << involved_qbits[0];
int index_step_middle1 = 1 << involved_qbits[1];
int index_step_middle2 = 1 << involved_qbits[2];
int index_step_outer = 1 << involved_qbits[3];

int parallel_outer_cycles = matrix_size / (index_step_outer << 1);
int parallel_inner_cycles = index_step_middle1 / (index_step_inner << 1);
int parallel_middle1_cycles = index_step_middle2 / (index_step_middle1 << 1);
int parallel_middle2_cycles = index_step_outer / (index_step_middle2 << 1);
tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

    int current_idx = r.begin() * (index_step_outer << 1);
    int current_idx_pair_outer = current_idx + index_step_outer;

    for (int outer_rdx = r.begin(); outer_rdx < r.end(); outer_rdx++) {

        tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle2_cycles, middle2_grain_size), [&](tbb::blocked_range<int> r) {

            int current_idx_middle2 = r.begin() * (index_step_middle2 << 1);

            for (int middle2_rdx = r.begin(); middle2_rdx < r.end(); middle2_rdx++) {

                tbb::parallel_for( tbb::blocked_range<int>(0, parallel_middle1_cycles, middle1_grain_size), [&](tbb::blocked_range<int> r) {

                    int current_idx_middle1 = r.begin() * (index_step_middle1 << 1);

                    for (int middle1_rdx = r.begin(); middle1_rdx < r.end(); middle1_rdx++) {

                        tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                            int current_idx_inner = r.begin() * (index_step_inner << 1);

                            for (int inner_rdx = r.begin(); inner_rdx < r.end(); inner_rdx++) {

                                tbb::parallel_for(tbb::blocked_range<int>(0, index_step_inner, 64), [&](tbb::blocked_range<int> r) {

                                    for (int idx = r.begin(); idx < r.end(); ++idx) {

                                        int current_idx_loc = current_idx + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;
                                        int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;

                                        int current_idx_outer_loc = current_idx_loc;
                                        int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                        int current_idx_middle1_loc = current_idx_loc + index_step_middle1;
                                        int current_idx_middle1_inner_loc = current_idx_loc + index_step_middle1 + index_step_inner;

                                        int current_idx_middle2_loc = current_idx_loc + index_step_middle2;
                                        int current_idx_middle2_inner_loc = current_idx_loc + index_step_middle2 + index_step_inner;

                                        int current_idx_middle12_loc = current_idx_loc + index_step_middle1 + index_step_middle2;
                                        int current_idx_middle12_inner_loc = current_idx_loc + index_step_middle1 + index_step_middle2 + index_step_inner;

                                        int current_idx_outer_pair_loc = current_idx_pair_loc;
                                        int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                        int current_idx_middle1_pair_loc = current_idx_pair_loc + index_step_middle1;
                                        int current_idx_middle1_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_inner;

                                        int current_idx_middle2_pair_loc = current_idx_pair_loc + index_step_middle2;
                                        int current_idx_middle2_inner_pair_loc = current_idx_pair_loc + index_step_middle2 + index_step_inner;

                                        int current_idx_middle12_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2;
                                        int current_idx_middle12_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2 + index_step_inner;
                                        double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                        double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                        double* element_middle1 = (double*)input.get_data() + 2 * current_idx_middle1_loc;
                                        double* element_middle1_inner = (double*)input.get_data() + 2 * current_idx_middle1_inner_loc;

                                        double* element_middle2 = (double*)input.get_data() + 2 * current_idx_middle2_loc;
                                        double* element_middle2_inner = (double*)input.get_data() + 2 * current_idx_middle2_inner_loc;

                                        double* element_middle12 = (double*)input.get_data() + 2 * current_idx_middle12_loc;
                                        double* element_middle12_inner = (double*)input.get_data() + 2 * current_idx_middle12_inner_loc;

                                        double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                        double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                        double* element_middle1_pair = (double*)input.get_data() + 2 * current_idx_middle1_pair_loc;
                                        double* element_middle1_inner_pair = (double*)input.get_data() + 2 * current_idx_middle1_inner_pair_loc;

                                        double* element_middle2_pair = (double*)input.get_data() + 2 * current_idx_middle2_pair_loc;
                                        double* element_middle2_inner_pair = (double*)input.get_data() + 2 * current_idx_middle2_inner_pair_loc;

                                        double* element_middle12_pair = (double*)input.get_data() + 2 * current_idx_middle12_pair_loc;
                                        double* element_middle12_inner_pair = (double*)input.get_data() + 2 * current_idx_middle12_inner_pair_loc;

                                        __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                        __m256d middle1_inner_vec = get_AVX_vector(element_middle1, element_middle1_inner);
                                        __m256d middle2_inner_vec = get_AVX_vector(element_middle2, element_middle2_inner);
                                        __m256d middle12_inner_vec = get_AVX_vector(element_middle12, element_middle12_inner);
                                        __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                        __m256d middle1_inner_pair_vec = get_AVX_vector(element_middle1_pair, element_middle1_inner_pair);
                                        __m256d middle2_inner_pair_vec = get_AVX_vector(element_middle2_pair, element_middle2_inner_pair);
                                        __m256d middle12_inner_pair_vec = get_AVX_vector(element_middle12_pair, element_middle12_inner_pair);
                                        QGD_Complex16 results[16];

                                        for (int mult_idx = 0; mult_idx < 16; mult_idx++) {

                                            double* unitary_row_1 = (double*)unitary.get_data() + 32 * mult_idx;
                                            double* unitary_row_2 = (double*)unitary.get_data() + 32 * mult_idx + 4;
                                            double* unitary_row_3 = (double*)unitary.get_data() + 32 * mult_idx + 8;
                                            double* unitary_row_4 = (double*)unitary.get_data() + 32 * mult_idx + 12;
                                            double* unitary_row_5 = (double*)unitary.get_data() + 32 * mult_idx + 16;
                                            double* unitary_row_6 = (double*)unitary.get_data() + 32 * mult_idx + 20;
                                            double* unitary_row_7 = (double*)unitary.get_data() + 32 * mult_idx + 24;
                                            double* unitary_row_8 = (double*)unitary.get_data() + 32 * mult_idx + 28;

                                            __m256d unitary_row_1_vec = _mm256_loadu_pd(unitary_row_1);
                                            __m256d unitary_row_2_vec = _mm256_loadu_pd(unitary_row_2);
                                            __m256d unitary_row_3_vec = _mm256_loadu_pd(unitary_row_3);
                                            __m256d unitary_row_4_vec = _mm256_loadu_pd(unitary_row_4);
                                            __m256d unitary_row_5_vec = _mm256_loadu_pd(unitary_row_5);
                                            __m256d unitary_row_6_vec = _mm256_loadu_pd(unitary_row_6);
                                            __m256d unitary_row_7_vec = _mm256_loadu_pd(unitary_row_7);
                                            __m256d unitary_row_8_vec = _mm256_loadu_pd(unitary_row_8);

                                            __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec, unitary_row_1_vec, neg);
                                            __m256d result_upper_middle1_vec = complex_mult_AVX(middle1_inner_vec, unitary_row_2_vec, neg);
                                            __m256d result_upper_middle2_vec = complex_mult_AVX(middle2_inner_vec, unitary_row_3_vec, neg);
                                            __m256d result_upper_middle12_vec = complex_mult_AVX(middle12_inner_vec, unitary_row_4_vec, neg);
                                            __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec, unitary_row_5_vec, neg);
                                            __m256d result_lower_middle1_vec = complex_mult_AVX(middle1_inner_pair_vec, unitary_row_6_vec, neg);
                                            __m256d result_lower_middle2_vec = complex_mult_AVX(middle2_inner_pair_vec, unitary_row_7_vec, neg);
                                            __m256d result_lower_middle12_vec = complex_mult_AVX(middle12_inner_pair_vec, unitary_row_8_vec, neg);

                                            __m256d result1_vec = _mm256_hadd_pd(result_upper_vec, result_upper_middle1_vec);
                                            __m256d result2_vec = _mm256_hadd_pd(result_upper_middle2_vec, result_upper_middle12_vec);
                                            __m256d result3_vec = _mm256_hadd_pd(result_lower_vec, result_lower_middle1_vec);
                                            __m256d result4_vec = _mm256_hadd_pd(result_lower_middle2_vec, result_lower_middle12_vec);
                                            __m256d result5_vec = _mm256_hadd_pd(result1_vec, result2_vec);
                                            __m256d result6_vec = _mm256_hadd_pd(result3_vec, result4_vec);
                                            __m256d result_vec = _mm256_hadd_pd(result5_vec, result6_vec);
                                            result_vec = _mm256_hadd_pd(result_vec, result_vec);
                                            double* result = (double*)&result_vec;
                                            results[mult_idx].real = result[0];
                                            results[mult_idx].imag = result[2];
                                        }
                                        input[current_idx_outer_loc] = results[0];
                                        input[current_idx_inner_loc] = results[1];
                                        input[current_idx_middle1_loc] = results[2];
                                        input[current_idx_middle1_inner_loc] = results[3];
                                        input[current_idx_middle2_loc] = results[4];
                                        input[current_idx_middle2_inner_loc] = results[5];
                                        input[current_idx_middle12_loc] = results[6];
                                        input[current_idx_middle12_inner_loc] = results[7];
                                        input[current_idx_outer_pair_loc] = results[8];
                                        input[current_idx_inner_pair_loc] = results[9];
                                        input[current_idx_middle1_pair_loc] = results[10];
                                        input[current_idx_middle1_inner_pair_loc] = results[11];
                                        input[current_idx_middle2_pair_loc] = results[12];
                                        input[current_idx_middle2_inner_pair_loc] = results[13];
                                        input[current_idx_middle12_pair_loc] = results[14];
                                        input[current_idx_middle12_inner_pair_loc] = results[15];
                            current_idx_inner = current_idx_inner + (index_step_inner << 1);

                    current_idx_middle1 = current_idx_middle1 + (index_step_middle1 << 1);

            current_idx_middle2 = current_idx_middle2 + (index_step_middle2 << 1);

    current_idx = current_idx + (index_step_outer << 1);
    current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
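// Illustrative note (not part of the original source): the four-qubit kernel above is
// the same pattern scaled up once more. Sixteen strided amplitudes are gathered, each of
// the 16 unitary rows spans 32 doubles (16 complex entries) loaded as eight __m256d
// vectors, and a four-level _mm256_hadd_pd tree reduces the eight partial results per
// row. In general, an n-qubit kernel of this form touches 2^n gathered amplitudes and
// performs 2^(n-1) complex_mult_AVX calls per output amplitude; for n = 4 that is
// 16 * 8 = 128 complex_mult_AVX calls per gathered block.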
__m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
__m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
__m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
__m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
__m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
__m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
__m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
__m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);

__m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
__m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
__m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
__m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
__m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
__m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
__m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
__m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);
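// Illustrative note (describing the loops further below, not part of the original
// source): broadcasting the real and imaginary parts of each 2x2 gate entry into all
// four lanes lets the kernel transform four consecutive complex elements of a row at
// once, after the data has been split into separate real and imaginary vectors. Per
// lane the fused-multiply sequences implement the usual complex product and sum:
//
//   new.re = u3_1qbit1[0].re*x.re - u3_1qbit1[0].im*x.im + u3_1qbit1[1].re*y.re - u3_1qbit1[1].im*y.im   // mul + fnmadd, twice, then add
//   new.im = u3_1qbit1[0].re*x.im + u3_1qbit1[0].im*x.re + u3_1qbit1[1].re*y.im + u3_1qbit1[1].im*y.re   // mul + fmadd,  twice, then add
//
// where x is the element in the row addressed by row_offset and y the paired element;
// the row-major [00, 01, 10, 11] ordering of the 2x2 gate is the one used by the scalar
// remainder code further below.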
int parallel_outer_cycles = matrix_size / (index_step_target << 1);

int outer_grain_size;
if ( index_step_target <= 2 ) {
    outer_grain_size = 32;
}
else if ( index_step_target <= 4 ) {
    outer_grain_size = 16;
}
else if ( index_step_target <= 8 ) {
    outer_grain_size = 8;
}
else if ( index_step_target <= 16 ) {
    outer_grain_size = 4;
}
else {
    outer_grain_size = 2;
}
tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

    int current_idx = r.begin() * (index_step_target << 1);
    int current_idx_pair = index_step_target + r.begin() * (index_step_target << 1);

    for (int rdx = r.begin(); rdx < r.end(); rdx++) {

        tbb::parallel_for( tbb::blocked_range<int>(0, index_step_target, 32), [&](tbb::blocked_range<int> r) {

            for (int idx = r.begin(); idx < r.end(); ++idx) {

                int current_idx_loc = current_idx + idx;
                int current_idx_pair_loc = current_idx_pair + idx;

                int row_offset = current_idx_loc * input.stride;
                int row_offset_pair = current_idx_pair_loc * input.stride;
                if ((current_idx_loc >> control_qbit) & 1) {

                    double* element = (double*)input.get_data() + 2 * row_offset;
                    double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                    for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                        // de-interleave four consecutive complex elements into a vector
                        // of real parts and a vector of imaginary parts
                        __m256d element_vec = _mm256_load_pd(element + col_idx);
                        __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                        __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                        element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                        element_vec = tmp;

                        __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                        __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                        tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                        element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                        element_pair_vec = tmp;
                        // real parts of u3_1qbit1[0]*element + u3_1qbit1[1]*element_pair
                        __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                        vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                        __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                        vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                        vec3 = _mm256_add_pd(vec3, vec4);

                        // imaginary parts of u3_1qbit1[0]*element + u3_1qbit1[1]*element_pair
                        __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                        vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                        __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                        vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                        vec5 = _mm256_add_pd(vec5, vec6);

                        // re-interleave and store the transformed elements
                        tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                        vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                        vec3 = tmp;

                        _mm256_store_pd(element + col_idx, vec3);
                        _mm256_store_pd(element + col_idx + 4, vec5);

                        // real parts of u3_1qbit1[2]*element + u3_1qbit1[3]*element_pair
                        __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                        vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                        __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                        vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                        vec7 = _mm256_add_pd(vec7, vec8);

                        // imaginary parts of u3_1qbit1[2]*element + u3_1qbit1[3]*element_pair
                        __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                        vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                        __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                        vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                        vec9 = _mm256_add_pd(vec9, vec10);

                        // re-interleave and store the transformed paired elements
                        tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                        vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                        vec7 = tmp;

                        _mm256_store_pd(element_pair + col_idx, vec7);
                        _mm256_store_pd(element_pair + col_idx + 4, vec9);
                    }
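// Illustrative sketch (not part of the original kernel): the shuffle pattern used above
// de-interleaves four complex doubles into a "real" vector and an "imag" vector and
// interleaves them again after the arithmetic. As a standalone round trip (hypothetical
// helper names, assuming the interleaved (re, im) layout used throughout this file):
//
//   static inline void deinterleave_4(const double* p, __m256d* re, __m256d* im) {
//       __m256d lo = _mm256_loadu_pd(p);          // [re0, im0, re1, im1]
//       __m256d hi = _mm256_loadu_pd(p + 4);      // [re2, im2, re3, im3]
//       *re = _mm256_shuffle_pd(lo, hi, 0x0);     // [re0, re2, re1, re3]
//       *im = _mm256_shuffle_pd(lo, hi, 0xf);     // [im0, im2, im1, im3]
//   }
//
//   static inline void interleave_4(double* p, __m256d re, __m256d im) {
//       _mm256_storeu_pd(p,     _mm256_shuffle_pd(re, im, 0x0));   // [re0, im0, re1, im1]
//       _mm256_storeu_pd(p + 4, _mm256_shuffle_pd(re, im, 0xf));   // [re2, im2, re3, im3]
//   }
//
// The lane order inside the re/im vectors is permuted ([0, 2, 1, 3]), but the inverse
// shuffle restores the original layout, so the permutation never becomes visible.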
                    int remainder = input.cols % 4;
                    if (remainder != 0) {

                        for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {

                            int index = row_offset + col_idx;
                            int index_pair = row_offset_pair + col_idx;

                            QGD_Complex16 element = input[index];
                            QGD_Complex16 element_pair = input[index_pair];

                            QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                            QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                            input[index].real = tmp1.real + tmp2.real;
                            input[index].imag = tmp1.imag + tmp2.imag;

                            tmp1 = mult(u3_1qbit1[2], element);
                            tmp2 = mult(u3_1qbit1[3], element_pair);

                            input[index_pair].real = tmp1.real + tmp2.real;
                            input[index_pair].imag = tmp1.imag + tmp2.imag;
                        }
                    }
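// Illustrative note (not part of the original source): the vectorized loop above walks
// the row in steps of eight doubles (four complex columns), with the bound
// 2 * (input.cols - 3) chosen so the four-column blocks never run past the row; the
// trailing input.cols % 4 columns are finished with the scalar mult() helper. The scalar
// update applied to each leftover column is simply
//
//   input[index]      = u3_1qbit1[0]*element + u3_1qbit1[1]*element_pair
//   input[index_pair] = u3_1qbit1[2]*element + u3_1qbit1[3]*element_pair
//
// (complex products and sums), matching what the SIMD path computes for the full blocks.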
                    double* element = (double*)input.get_data() + 2 * row_offset;
                    double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                    for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                        __m256d element_vec = _mm256_load_pd(element + col_idx);
                        __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                        __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                        element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                        element_vec = tmp;

                        __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                        __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                        tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                        element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                        element_pair_vec = tmp;
                        // real parts of u3_1qbit2[0]*element + u3_1qbit2[1]*element_pair
                        __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                        vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                        __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                        vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                        vec3 = _mm256_add_pd(vec3, vec4);

                        // imaginary parts of u3_1qbit2[0]*element + u3_1qbit2[1]*element_pair
                        __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                        vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                        __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                        vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                        vec5 = _mm256_add_pd(vec5, vec6);

                        // re-interleave and store the transformed elements
                        tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                        vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                        vec3 = tmp;

                        _mm256_store_pd(element + col_idx, vec3);
                        _mm256_store_pd(element + col_idx + 4, vec5);

                        // real parts of u3_1qbit2[2]*element + u3_1qbit2[3]*element_pair
                        __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                        vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                        __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                        vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                        vec7 = _mm256_add_pd(vec7, vec8);

                        // imaginary parts of u3_1qbit2[2]*element + u3_1qbit2[3]*element_pair
                        __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                        vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                        __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                        vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                        vec9 = _mm256_add_pd(vec9, vec10);

                        // re-interleave and store the transformed paired elements
                        tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                        vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                        vec7 = tmp;

                        _mm256_store_pd(element_pair + col_idx, vec7);
                        _mm256_store_pd(element_pair + col_idx + 4, vec9);
                    }
                    int remainder = input.cols % 4;
                    if (remainder != 0) {

                        for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {

                            int index = row_offset + col_idx;
                            int index_pair = row_offset_pair + col_idx;

                            QGD_Complex16 element = input[index];
                            QGD_Complex16 element_pair = input[index_pair];

                            QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                            QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                            input[index].real = tmp1.real + tmp2.real;
                            input[index].imag = tmp1.imag + tmp2.imag;

                            tmp1 = mult(u3_1qbit2[2], element);
                            tmp2 = mult(u3_1qbit2[3], element_pair);

                            input[index_pair].real = tmp1.real + tmp2.real;
                            input[index_pair].imag = tmp1.imag + tmp2.imag;
                        }
                    }
    current_idx = current_idx + (index_step_target << 1);
    current_idx_pair = current_idx_pair + (index_step_target << 1);
Referenced members and types:

int stride
The column stride of the array. (The elements of one row are a_0, a_1, ..., a_{cols-1}, followed by stride - cols zero-padding entries.)
QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
Call to calculate the product of two complex scalars.
scalar* get_data() const
Call to get the pointer to the stored data.
int rows
The number of rows of the array.
int cols
The number of columns of the array.
QGD_Complex16
Structure type representing complex numbers in the SQUANDER package.
Matrix
Class to store data of complex arrays and its properties.
double real
The real part of a complex number.
double imag
The imaginary part of a complex number.