#include <immintrin.h>

// ...

unsigned int bitmask_high = ~bitmask_low;

// ...

// special case: the control qubit is qubit 0, use the scalar update
if ( control_qbit == 0 ) {

    for (int idx=0; idx<matrix_size/2; idx++ ) {

        // index with a 0 bit inserted at the target_qbit position
        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        // index of the paired amplitude, with the target qubit set
        int current_idx_pair = current_idx | (1<<target_qbit);

        if ( current_idx & control_qbit_step_index ) {

            // tmp1..tmp4 hold the products of the 2x2 kernel elements with the
            // amplitude pair; their computation is elided from this excerpt
            input[current_idx].real = tmp1.real + tmp2.real;
            input[current_idx].imag = tmp1.imag + tmp2.imag;

            input[current_idx_pair].real = tmp3.real + tmp4.real;
            input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
// ...

else if ( target_qbit == 0 ) {

    // first kernel row (elements 0 and 1), packed once for the real part and
    // once for the imaginary part of the complex products
    __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    // second kernel row (elements 2 and 3)
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);

    for (int idx=0; idx<matrix_size/2; idx++ ) {

        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

            // with target_qbit == 0 the amplitude pair is adjacent in memory,
            // so one 256-bit load covers both complex values
            double *ptr = (double*)input.get_data() + 2 * current_idx;
            __m256d data = _mm256_loadu_pd(ptr);

            // terms of the new upper amplitude: kernel row 0 times the pair
            __m256d data_u0 = _mm256_mul_pd(data, mv00);
            __m256d data_u1 = _mm256_mul_pd(data, mv01);
            __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
            data_u2 = _mm256_permute4x64_pd(data_u2, 216);

            // terms of the new lower amplitude: kernel row 1 times the pair
            __m256d data_d0 = _mm256_mul_pd(data, mv20);
            __m256d data_d1 = _mm256_mul_pd(data, mv21);
            __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
            data_d2 = _mm256_permute4x64_pd(data_d2, 216);

            // sum the terms and restore the {re, im, re, im} layout before storing
            __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
            data_r = _mm256_permute4x64_pd(data_r, 216);
            _mm256_storeu_pd(ptr, data_r);
// ...

    // general case: the amplitude pairs are separated by 2^target_qbit elements;
    // each kernel element is packed twice, once for the real part and once for
    // the imaginary part of the complex products
    __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
    __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[1].real, u3_1qbit[1].imag);
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);
    __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
    __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[3].real, u3_1qbit[3].imag);

    for (int idx=0; idx<matrix_size/2; idx=idx+2 ) {

        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        int current_idx_pair = current_idx | (1<<target_qbit);

        if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

            // two adjacent amplitudes at current_idx and their pairs at current_idx_pair
            double* element      = (double*)input.get_data() + 2 * current_idx;
            double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;

            __m256d data0 = _mm256_loadu_pd(element);
            __m256d data1 = _mm256_loadu_pd(element_pair);

            // new upper amplitudes: row 0 of the kernel applied to both packed pairs
            __m256d data_u2 = _mm256_mul_pd(data0, mv00);
            __m256d data_u3 = _mm256_mul_pd(data1, mv10);
            __m256d data_u4 = _mm256_mul_pd(data0, mv01);
            __m256d data_u5 = _mm256_mul_pd(data1, mv11);

            __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
            __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);

            // new lower amplitudes: row 1 of the kernel applied to both packed pairs
            __m256d data_d2 = _mm256_mul_pd(data0, mv20);
            __m256d data_d3 = _mm256_mul_pd(data1, mv30);
            __m256d data_d4 = _mm256_mul_pd(data0, mv21);
            __m256d data_d5 = _mm256_mul_pd(data1, mv31);

            __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
            __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);

            __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
            __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);

            _mm256_storeu_pd(element, data_r0);
            _mm256_storeu_pd(element_pair, data_r1);

// ...
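// ---------------------------------------------------------------------------
// Standalone sketch (not part of the library, compile separately): it
// illustrates the index mapping used by all kernel variants in this listing.
// ((idx & bitmask_high) << 1) | (idx & bitmask_low) inserts a 0 bit at the
// target_qbit position, so current_idx enumerates the basis states whose
// target qubit is 0, and current_idx_pair is the same state with the target
// qubit set. The definition bitmask_low = (1 << target_qbit) - 1 is an
// assumption here; the excerpt only shows bitmask_high = ~bitmask_low.
// ---------------------------------------------------------------------------
#include <cstdio>

int main() {
    const int target_qbit = 2;                                  // example target qubit
    const unsigned int bitmask_low  = (1u << target_qbit) - 1;  // bits below the target qubit (assumed definition)
    const unsigned int bitmask_high = ~bitmask_low;             // bits at and above the target qubit

    for (int idx = 0; idx < 8; idx++) {
        int current_idx      = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
        int current_idx_pair = current_idx | (1 << target_qbit);
        printf("idx=%d  ->  current_idx=%d, current_idx_pair=%d\n", idx, current_idx, current_idx_pair);
    }
    return 0;
}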
// OpenMP-parallelized variant of the kernel

unsigned int bitmask_high = ~bitmask_low;

// ...

if ( control_qbit == 0 ) {

    #pragma omp parallel for
    for (int idx=0; idx<matrix_size/2; idx++ ) {

        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        int current_idx_pair = current_idx | (1<<target_qbit);

        if ( current_idx & control_qbit_step_index ) {

            // same scalar update as in the non-parallel version above;
            // the computation of tmp1..tmp4 is elided from this excerpt
            input[current_idx].real = tmp1.real + tmp2.real;
            input[current_idx].imag = tmp1.imag + tmp2.imag;

            input[current_idx_pair].real = tmp3.real + tmp4.real;
            input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
// ...

else if ( target_qbit == 0 ) {

    __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);

    // same AVX kernel as in the non-parallel version, parallelized with OpenMP
    #pragma omp parallel for
    for (int idx=0; idx<matrix_size/2; idx++ ) {

        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

            double *ptr = (double*)input.get_data() + 2 * current_idx;
            __m256d data = _mm256_loadu_pd(ptr);

            __m256d data_u0 = _mm256_mul_pd(data, mv00);
            __m256d data_u1 = _mm256_mul_pd(data, mv01);
            __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
            data_u2 = _mm256_permute4x64_pd(data_u2, 216);

            __m256d data_d0 = _mm256_mul_pd(data, mv20);
            __m256d data_d1 = _mm256_mul_pd(data, mv21);
            __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
            data_d2 = _mm256_permute4x64_pd(data_d2, 216);

            __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
            data_r = _mm256_permute4x64_pd(data_r, 216);
            _mm256_storeu_pd(ptr, data_r);
// ...

    __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
    __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[1].real, u3_1qbit[1].imag);
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);
    __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
    __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[3].real, u3_1qbit[3].imag);

    #pragma omp parallel for
    for (int idx=0; idx<matrix_size/2; idx=idx+2 ) {

        int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

        int current_idx_pair = current_idx | (1<<target_qbit);

        if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

            double* element      = (double*)input.get_data() + 2 * current_idx;
            double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;

            __m256d data0 = _mm256_loadu_pd(element);
            __m256d data1 = _mm256_loadu_pd(element_pair);

            __m256d data_u2 = _mm256_mul_pd(data0, mv00);
            __m256d data_u3 = _mm256_mul_pd(data1, mv10);
            __m256d data_u4 = _mm256_mul_pd(data0, mv01);
            __m256d data_u5 = _mm256_mul_pd(data1, mv11);

            __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
            __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);

            __m256d data_d2 = _mm256_mul_pd(data0, mv20);
            __m256d data_d3 = _mm256_mul_pd(data1, mv30);
            __m256d data_d4 = _mm256_mul_pd(data0, mv21);
            __m256d data_d5 = _mm256_mul_pd(data1, mv31);

            __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
            __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);

            __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
            __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);

            _mm256_storeu_pd(element, data_r0);
            _mm256_storeu_pd(element_pair, data_r1);

// ...
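// ---------------------------------------------------------------------------
// Standalone sketch (not part of the library, compile separately, requires
// AVX): it shows the multiply/horizontal-add pattern the kernels above use
// for complex arithmetic. One 256-bit register holds two complex doubles as
// {re0, im0, re1, im1}; packing a coefficient u as {u.re, -u.im, ...} and
// {u.im, u.re, ...}, two element-wise multiplies followed by _mm256_hadd_pd
// yield both complex products u*z0 and u*z1 at once. The variable names here
// are illustrative only.
// ---------------------------------------------------------------------------
#include <immintrin.h>
#include <cstdio>

int main() {
    double z[4] = {1.0, 2.0, 3.0, 4.0};            // two complex values: z0 = 1+2i, z1 = 3+4i
    double u_re = 0.5, u_im = -0.25;               // one complex coefficient u

    __m256d data = _mm256_loadu_pd(z);
    __m256d mv0  = _mm256_set_pd(-u_im, u_re, -u_im, u_re);  // lanes {u.re, -u.im, u.re, -u.im}
    __m256d mv1  = _mm256_set_pd( u_re, u_im,  u_re, u_im);  // lanes {u.im,  u.re, u.im,  u.re}

    __m256d p0   = _mm256_mul_pd(data, mv0);       // {re*u.re, -im*u.im, ...}
    __m256d p1   = _mm256_mul_pd(data, mv1);       // {re*u.im,  im*u.re, ...}
    __m256d prod = _mm256_hadd_pd(p0, p1);         // {Re(u*z0), Im(u*z0), Re(u*z1), Im(u*z1)}

    double out[4];
    _mm256_storeu_pd(out, prod);
    printf("u*z0 = %g%+gi,  u*z1 = %g%+gi\n", out[0], out[1], out[2], out[3]);
    return 0;
}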
// TBB-parallelized variant of the kernel

unsigned int bitmask_high = ~bitmask_low;

// ...

tbb::affinity_partitioner aff_p;

if ( control_qbit == 0 ) {

    tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {

        for (int idx=r.begin(); idx<r.end(); idx++) {

            int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

            int current_idx_pair = current_idx | (1<<target_qbit);

            if ( current_idx & control_qbit_step_index ) {

                // same scalar update as above; the computation of tmp1..tmp4
                // is elided from this excerpt
                input[current_idx].real = tmp1.real + tmp2.real;
                input[current_idx].imag = tmp1.imag + tmp2.imag;

                input[current_idx_pair].real = tmp3.real + tmp4.real;
                input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
// ...

else if ( target_qbit == 0 ) {

    __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);

    // same AVX kernel as above, parallelized with tbb::parallel_for
    tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {

        for (int idx=r.begin(); idx<r.end(); idx++) {

            int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

            if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

                double *ptr = (double*)input.get_data() + 2 * current_idx;
                __m256d data = _mm256_loadu_pd(ptr);

                __m256d data_u0 = _mm256_mul_pd(data, mv00);
                __m256d data_u1 = _mm256_mul_pd(data, mv01);
                __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
                data_u2 = _mm256_permute4x64_pd(data_u2, 216);

                __m256d data_d0 = _mm256_mul_pd(data, mv20);
                __m256d data_d1 = _mm256_mul_pd(data, mv21);
                __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
                data_d2 = _mm256_permute4x64_pd(data_d2, 216);

                __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
                data_r = _mm256_permute4x64_pd(data_r, 216);
                _mm256_storeu_pd(ptr, data_r);
// ...

    __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
    __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag,  u3_1qbit[0].real, u3_1qbit[0].imag);
    __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
    __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag,  u3_1qbit[1].real, u3_1qbit[1].imag);
    __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
    __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag,  u3_1qbit[2].real, u3_1qbit[2].imag);
    __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
    __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag,  u3_1qbit[3].real, u3_1qbit[3].imag);

    tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {

        for (int idx=r.begin(); idx<r.end(); idx=idx+2) {

            int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);

            int current_idx_pair = current_idx | (1<<target_qbit);

            if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {

                double* element      = (double*)input.get_data() + 2 * current_idx;
                double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;

                __m256d data0 = _mm256_loadu_pd(element);
                __m256d data1 = _mm256_loadu_pd(element_pair);

                __m256d data_u2 = _mm256_mul_pd(data0, mv00);
                __m256d data_u3 = _mm256_mul_pd(data1, mv10);
                __m256d data_u4 = _mm256_mul_pd(data0, mv01);
                __m256d data_u5 = _mm256_mul_pd(data1, mv11);

                __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
                __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);

                __m256d data_d2 = _mm256_mul_pd(data0, mv20);
                __m256d data_d3 = _mm256_mul_pd(data1, mv30);
                __m256d data_d4 = _mm256_mul_pd(data0, mv21);
                __m256d data_d5 = _mm256_mul_pd(data1, mv31);

                __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
                __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);

                __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
                __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);

                _mm256_storeu_pd(element, data_r0);
                _mm256_storeu_pd(element_pair, data_r1);

// ...
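// ---------------------------------------------------------------------------
// Standalone sketch (not part of the library, compile separately, requires
// AVX2): it shows what the immediate 216 passed to _mm256_permute4x64_pd in
// the kernels above does. 216 == 0b11011000 selects the source lanes in the
// order {0, 2, 1, 3}, i.e. it swaps the two middle 64-bit lanes. The kernels
// use it to group real parts next to real parts before a horizontal add, and,
// since the permutation is its own inverse, to restore the interleaved
// {re, im, re, im} layout before the final store.
// ---------------------------------------------------------------------------
#include <immintrin.h>
#include <cstdio>

int main() {
    double in[4] = {10.0, 11.0, 12.0, 13.0};       // lanes 0..3
    __m256d v = _mm256_loadu_pd(in);
    __m256d p = _mm256_permute4x64_pd(v, 216);     // lanes reordered to {0, 2, 1, 3}

    double out[4];
    _mm256_storeu_pd(out, p);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 10 12 11 13
    return 0;
}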
Referenced helpers from the SQUANDER package:

  QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b): calculates the product of two complex scalars.
  scalar* get_data() const: returns the pointer to the stored data.
  int cols: the number of columns.
  QGD_Complex16: structure type representing complex numbers in the SQUANDER package, with members double real (the real part of a complex number) and double imag (the imaginary part of a complex number).
  The class holding input: class to store data of complex arrays and its properties.