// Shrink the work-chunk (grain) size as the index step grows.
for (int step=1; step<7; step++){
    if (index_step <= 1<<step){
        grain_size = 256/(1<<step);

// Two-qubit kernel: combine the four amplitudes paired by the inner and outer qubits
// with the 4x4 unitary two_qbit_unitary.
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

        for (int idx=0; idx<index_step_inner; idx++){

            // the four basis-state indices paired by the inner and outer qubits
            int current_idx_outer_loc      = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc      = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
            int indexes[4] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_outer_pair_loc, current_idx_inner_pair_loc};

            QGD_Complex16 element_outer      = input[current_idx_outer_loc];
            QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];
            QGD_Complex16 element_inner      = input[current_idx_inner_loc];
            QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

            // multiply the amplitude 4-vector by the 4x4 two-qubit unitary
            QGD_Complex16 tmp1, tmp2, tmp3, tmp4;
            for (int mult_idx=0; mult_idx<4; mult_idx++){
                tmp1 = mult(two_qbit_unitary[mult_idx*4],     element_outer);
                tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
            }
        }
    }
    current_idx = current_idx + (index_step_outer << 1);
}

// The same two-qubit kernel applied to a strided matrix: the four paired rows are
// transformed column by column.
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

        for (int idx=0; idx<index_step_inner; idx++){

            // the four rows paired by the inner and outer qubits
            int current_idx_outer_loc      = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc      = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

            int row_offset_outer      = current_idx_outer_loc * input.stride;
            int row_offset_outer_pair = current_idx_outer_pair_loc * input.stride;
            int row_offset_inner      = current_idx_inner_loc * input.stride;
            int row_offset_inner_pair = current_idx_inner_pair_loc * input.stride;

            for (int col_idx=0; col_idx<input.cols; col_idx++) {
                int index_outer      = row_offset_outer + col_idx;
                int index_outer_pair = row_offset_outer_pair + col_idx;
                int index_inner      = row_offset_inner + col_idx;
                int index_inner_pair = row_offset_inner_pair + col_idx;
                int indexes[4] = {index_outer, index_inner, index_outer_pair, index_inner_pair};

                QGD_Complex16 element_outer      = input[index_outer];
                QGD_Complex16 element_inner      = input[index_inner];
                QGD_Complex16 element_outer_pair = input[index_outer_pair];
                QGD_Complex16 element_inner_pair = input[index_inner_pair];

                QGD_Complex16 tmp1, tmp2, tmp3, tmp4;
                for (int mult_idx=0; mult_idx<4; mult_idx++){
                    tmp1 = mult(two_qbit_unitary[mult_idx*4],     element_outer);
                    tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                    tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                    tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                    input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                    input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
                }
            }
        }
    }
    current_idx = current_idx + (index_step_outer << 1);
}

// Three-qubit kernel: combine the eight amplitudes selected by the three involved qubits
// with the 8x8 unitary.
int index_step_inner  = 1 << involved_qbits[0];
int index_step_middle = 1 << involved_qbits[1];
int index_step_outer  = 1 << involved_qbits[2];

for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < matrix_size; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

    for (int current_idx_middle = 0; current_idx_middle < index_step_outer; current_idx_middle = current_idx_middle + (index_step_middle << 1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_middle; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

            for (int idx=0; idx<index_step_inner; idx++){

                int current_idx_loc      = current_idx + current_idx_middle + current_idx_inner + idx;
                int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                // the eight basis-state indices paired by the three involved qubits
                int current_idx_outer_loc        = current_idx_loc;
                int current_idx_inner_loc        = current_idx_loc + index_step_inner;

                int current_idx_middle_loc       = current_idx_loc + index_step_middle;
                int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                int current_idx_outer_pair_loc        = current_idx_pair_loc;
                int current_idx_inner_pair_loc        = current_idx_pair_loc + index_step_inner;

                int current_idx_middle_pair_loc       = current_idx_pair_loc + index_step_middle;
                int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;

                int indexes[8] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_middle_loc, current_idx_middle_inner_loc, current_idx_outer_pair_loc, current_idx_inner_pair_loc, current_idx_middle_pair_loc, current_idx_middle_inner_pair_loc};

                QGD_Complex16 element_outer      = input[current_idx_outer_loc];
                QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];

                QGD_Complex16 element_inner      = input[current_idx_inner_loc];
                QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

                QGD_Complex16 element_middle      = input[current_idx_middle_loc];
                QGD_Complex16 element_middle_pair = input[current_idx_middle_pair_loc];

                QGD_Complex16 element_middle_inner      = input[current_idx_middle_inner_loc];
                QGD_Complex16 element_middle_inner_pair = input[current_idx_middle_inner_pair_loc];

                // multiply the amplitude 8-vector by the 8x8 three-qubit unitary
                QGD_Complex16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
                for (int mult_idx=0; mult_idx<8; mult_idx++){
                    tmp1 = mult(unitary[mult_idx*8],     element_outer);
                    tmp2 = mult(unitary[mult_idx*8 + 1], element_inner);
                    tmp3 = mult(unitary[mult_idx*8 + 2], element_middle);
                    tmp4 = mult(unitary[mult_idx*8 + 3], element_middle_inner);
                    tmp5 = mult(unitary[mult_idx*8 + 4], element_outer_pair);
                    tmp6 = mult(unitary[mult_idx*8 + 5], element_inner_pair);
                    tmp7 = mult(unitary[mult_idx*8 + 6], element_middle_pair);
                    tmp8 = mult(unitary[mult_idx*8 + 7], element_middle_inner_pair);
                    input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real + tmp5.real + tmp6.real + tmp7.real + tmp8.real;
                    input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag + tmp5.imag + tmp6.imag + tmp7.imag + tmp8.imag;
                }
            }
        }
    }
    current_idx = current_idx + (index_step_outer << 1);
}

// Controlled single-qubit kernel: depending on the control bit of the current row index,
// transform each paired row with either u3_1qbit1 or u3_1qbit2.
for (int current_idx_pair = current_idx + index_step_target; current_idx_pair < matrix_size; current_idx_pair = current_idx_pair + (index_step_target << 1) ) {

    for (int idx=0; idx<index_step_target; idx++) {

        int current_idx_loc      = current_idx + idx;
        int current_idx_pair_loc = current_idx_pair + idx;

        int row_offset      = current_idx_loc * input.stride;
        int row_offset_pair = current_idx_pair_loc * input.stride;

        for (int col_idx=0; col_idx<input.cols; col_idx++) {

            int index      = row_offset + col_idx;
            int index_pair = row_offset_pair + col_idx;

            QGD_Complex16 element      = input[index];
            QGD_Complex16 element_pair = input[index_pair];
            QGD_Complex16 tmp1, tmp2;

            if ( (current_idx_loc >> control_qbit) & 1 ) {

                // the control bit is set: apply the first single-qubit unitary
                tmp1 = mult(u3_1qbit1[0], element);
                tmp2 = mult(u3_1qbit1[1], element_pair);
                input[index].real = tmp1.real + tmp2.real;
                input[index].imag = tmp1.imag + tmp2.imag;

                tmp1 = mult(u3_1qbit1[2], element);
                tmp2 = mult(u3_1qbit1[3], element_pair);
                input[index_pair].real = tmp1.real + tmp2.real;
                input[index_pair].imag = tmp1.imag + tmp2.imag;
            }
            else {

                // the control bit is not set: apply the second single-qubit unitary
                tmp1 = mult(u3_1qbit2[0], element);
                tmp2 = mult(u3_1qbit2[1], element_pair);
                input[index].real = tmp1.real + tmp2.real;
                input[index].imag = tmp1.imag + tmp2.imag;

                tmp1 = mult(u3_1qbit2[2], element);
                tmp2 = mult(u3_1qbit2[3], element_pair);
                input[index_pair].real = tmp1.real + tmp2.real;
                input[index_pair].imag = tmp1.imag + tmp2.imag;
            }
        }
    }
    current_idx = current_idx + (index_step_target << 1);
}

// AVX-vectorized variant of the controlled kernel.
// Broadcast the real and imaginary parts of the first 2x2 unitary into AVX registers.
__m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
__m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
__m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
__m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
__m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
__m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
__m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
__m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);

// Broadcast the entries of the second 2x2 unitary.
__m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
__m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
__m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
__m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
__m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
__m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
__m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
__m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);

for (int current_idx_pair = current_idx + index_step_target; current_idx_pair < matrix_size; current_idx_pair = current_idx_pair + (index_step_target << 1) ) {

    for (int idx = 0; idx < index_step_target; idx++) {

        int current_idx_loc      = current_idx + idx;
        int current_idx_pair_loc = current_idx_pair + idx;

        int row_offset      = current_idx_loc * input.stride;
        int row_offset_pair = current_idx_pair_loc * input.stride;

        // process the row pair in chunks of 4 complex elements (8 doubles)
        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

            double* element      = (double*)input.get_data() + 2 * row_offset;
            double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

            if ((current_idx_loc >> control_qbit) & 1) {
                // the control bit is set: transform the row pair with the first 2x2 unitary
                // de-interleave the real and imaginary parts of 4 consecutive complex elements
                __m256d element_vec  = _mm256_load_pd(element + col_idx);
                __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                element_vec  = tmp;

                __m256d element_pair_vec  = _mm256_load_pd(element_pair + col_idx);
                __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                element_pair_vec = tmp;

                // real part of u00*x + u01*x_pair
                __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                vec3 = _mm256_add_pd(vec3, vec4);
                // imaginary part of u00*x + u01*x_pair
                __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                vec5 = _mm256_add_pd(vec5, vec6);

                // interleave back and store the updated row
                tmp  = _mm256_shuffle_pd(vec3, vec5, 0);
                vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                vec3 = tmp;
                _mm256_store_pd(element + col_idx, vec3);
                _mm256_store_pd(element + col_idx + 4, vec5);

                // real and imaginary parts of u10*x + u11*x_pair
                __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                vec7 = _mm256_add_pd(vec7, vec8);
                __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                vec9 = _mm256_add_pd(vec9, vec10);

                // interleave back and store the updated pair row
                tmp  = _mm256_shuffle_pd(vec7, vec9, 0);
                vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                vec7 = tmp;
                _mm256_store_pd(element_pair + col_idx, vec7);
                _mm256_store_pd(element_pair + col_idx + 4, vec9);
            }
            else {
                // the control bit is not set: transform the row pair with the second 2x2 unitary
                __m256d element_vec  = _mm256_load_pd(element + col_idx);
                __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                element_vec  = tmp;

                __m256d element_pair_vec  = _mm256_load_pd(element_pair + col_idx);
                __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                element_pair_vec = tmp;

                __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                vec3 = _mm256_add_pd(vec3, vec4);
                __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                vec5 = _mm256_add_pd(vec5, vec6);

                tmp  = _mm256_shuffle_pd(vec3, vec5, 0);
                vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                vec3 = tmp;
                _mm256_store_pd(element + col_idx, vec3);
                _mm256_store_pd(element + col_idx + 4, vec5);

                __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                vec7 = _mm256_add_pd(vec7, vec8);
                __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                vec9 = _mm256_add_pd(vec9, vec10);

                tmp  = _mm256_shuffle_pd(vec7, vec9, 0);
                vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                vec7 = tmp;
                _mm256_store_pd(element_pair + col_idx, vec7);
                _mm256_store_pd(element_pair + col_idx + 4, vec9);
            }
        }
        // handle leftover columns that do not fill a full AVX vector
        int remainder = input.cols % 4;
        if (remainder != 0) {

            for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {
                int index      = row_offset + col_idx;
                int index_pair = row_offset_pair + col_idx;

                QGD_Complex16 element      = input[index];
                QGD_Complex16 element_pair = input[index_pair];
                QGD_Complex16 tmp1, tmp2;

                if ( (current_idx_loc >> control_qbit) & 1 ) {

                    tmp1 = mult(u3_1qbit1[0], element);
                    tmp2 = mult(u3_1qbit1[1], element_pair);
                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit1[2], element);
                    tmp2 = mult(u3_1qbit1[3], element_pair);
                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
                else {

                    tmp1 = mult(u3_1qbit2[0], element);
                    tmp2 = mult(u3_1qbit2[1], element_pair);
                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit2[2], element);
                    tmp2 = mult(u3_1qbit2[3], element_pair);
                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
            }
        }
    }
    current_idx = current_idx + (index_step_target << 1);
}

Definitions of the types and members referenced above:

int stride
    The column stride of the array: the elements of one row are a_0, a_1, ..., a_{cols-1}, followed by stride - cols zeros of padding (see the layout sketch after this list).
QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
    Call to calculate the product of two complex scalars (see the sketch after this list).
scalar* get_data() const
    Call to get the pointer to the stored data.
int rows
    The number of rows.
int cols
    The number of columns.
QGD_Complex16
    Structure type representing complex numbers in the SQUANDER package.
(matrix container, the type of input above)
    Class to store the data of complex arrays and its properties.
double real
    The real part of a complex number.
double imag
    The imaginary part of a complex number.