// choose a work-chunk (grain) size inversely proportional to the index step
for (int step=1; step<7; step++){
    if (index_step <= 1<<step){
        grain_size = 256/(1<<step);
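The grain size chosen above presumably controls how the index range is chunked for parallel execution; the surrounding parallel loop is not part of this excerpt. A minimal sketch of such a chunked loop with Intel TBB (an assumption: the function and variable names here are illustrative only):

#include <tbb/parallel_for.h>
#include <tbb/blocked_range.h>

// Illustrative sketch only: split the [0, rows) index range into chunks of
// roughly grain_size iterations and process the chunks in parallel.
void process_rows_in_parallel(int rows, int grain_size) {
    tbb::parallel_for( tbb::blocked_range<int>(0, rows, grain_size),
        [&](const tbb::blocked_range<int>& r) {
            for (int row_idx = r.begin(); row_idx < r.end(); row_idx++) {
                // per-index kernel work would go here
            }
        });
}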
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

// iterate over the blocks of basis states that differ in the outer target qubit
for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){

        for (int idx=0; idx<index_step_inner; idx++){

            // the four amplitudes mixed by the 4x4 kernel: the inner/outer target-qubit
            // bits run through 00, 01, 10, 11 on top of a common base index
            int current_idx_outer_loc = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
            int indexes[4] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_outer_pair_loc, current_idx_inner_pair_loc};

            QGD_Complex16 element_outer = input[current_idx_outer_loc];
            QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];
            QGD_Complex16 element_inner = input[current_idx_inner_loc];
            QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

            // multiply the row-major 4x4 kernel with the vector of the four amplitudes
            QGD_Complex16 tmp1, tmp2, tmp3, tmp4;
            for (int mult_idx=0; mult_idx<4; mult_idx++){
                tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
                tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
            }
        }
    }

    current_idx = current_idx + (index_step_outer << 1);
}
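In the loop above, indexes[4] lists the four amplitudes entering the 4x4 kernel: starting from a base index whose inner and outer target-qubit bits are both zero, the bit patterns 00, 01, 10, 11 are added. A minimal restatement of that index arithmetic (the helper name is hypothetical):

// Hypothetical helper: the four state-vector indices touched by a two-qubit kernel
// acting on qubits inner_qbit and outer_qbit, given a base index whose inner and
// outer bits are both zero. Matches the ordering of indexes[4] above.
void two_qbit_indices(int base, int inner_qbit, int outer_qbit, int indices[4]) {
    indices[0] = base;                                          // outer = 0, inner = 0
    indices[1] = base + (1 << inner_qbit);                      // outer = 0, inner = 1
    indices[2] = base + (1 << outer_qbit);                      // outer = 1, inner = 0
    indices[3] = base + (1 << outer_qbit) + (1 << inner_qbit);  // outer = 1, inner = 1
}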
int index_step_outer = 1 << outer_qbit;
int index_step_inner = 1 << inner_qbit;

for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){

    for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){

        for (int idx=0; idx<index_step_inner; idx++){

            int current_idx_outer_loc = current_idx + current_idx_inner + idx;
            int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

            // stride-aware offsets of the four rows touched by the kernel
            int row_offset_outer = current_idx_outer_loc*input.stride;
            int row_offset_outer_pair = current_idx_outer_pair_loc*input.stride;
            int row_offset_inner = current_idx_inner_loc*input.stride;
            int row_offset_inner_pair = current_idx_inner_pair_loc*input.stride;

            // apply the kernel to every column of the four rows
            for (int col_idx=0; col_idx<input.cols; col_idx++) {

                int index_outer = row_offset_outer+col_idx;
                int index_outer_pair = row_offset_outer_pair+col_idx;
                int index_inner = row_offset_inner+col_idx;
                int index_inner_pair = row_offset_inner_pair + col_idx;
                int indexes[4] = {index_outer, index_inner, index_outer_pair, index_inner_pair};

                QGD_Complex16 element_outer = input[index_outer];
                QGD_Complex16 element_outer_pair = input[index_outer_pair];
                QGD_Complex16 element_inner = input[index_inner];
                QGD_Complex16 element_inner_pair = input[index_inner_pair];

                QGD_Complex16 tmp1, tmp2, tmp3, tmp4;
                for (int mult_idx=0; mult_idx<4; mult_idx++){
                    tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
                    tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                    tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                    tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                    input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                    input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
                }
            }
        }
    }

    current_idx = current_idx + (index_step_outer << 1);
}
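This matrix variant computes row offsets with input.stride rather than input.cols, since each stored row may be padded (see the reference notes at the end of this listing). Addressing element (row, col) of such an array looks like the sketch below; the index variables are illustrative, not taken from the code above.

// Illustrative sketch: element (row, col) of the padded, row-major complex array.
// Each row occupies input.stride entries in memory, of which only the first
// input.cols entries are payload.
int row = 2, col = 1;
QGD_Complex16 value = input[row*input.stride + col];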
int index_step_inner = 1 << involved_qbits[0];
int index_step_middle = 1 << involved_qbits[1];
int index_step_outer = 1 << involved_qbits[2];

for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<matrix_size; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){

    for (int current_idx_middle = 0; current_idx_middle < index_step_outer; current_idx_middle=current_idx_middle+(index_step_middle<<1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_middle; current_idx_inner=current_idx_inner+(index_step_inner<<1)){

            for (int idx=0; idx<index_step_inner; idx++){

                int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
                int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                // the eight amplitudes mixed by the 8x8 kernel: all bit patterns of the
                // inner, middle and outer qubits on top of the base index current_idx_loc
                int current_idx_outer_loc = current_idx_loc;
                int current_idx_inner_loc = current_idx_loc + index_step_inner;

                int current_idx_middle_loc = current_idx_loc + index_step_middle;
                int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                int current_idx_outer_pair_loc = current_idx_pair_loc;
                int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                int current_idx_middle_pair_loc = current_idx_pair_loc + index_step_middle;
                int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;

                int indexes[8] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_middle_loc, current_idx_middle_inner_loc,
                                  current_idx_outer_pair_loc, current_idx_inner_pair_loc, current_idx_middle_pair_loc, current_idx_middle_inner_pair_loc};

                QGD_Complex16 element_outer = input[current_idx_outer_loc];
                QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];

                QGD_Complex16 element_inner = input[current_idx_inner_loc];
                QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

                QGD_Complex16 element_middle = input[current_idx_middle_loc];
                QGD_Complex16 element_middle_pair = input[current_idx_middle_pair_loc];

                QGD_Complex16 element_middle_inner = input[current_idx_middle_inner_loc];
                QGD_Complex16 element_middle_inner_pair = input[current_idx_middle_inner_pair_loc];

                // multiply the row-major 8x8 kernel with the vector of the eight amplitudes
                // and accumulate the products, as in the two-qubit kernels above
                QGD_Complex16 tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
                for (int mult_idx=0; mult_idx<8; mult_idx++){
                    tmp1 = mult(unitary[mult_idx*8], element_outer);
                    tmp2 = mult(unitary[mult_idx*8 + 1], element_inner);
                    tmp3 = mult(unitary[mult_idx*8 + 2], element_middle);
                    tmp4 = mult(unitary[mult_idx*8 + 3], element_middle_inner);
                    tmp5 = mult(unitary[mult_idx*8 + 4], element_outer_pair);
                    tmp6 = mult(unitary[mult_idx*8 + 5], element_inner_pair);
                    tmp7 = mult(unitary[mult_idx*8 + 6], element_middle_pair);
                    tmp8 = mult(unitary[mult_idx*8 + 7], element_middle_inner_pair);
                    input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real + tmp5.real + tmp6.real + tmp7.real + tmp8.real;
                    input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag + tmp5.imag + tmp6.imag + tmp7.imag + tmp8.imag;
                }
            }
        }
    }

    current_idx = current_idx + (index_step_outer << 1);
}
for (int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {

    for (int idx=0; idx<index_step_target; idx++) {

        int current_idx_loc = current_idx + idx;
        int current_idx_pair_loc = current_idx_pair + idx;

        int row_offset = current_idx_loc*input.stride;
        int row_offset_pair = current_idx_pair_loc*input.stride;

        for (int col_idx=0; col_idx<input.cols; col_idx++) {

            int index = row_offset+col_idx;
            int index_pair = row_offset_pair+col_idx;

            QGD_Complex16 element = input[index];
            QGD_Complex16 element_pair = input[index_pair];
            QGD_Complex16 tmp1, tmp2;

            // the control bit of the row index selects which 2x2 kernel is applied
            if ( (current_idx_loc >> control_qbit) & 1 ) {

                tmp1 = mult(u3_1qbit1[0], element);
                tmp2 = mult(u3_1qbit1[1], element_pair);

                input[index].real = tmp1.real + tmp2.real;
                input[index].imag = tmp1.imag + tmp2.imag;

                tmp1 = mult(u3_1qbit1[2], element);
                tmp2 = mult(u3_1qbit1[3], element_pair);

                input[index_pair].real = tmp1.real + tmp2.real;
                input[index_pair].imag = tmp1.imag + tmp2.imag;
            }
            else {

                tmp1 = mult(u3_1qbit2[0], element);
                tmp2 = mult(u3_1qbit2[1], element_pair);

                input[index].real = tmp1.real + tmp2.real;
                input[index].imag = tmp1.imag + tmp2.imag;

                tmp1 = mult(u3_1qbit2[2], element);
                tmp2 = mult(u3_1qbit2[3], element_pair);

                input[index_pair].real = tmp1.real + tmp2.real;
                input[index_pair].imag = tmp1.imag + tmp2.imag;
            }
        }
    }

    current_idx = current_idx + (index_step_target << 1);
}
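In this scalar variant the control bit of the row index selects which 2x2 kernel acts on the amplitude pair (input[index], input[index_pair]): u3_1qbit1 when the bit is set, u3_1qbit2 otherwise. The per-pair update is an ordinary 2x2 complex matrix-vector product; a minimal sketch with a hypothetical helper name:

// Hypothetical helper (not part of the library): apply a row-major 2x2 complex
// kernel k[4] to the amplitude pair (a, b), i.e. (a, b) <- K * (a, b)^T.
void apply_2x2_kernel(QGD_Complex16 k[4], QGD_Complex16& a, QGD_Complex16& b) {
    QGD_Complex16 t0 = mult(k[0], a);
    QGD_Complex16 t1 = mult(k[1], b);
    QGD_Complex16 t2 = mult(k[2], a);
    QGD_Complex16 t3 = mult(k[3], b);
    a.real = t0.real + t1.real;   a.imag = t0.imag + t1.imag;
    b.real = t2.real + t3.real;   b.imag = t2.imag + t3.imag;
}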
// broadcast the real and imaginary parts of the two 2x2 kernels into AVX registers
__m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
__m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
__m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
__m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
__m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
__m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
__m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
__m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);

__m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
__m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
__m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
__m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
__m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
__m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
__m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
__m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);
for (int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {

    for (int idx = 0; idx < index_step_target; idx++) {

        int current_idx_loc = current_idx + idx;
        int current_idx_pair_loc = current_idx_pair + idx;

        int row_offset = current_idx_loc * input.stride;
        int row_offset_pair = current_idx_pair_loc * input.stride;

        // vectorized pass: 4 complex matrix elements (8 doubles) per iteration;
        // the remaining input.cols % 4 columns are handled by the scalar tail below
        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

            double* element = (double*)input.get_data() + 2 * row_offset;            // 2 doubles per complex entry
            double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

            if ((current_idx_loc >> control_qbit) & 1) {
                // load 4 complex entries of the "element" row and de-interleave them into
                // a vector of real parts (element_vec) and a vector of imaginary parts (element_vec2)
                __m256d element_vec = _mm256_load_pd(element + col_idx);
                __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                element_vec = tmp;

                // the same for the paired row
                __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                element_pair_vec = tmp;

                // real part of u00*x + u01*y
                __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                vec3 = _mm256_add_pd(vec3, vec4);
                // imaginary part of u00*x + u01*y
                __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                vec5 = _mm256_add_pd(vec5, vec6);

                // re-interleave real/imaginary parts and store the updated "element" row
                tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                vec3 = tmp;
                _mm256_store_pd(element + col_idx, vec3);
                _mm256_store_pd(element + col_idx + 4, vec5);

                // real part of u10*x + u11*y
                __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                vec7 = _mm256_add_pd(vec7, vec8);
                // imaginary part of u10*x + u11*y
                __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                vec9 = _mm256_add_pd(vec9, vec10);

                // re-interleave and store the updated "element_pair" row
                tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                vec7 = tmp;
                _mm256_store_pd(element_pair + col_idx, vec7);
                _mm256_store_pd(element_pair + col_idx + 4, vec9);
            }
            else {
                // control qubit not set: apply the second kernel (u3_1qbit2) in the same way
                __m256d element_vec = _mm256_load_pd(element + col_idx);
                __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                element_vec = tmp;

                __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                element_pair_vec = tmp;

                __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                vec3 = _mm256_add_pd(vec3, vec4);
                __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                vec5 = _mm256_add_pd(vec5, vec6);

                tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                vec3 = tmp;
                _mm256_store_pd(element + col_idx, vec3);
                _mm256_store_pd(element + col_idx + 4, vec5);

                __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                vec7 = _mm256_add_pd(vec7, vec8);
                __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                vec9 = _mm256_add_pd(vec9, vec10);

                tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                vec7 = tmp;
                _mm256_store_pd(element_pair + col_idx, vec7);
                _mm256_store_pd(element_pair + col_idx + 4, vec9);
            }
        }
        // scalar tail: process the remaining input.cols % 4 columns one by one
        int remainder = input.cols % 4;
        if (remainder != 0) {

            for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {

                int index = row_offset+col_idx;
                int index_pair = row_offset_pair+col_idx;

                QGD_Complex16 element = input[index];
                QGD_Complex16 element_pair = input[index_pair];
                QGD_Complex16 tmp1, tmp2;

                if ( (current_idx_loc >> control_qbit) & 1 ) {

                    tmp1 = mult(u3_1qbit1[0], element);
                    tmp2 = mult(u3_1qbit1[1], element_pair);

                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit1[2], element);
                    tmp2 = mult(u3_1qbit1[3], element_pair);

                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
                else {

                    tmp1 = mult(u3_1qbit2[0], element);
                    tmp2 = mult(u3_1qbit2[1], element_pair);

                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit2[2], element);
                    tmp2 = mult(u3_1qbit2[3], element_pair);

                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
            }
        }
    }

    current_idx = current_idx + (index_step_target << 1);
}
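The vectorized branches above work on de-interleaved data: element_vec/element_vec2 hold the real and imaginary parts of four consecutive complex entries, and each kernel coefficient is applied with the mul/fnmadd/fmadd pattern of complex multiplication, (c + d*i)(x + y*i) = (c*x - d*y) + (c*y + d*x)*i. A self-contained sketch of that pattern (hypothetical function name, assumes AVX2/FMA support):

#include <immintrin.h>

// Hypothetical sketch: multiply four complex numbers, given as separate vectors of
// real parts (re) and imaginary parts (im), by the complex scalar c + d*i,
// using the same mul/fnmadd/fmadd pattern as the kernel above.
static inline void complex_scale4(double c, double d,
                                  __m256d re, __m256d im,
                                  __m256d* out_re, __m256d* out_im) {
    __m256d c_vec = _mm256_broadcast_sd(&c);
    __m256d d_vec = _mm256_broadcast_sd(&d);
    __m256d res_re = _mm256_mul_pd(c_vec, re);          // c*x
    res_re = _mm256_fnmadd_pd(d_vec, im, res_re);       // c*x - d*y
    __m256d res_im = _mm256_mul_pd(c_vec, im);          // c*y
    res_im = _mm256_fmadd_pd(d_vec, re, res_im);        // c*y + d*x
    *out_re = res_re;
    *out_im = res_im;
}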
Reference notes for the types and members used in the kernels above:

QGD_Complex16
    Structure type representing complex numbers in the SQUANDER package.
    double real  - the real part of a complex number
    double imag  - the imaginary part of a complex number

QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
    Call to calculate the product of two complex scalars (a minimal sketch follows this list).

The class of input (class to store data of complex arrays and its properties):
    int rows    - the number of rows
    int cols    - the number of columns
    int stride  - the column stride of the array; the elements of one row are
                  a_0, a_1, ..., a_{cols-1}, 0, 0, ..., 0, where the number of
                  trailing zeros is stride - cols
    scalar * get_data() const  - call to get the pointer to the stored data
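For reference, a minimal sketch consistent with the definitions above (the SQUANDER sources define these types themselves; this is only an illustration of the complex product used throughout the kernels):

// Illustration only: a struct with the documented layout and a mult() that forms
// the complex product (a.real + i*a.imag) * (b.real + i*b.imag).
struct QGD_Complex16 { double real; double imag; };

QGD_Complex16 mult(QGD_Complex16& a, QGD_Complex16& b) {
    QGD_Complex16 c;
    c.real = a.real*b.real - a.imag*b.imag;
    c.imag = a.real*b.imag + a.imag*b.real;
    return c;
}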