Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitaries into one- and two-qubit gates
apply_large_kernel_to_input.cpp
/*
Created on Fri Jun 26 14:13:26 2020
Copyright 2020 Peter Rakyta, Ph.D.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@author: Peter Rakyta, Ph.D.
*/
#include "apply_large_kernel_to_input.h" // assumed: header declaring these kernels and the Matrix/QGD_Complex16 types
#include "tbb/tbb.h"
#include <immintrin.h> // AVX intrinsics used in the _AVX kernel below
#include <vector>

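/**
@brief Determine the grain size used to partition the kernel loops (presumably for tbb::parallel_for) from the index stride of the transformed qubit.
@param index_step The index stride associated with the gate kernel.
@return Returns with the grain size.
*/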
int get_grain_size(int index_step){
    // grain_size is overwritten at every step that satisfies the condition, so the
    // last matching step wins: any index_step <= 64 yields a grain size of 4, and
    // larger strides keep the default of 2
    int grain_size = 2;
    for (int step = 1; step < 7; step++){
        if (index_step <= 1 << step){
            grain_size = 256 / (1 << step);
        }
    }
    return grain_size;
}

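/**
@brief Call to apply a two- or three-qubit gate kernel on an input matrix or state vector.
@param unitary The kernel matrix of the gate.
@param input The input matrix or state vector (a state vector has cols == 1).
@param involved_qbits The qubits the gate acts on, ordered from the lowest to the highest qubit index.
@param matrix_size The size of the input.
*/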
void apply_large_kernel_to_input(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){

    if (input.cols == 1){
        // state vector input: dispatch on the number of involved qubits
        switch ((int)involved_qbits.size()){
        case 2:
            apply_2qbit_kernel_to_state_vector_input(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
            break;
        case 3:
            apply_3qbit_kernel_to_state_vector_input(unitary, input, involved_qbits, matrix_size);
            break;
        }
    }
    else {
        // matrix input: only the two-qubit kernel is implemented
        apply_2qbit_kernel_to_matrix_input(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
    }
}

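/**
@brief Call to apply a two-qubit gate kernel on a state vector.
@param two_qbit_unitary The 4x4 kernel matrix of the gate.
@param input The state vector to be transformed.
@param inner_qbit The lower index of the two qubits the gate acts on.
@param outer_qbit The higher index of the two qubits the gate acts on.
@param matrix_size The size of the state vector.
*/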
void apply_2qbit_kernel_to_state_vector_input(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){

    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;
    int current_idx = 0;

    for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

            for (int idx = 0; idx < index_step_inner; idx++){

                // the four basis-state indices coupled by the two-qubit gate
                int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
                int indexes[4] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_outer_pair_loc, current_idx_inner_pair_loc};

                QGD_Complex16 element_outer = input[current_idx_outer_loc];
                QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];
                QGD_Complex16 element_inner = input[current_idx_inner_loc];
                QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

                QGD_Complex16 tmp1;
                QGD_Complex16 tmp2;
                QGD_Complex16 tmp3;
                QGD_Complex16 tmp4;
                // multiply the 4x4 kernel with the four coupled amplitudes
                for (int mult_idx = 0; mult_idx < 4; mult_idx++){

                    tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
                    tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                    tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                    tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                    input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                    input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
                }
            }
        }
        current_idx = current_idx + (index_step_outer << 1);
    }

}
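
/*
The loop nest above visits every basis-state index exactly once, in groups of
four: { i, i + 2^inner_qbit, i + 2^outer_qbit, i + 2^outer_qbit + 2^inner_qbit }.
The standalone sketch below reproduces the same index arithmetic with
std::complex instead of QGD_Complex16; the guard macro and function name are
hypothetical, and the block is an illustration rather than part of the library.
*/
#ifdef LARGE_KERNEL_INDEXING_DEMO
#include <complex>

static void apply_two_qbit_gate_demo(const std::complex<double> u[4][4],
                                     std::vector< std::complex<double> >& state,
                                     int inner_qbit, int outer_qbit) {
    const int step_inner = 1 << inner_qbit;
    const int step_outer = 1 << outer_qbit;
    for (int block = 0; block < (int)state.size(); block += 2*step_outer) {
        for (int mid = 0; mid < step_outer; mid += 2*step_inner) {
            for (int idx = 0; idx < step_inner; idx++) {
                // |q_outer q_inner> = |00>, |01>, |10>, |11> for the four coupled amplitudes
                const int base = block + mid + idx;
                const int ids[4] = { base, base + step_inner, base + step_outer, base + step_outer + step_inner };
                const std::complex<double> in[4] = { state[ids[0]], state[ids[1]], state[ids[2]], state[ids[3]] };
                for (int row = 0; row < 4; row++) {
                    std::complex<double> acc(0.0, 0.0);
                    for (int col = 0; col < 4; col++) acc += u[row][col] * in[col];
                    state[ids[row]] = acc;
                }
            }
        }
    }
}
#endif // LARGE_KERNEL_INDEXING_DEMO

/**
@brief Call to apply a two-qubit gate kernel on an input matrix.
@param two_qbit_unitary The 4x4 kernel matrix of the gate.
@param input The matrix to be transformed.
@param inner_qbit The lower index of the two qubits the gate acts on.
@param outer_qbit The higher index of the two qubits the gate acts on.
@param matrix_size The size of the input matrix.
*/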
void apply_2qbit_kernel_to_matrix_input(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){

    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;
    int current_idx = 0;

    for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < input.rows; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

            for (int idx = 0; idx < index_step_inner; idx++){

                int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

                int row_offset_outer = current_idx_outer_loc*input.stride;
                int row_offset_outer_pair = current_idx_outer_pair_loc*input.stride;
                int row_offset_inner = current_idx_inner_loc*input.stride;
                int row_offset_inner_pair = current_idx_inner_pair_loc*input.stride;

                // transform each column of the four coupled rows
                for (int col_idx = 0; col_idx < input.cols; col_idx++) {
                    int index_outer = row_offset_outer + col_idx;
                    int index_outer_pair = row_offset_outer_pair + col_idx;
                    int index_inner = row_offset_inner + col_idx;
                    int index_inner_pair = row_offset_inner_pair + col_idx;
                    int indexes[4] = {index_outer, index_inner, index_outer_pair, index_inner_pair};
                    QGD_Complex16 element_outer = input[index_outer];
                    QGD_Complex16 element_outer_pair = input[index_outer_pair];
                    QGD_Complex16 element_inner = input[index_inner];
                    QGD_Complex16 element_inner_pair = input[index_inner_pair];

                    QGD_Complex16 tmp1;
                    QGD_Complex16 tmp2;
                    QGD_Complex16 tmp3;
                    QGD_Complex16 tmp4;
                    for (int mult_idx = 0; mult_idx < 4; mult_idx++){

                        tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
                        tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
                        tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
                        tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
                        input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
                        input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
                    }
                }
            }
        }
        current_idx = current_idx + (index_step_outer << 1);
    }

}

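/**
@brief Call to apply a three-qubit gate kernel on a state vector.
@param unitary The 8x8 kernel matrix of the gate.
@param input The state vector to be transformed.
@param involved_qbits The three qubits the gate acts on, ordered from the lowest to the highest qubit index.
@param matrix_size The size of the state vector.
*/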
void apply_3qbit_kernel_to_state_vector_input(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){

    int index_step_inner = 1 << involved_qbits[0];
    int index_step_middle = 1 << involved_qbits[1];
    int index_step_outer = 1 << involved_qbits[2];
    int current_idx = 0;

    for (int current_idx_pair_outer = current_idx + index_step_outer; current_idx_pair_outer < matrix_size; current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1)){

        for (int current_idx_middle = 0; current_idx_middle < index_step_outer; current_idx_middle = current_idx_middle + (index_step_middle << 1)){

            for (int current_idx_inner = 0; current_idx_inner < index_step_middle; current_idx_inner = current_idx_inner + (index_step_inner << 1)){

                for (int idx = 0; idx < index_step_inner; idx++){

                    // the eight basis-state indices coupled by the three-qubit gate
                    int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
                    int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                    int current_idx_outer_loc = current_idx_loc;
                    int current_idx_inner_loc = current_idx_loc + index_step_inner;

                    int current_idx_middle_loc = current_idx_loc + index_step_middle;
                    int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                    int current_idx_outer_pair_loc = current_idx_pair_loc;
                    int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                    int current_idx_middle_pair_loc = current_idx_pair_loc + index_step_middle;
                    int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;

                    int indexes[8] = {current_idx_outer_loc, current_idx_inner_loc, current_idx_middle_loc, current_idx_middle_inner_loc, current_idx_outer_pair_loc, current_idx_inner_pair_loc, current_idx_middle_pair_loc, current_idx_middle_inner_pair_loc};

                    QGD_Complex16 element_outer = input[current_idx_outer_loc];
                    QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];

                    QGD_Complex16 element_inner = input[current_idx_inner_loc];
                    QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];

                    QGD_Complex16 element_middle = input[current_idx_middle_loc];
                    QGD_Complex16 element_middle_pair = input[current_idx_middle_pair_loc];

                    QGD_Complex16 element_middle_inner = input[current_idx_middle_inner_loc];
                    QGD_Complex16 element_middle_inner_pair = input[current_idx_middle_inner_pair_loc];

                    QGD_Complex16 tmp1;
                    QGD_Complex16 tmp2;
                    QGD_Complex16 tmp3;
                    QGD_Complex16 tmp4;
                    QGD_Complex16 tmp5;
                    QGD_Complex16 tmp6;
                    QGD_Complex16 tmp7;
                    QGD_Complex16 tmp8;
                    // multiply the 8x8 kernel with the eight coupled amplitudes
                    for (int mult_idx = 0; mult_idx < 8; mult_idx++){
                        tmp1 = mult(unitary[mult_idx*8], element_outer);
                        tmp2 = mult(unitary[mult_idx*8 + 1], element_inner);
                        tmp3 = mult(unitary[mult_idx*8 + 2], element_middle);
                        tmp4 = mult(unitary[mult_idx*8 + 3], element_middle_inner);
                        tmp5 = mult(unitary[mult_idx*8 + 4], element_outer_pair);
                        tmp6 = mult(unitary[mult_idx*8 + 5], element_inner_pair);
                        tmp7 = mult(unitary[mult_idx*8 + 6], element_middle_pair);
                        tmp8 = mult(unitary[mult_idx*8 + 7], element_middle_inner_pair);
                        input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real + tmp5.real + tmp6.real + tmp7.real + tmp8.real;
                        input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag + tmp5.imag + tmp6.imag + tmp7.imag + tmp8.imag;
                    }
                }
            }
        }
        // advance in lockstep with current_idx_pair_outer, once per outer iteration
        current_idx = current_idx + (index_step_outer << 1);
    }
}

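/**
@brief Call to apply the CROT gate kernel on an input matrix.
@param u3_1qbit1 The 2x2 kernel applied to the rows whose control qubit is set.
@param u3_1qbit2 The 2x2 kernel applied to the rows whose control qubit is not set.
@param input The matrix to be transformed.
@param target_qbit The index of the target qubit.
@param control_qbit The index of the control qubit.
@param matrix_size The size of the input.
*/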
void
apply_crot_kernel_to_matrix_input(Matrix& u3_1qbit1, Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {

    int index_step_target = 1 << target_qbit;
    int current_idx = 0;

    for (int current_idx_pair = current_idx + index_step_target; current_idx_pair < matrix_size; current_idx_pair = current_idx_pair + (index_step_target << 1)) {

        for (int idx = 0; idx < index_step_target; idx++) {

            int current_idx_loc = current_idx + idx;
            int current_idx_pair_loc = current_idx_pair + idx;

            int row_offset = current_idx_loc*input.stride;
            int row_offset_pair = current_idx_pair_loc*input.stride;
            for (int col_idx = 0; col_idx < input.cols; col_idx++) {

                int index = row_offset + col_idx;
                int index_pair = row_offset_pair + col_idx;
                if ((current_idx_loc >> control_qbit) & 1) {
                    // control qubit set: apply the first single-qubit kernel
                    QGD_Complex16 element = input[index];
                    QGD_Complex16 element_pair = input[index_pair];

                    QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                    QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit1[2], element);
                    tmp2 = mult(u3_1qbit1[3], element_pair);

                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
                else {
                    // control qubit not set: apply the second single-qubit kernel
                    QGD_Complex16 element = input[index];
                    QGD_Complex16 element_pair = input[index_pair];

                    QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                    QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit2[2], element);
                    tmp2 = mult(u3_1qbit2[3], element_pair);

                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;
                }
            }
        }

        current_idx = current_idx + (index_step_target << 1);
    }

}
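
/**
@brief Call to apply the CROT gate kernel on an input matrix using AVX.
@param u3_1qbit1 The 2x2 kernel applied to the rows whose control qubit is set.
@param u3_1qbit2 The 2x2 kernel applied to the rows whose control qubit is not set.
@param input The matrix to be transformed.
@param target_qbit The index of the target qubit.
@param control_qbit The index of the control qubit.
@param matrix_size The size of the input.
*/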
void
apply_crot_kernel_to_matrix_input_AVX(Matrix& u3_1qbit1, Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;
    int current_idx = 0;

    // load elements of the first U3 unitary into 256-bit registers (8 registers)
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);
    // load elements of the second U3 unitary into 256-bit registers (8 registers)
    __m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
    __m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
    __m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
    __m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
    __m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
    __m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
    __m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
    __m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);


    for (int current_idx_pair = current_idx + index_step_target; current_idx_pair < matrix_size; current_idx_pair = current_idx_pair + (index_step_target << 1)) {

        for (int idx = 0; idx < index_step_target; idx++) {

            int current_idx_loc = current_idx + idx;
            int current_idx_pair_loc = current_idx_pair + idx;

            int row_offset = current_idx_loc * input.stride;
            int row_offset_pair = current_idx_pair_loc * input.stride;
            // process four complex elements (8 doubles) per iteration; the leftover
            // columns are handled by the scalar loop below
            for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {
                double* element = (double*)input.get_data() + 2 * row_offset;
                double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;
                if ((current_idx_loc >> control_qbit) & 1) {

                    // extract successive elements from arrays element, element_pair and
                    // deinterleave their real and imaginary parts
                    __m256d element_vec = _mm256_load_pd(element + col_idx);
                    __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                    __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                    element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                    element_vec = tmp;

                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                    __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                    tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                    element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                    element_pair_vec = tmp;

                    __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                    vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                    __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                    vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                    vec3 = _mm256_add_pd(vec3, vec4);
                    __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                    vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                    __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                    vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                    vec5 = _mm256_add_pd(vec5, vec6);

                    // re-interleave and store the transformed elements of the target row
                    tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                    vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                    vec3 = tmp;
                    _mm256_store_pd(element + col_idx, vec3);
                    _mm256_store_pd(element + col_idx + 4, vec5);

                    __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                    vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                    __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                    vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                    vec7 = _mm256_add_pd(vec7, vec8);
                    __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                    vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                    __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                    vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                    vec9 = _mm256_add_pd(vec9, vec10);

                    // re-interleave and store the transformed elements of the pair row
                    tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                    vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                    vec7 = tmp;
                    _mm256_store_pd(element_pair + col_idx, vec7);
                    _mm256_store_pd(element_pair + col_idx + 4, vec9);

                }
                else {

                    // extract successive elements from arrays element, element_pair and
                    // deinterleave their real and imaginary parts
                    __m256d element_vec = _mm256_load_pd(element + col_idx);
                    __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                    __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                    element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                    element_vec = tmp;

                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                    __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                    tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                    element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                    element_pair_vec = tmp;

                    __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                    vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                    __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                    vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                    vec3 = _mm256_add_pd(vec3, vec4);
                    __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                    vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                    __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                    vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                    vec5 = _mm256_add_pd(vec5, vec6);

                    // re-interleave and store the transformed elements of the target row
                    tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                    vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                    vec3 = tmp;
                    _mm256_store_pd(element + col_idx, vec3);
                    _mm256_store_pd(element + col_idx + 4, vec5);

                    __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                    vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                    __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                    vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                    vec7 = _mm256_add_pd(vec7, vec8);
                    __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                    vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                    __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                    vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                    vec9 = _mm256_add_pd(vec9, vec10);

                    // re-interleave and store the transformed elements of the pair row
                    tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                    vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                    vec7 = tmp;
                    _mm256_store_pd(element_pair + col_idx, vec7);
                    _mm256_store_pd(element_pair + col_idx + 4, vec9);
                }
            }

            // scalar tail for the columns not covered by the vectorized loop
            int remainder = input.cols % 4;
            if (remainder != 0) {

                for (int col_idx = input.cols - remainder; col_idx < input.cols; col_idx++) {
                    int index = row_offset + col_idx;
                    int index_pair = row_offset_pair + col_idx;
                    if ((current_idx_loc >> control_qbit) & 1) {

                        QGD_Complex16 element = input[index];
                        QGD_Complex16 element_pair = input[index_pair];

                        QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                        QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                        input[index].real = tmp1.real + tmp2.real;
                        input[index].imag = tmp1.imag + tmp2.imag;

                        tmp1 = mult(u3_1qbit1[2], element);
                        tmp2 = mult(u3_1qbit1[3], element_pair);

                        input[index_pair].real = tmp1.real + tmp2.real;
                        input[index_pair].imag = tmp1.imag + tmp2.imag;
                    }
                    else {
                        QGD_Complex16 element = input[index];
                        QGD_Complex16 element_pair = input[index_pair];

                        QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                        QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                        input[index].real = tmp1.real + tmp2.real;
                        input[index].imag = tmp1.imag + tmp2.imag;

                        tmp1 = mult(u3_1qbit2[2], element);
                        tmp2 = mult(u3_1qbit2[3], element_pair);

                        input[index_pair].real = tmp1.real + tmp2.real;
                        input[index_pair].imag = tmp1.imag + tmp2.imag;
                    }
                }

            }

        }

        current_idx = current_idx + (index_step_target << 1);

    }

}
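
/*
Usage sketch (hypothetical names; assumes `gate_kernel` is a 4x4 Matrix holding
the two-qubit gate kernel and `state` is a Matrix with cols == 1 holding the
state vector of the full register):

    std::vector<int> involved_qbits = {0, 2};  // lowest qubit index first
    apply_large_kernel_to_input(gate_kernel, state, involved_qbits, state.rows);
*/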