Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitaries into one- and two-qubit gates
apply_large_kernel_to_input.cpp
Go to the documentation of this file.
1 /*
2 Created on Fri Jun 26 14:13:26 2020
3 Copyright 2020 Peter Rakyta, Ph.D.
4 
5 Licensed under the Apache License, Version 2.0 (the "License");
6 you may not use this file except in compliance with the License.
7 You may obtain a copy of the License at
8 
9  http://www.apache.org/licenses/LICENSE-2.0
10 
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 
17 @author: Peter Rakyta, Ph.D.
18 */
25 #include "tbb/tbb.h"
26 
27 
/**
@brief Determine the TBB grain size to use for a parallel loop that strides by index_step.
The grain size scales inversely with the stride: 256/2^step for the smallest power of two
2^step (step=1..6) that is >= index_step, and 2 for strides larger than 64.
@param index_step The index stride (2^qbit) the gate kernel iterates with.
@return The grain size for the parallel loop.
*/
int get_grain_size(int index_step){
    int grain_size = 2;
    for (int step=1; step<7; step++){
        if (index_step <= (1<<step)){
            grain_size = 256/(1<<step);
            // BUGFIX: stop at the first (smallest) matching power of two. Without the
            // break every later iteration also matched and overwrote grain_size, so the
            // function collapsed to returning 4 for every index_step <= 64.
            break;
        }
    }
    return grain_size;
}
37 
38 void apply_large_kernel_to_input(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
39 
40  if (input.cols==1){
41  //switch((int)involved_qbits.size()){
42  //case 2:{
43  apply_2qbit_kernel_to_state_vector_input(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
44  /*}
45  case 3:{
46  apply_3qbit_kernel_to_state_vector_input(unitary,input,involved_qbits,matrix_size);
47  }
48  }*/
49  }
50  else
51  {
52  apply_2qbit_kernel_to_matrix_input(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
53  }
54 }
55 
56 void apply_2qbit_kernel_to_state_vector_input(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){
57 
58  int index_step_outer = 1 << outer_qbit;
59  int index_step_inner = 1 << inner_qbit;
60  int current_idx = 0;
61 
62  for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){
63 
64  for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
65 
66  for (int idx=0; idx<index_step_inner; idx++){
67 
68 
69  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
70  int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
71  int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
72  int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
73  int indexes[4] = {current_idx_outer_loc,current_idx_inner_loc,current_idx_outer_pair_loc,current_idx_inner_pair_loc};
74 
75  QGD_Complex16 element_outer = input[current_idx_outer_loc];
76  QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];
77  QGD_Complex16 element_inner = input[current_idx_inner_loc];
78  QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];
79 
80  QGD_Complex16 tmp1;
81  QGD_Complex16 tmp2;
82  QGD_Complex16 tmp3;
83  QGD_Complex16 tmp4;
84  for (int mult_idx=0; mult_idx<4; mult_idx++){
85 
86  tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
87  tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
88  tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
89  tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
90  input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
91  input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
92  }
93  }
94  }
95  current_idx = current_idx + (index_step_outer << 1);
96  }
97 
98 }
99 
100 void apply_2qbit_kernel_to_matrix_input(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){
101 
102  int index_step_outer = 1 << outer_qbit;
103  int index_step_inner = 1 << inner_qbit;
104  int current_idx = 0;
105 
106  for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){
107 
108  for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
109 
110  for (int idx=0; idx<index_step_inner; idx++){
111 
112  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
113  int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
114  int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
115  int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
116 
117  int row_offset_outer = current_idx_outer_loc*input.stride;
118  int row_offset_outer_pair = current_idx_outer_pair_loc*input.stride;
119  int row_offset_inner = current_idx_inner_loc*input.stride;
120  int row_offset_inner_pair = current_idx_inner_pair_loc*input.stride;
121  //input.print_matrix();
122  for ( int col_idx=0; col_idx<input.cols; col_idx++) {
123  int index_outer = row_offset_outer+col_idx;
124  int index_outer_pair = row_offset_outer_pair+col_idx;
125  int index_inner = row_offset_inner+col_idx;
126  int index_inner_pair = row_offset_inner_pair + col_idx;
127  int indexes[4] = {index_outer,index_inner,index_outer_pair,index_inner_pair};
128  QGD_Complex16 element_outer = input[index_outer];
129  QGD_Complex16 element_outer_pair = input[index_outer_pair];
130  QGD_Complex16 element_inner = input[index_inner];
131  QGD_Complex16 element_inner_pair = input[index_inner_pair];
132 
133  QGD_Complex16 tmp1;
134  QGD_Complex16 tmp2;
135  QGD_Complex16 tmp3;
136  QGD_Complex16 tmp4;
137  for (int mult_idx=0; mult_idx<4; mult_idx++){
138 
139  tmp1 = mult(two_qbit_unitary[mult_idx*4], element_outer);
140  tmp2 = mult(two_qbit_unitary[mult_idx*4 + 1], element_inner);
141  tmp3 = mult(two_qbit_unitary[mult_idx*4 + 2], element_outer_pair);
142  tmp4 = mult(two_qbit_unitary[mult_idx*4 + 3], element_inner_pair);
143  input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real;
144  input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag;
145  }
146  }
147  }
148  }
149  current_idx = current_idx + (index_step_outer << 1);
150  }
151 
152 }
153 
154 
155 void apply_3qbit_kernel_to_state_vector_input(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
156 
157  int index_step_inner = 1 << involved_qbits[0];
158  int index_step_middle = 1 << involved_qbits[1];
159  int index_step_outer = 1 << involved_qbits[2];
160  int current_idx = 0;
161 
162  for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<matrix_size; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){
163 
164  for (int current_idx_middle = 0; current_idx_middle < index_step_outer; current_idx_middle=current_idx_middle+(index_step_middle<<1)){
165 
166  for (int current_idx_inner = 0; current_idx_inner < index_step_middle; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
167 
168  for (int idx=0; idx<index_step_inner; idx++){
169 
170  int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
171  int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;
172 
173  int current_idx_outer_loc = current_idx_loc;
174  int current_idx_inner_loc = current_idx_loc + index_step_inner;
175 
176  int current_idx_middle_loc = current_idx_loc + index_step_middle;
177  int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;
178 
179  int current_idx_outer_pair_loc = current_idx_pair_loc;
180  int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;
181 
182  int current_idx_middle_pair_loc =current_idx_pair_loc + index_step_middle;
183  int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;
184 
185  int indexes[8] = {current_idx_outer_loc,current_idx_inner_loc,current_idx_middle_loc,current_idx_middle_inner_loc,current_idx_outer_pair_loc,current_idx_inner_pair_loc,current_idx_middle_pair_loc,current_idx_middle_inner_pair_loc};
186  //input.print_matrix();
187  QGD_Complex16 element_outer = input[current_idx_outer_loc];
188  QGD_Complex16 element_outer_pair = input[current_idx_outer_pair_loc];
189 
190  QGD_Complex16 element_inner = input[current_idx_inner_loc];
191  QGD_Complex16 element_inner_pair = input[current_idx_inner_pair_loc];
192 
193  QGD_Complex16 element_middle = input[current_idx_middle_loc];
194  QGD_Complex16 element_middle_pair = input[current_idx_middle_pair_loc];
195 
196  QGD_Complex16 element_middle_inner = input[current_idx_middle_inner_loc];
197  QGD_Complex16 element_middle_inner_pair = input[current_idx_middle_inner_pair_loc];
198 
199  QGD_Complex16 tmp1;
200  QGD_Complex16 tmp2;
201  QGD_Complex16 tmp3;
202  QGD_Complex16 tmp4;
203  QGD_Complex16 tmp5;
204  QGD_Complex16 tmp6;
205  QGD_Complex16 tmp7;
206  QGD_Complex16 tmp8;
207  for (int mult_idx=0; mult_idx<8; mult_idx++){
208  tmp1 = mult(unitary[mult_idx*8], element_outer);
209  tmp2 = mult(unitary[mult_idx*8 + 1], element_inner);
210  tmp3 = mult(unitary[mult_idx*8 + 2], element_middle);
211  tmp4 = mult(unitary[mult_idx*8 + 3], element_middle_inner);
212  tmp5 = mult(unitary[mult_idx*8 + 4], element_outer_pair);
213  tmp6 = mult(unitary[mult_idx*8 + 5], element_inner_pair);
214  tmp7 = mult(unitary[mult_idx*8 + 6], element_middle_pair);
215  tmp8 = mult(unitary[mult_idx*8 + 7], element_middle_inner_pair);
216  input[indexes[mult_idx]].real = tmp1.real + tmp2.real + tmp3.real + tmp4.real + tmp5.real + tmp6.real + tmp7.real + tmp8.real;
217  input[indexes[mult_idx]].imag = tmp1.imag + tmp2.imag + tmp3.imag + tmp4.imag + tmp5.imag + tmp6.imag + tmp7.imag + tmp8.imag;
218  }
219  }
220  }
221  current_idx = current_idx + (index_step_outer << 1);
222  }
223  }
224 }
225 
226 //u3_1qbit2 is for when the control qbit is 0
227 void
228 apply_crot_kernel_to_matrix_input(Matrix& u3_1qbit1, Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
229 
230  int index_step_target = 1 << target_qbit;
231  int current_idx = 0;
232 
233 
234  for ( int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {
235 
236  for(int idx=0; idx<index_step_target; idx++) {
237  //tbb::parallel_for(0, index_step_target, 1, [&](int idx) {
238 
239  int current_idx_loc = current_idx + idx;
240  int current_idx_pair_loc = current_idx_pair + idx;
241 
242  int row_offset = current_idx_loc*input.stride;
243  int row_offset_pair = current_idx_pair_loc*input.stride;
244  for ( int col_idx=0; col_idx<input.cols; col_idx++) {
245 
246  int index = row_offset+col_idx;
247  int index_pair = row_offset_pair+col_idx;
248  if ( (current_idx_loc >> control_qbit) & 1 ) {
249 
250 
251 
252  QGD_Complex16 element = input[index];
253  QGD_Complex16 element_pair = input[index_pair];
254 
255  QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
256  QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);
257 
258  input[index].real = tmp1.real + tmp2.real;
259  input[index].imag = tmp1.imag + tmp2.imag;
260 
261  tmp1 = mult(u3_1qbit1[2], element);
262  tmp2 = mult(u3_1qbit1[3], element_pair);
263 
264  input[index_pair].real = tmp1.real + tmp2.real;
265  input[index_pair].imag = tmp1.imag + tmp2.imag;
266 
267  }
268 
269  else {
270  QGD_Complex16 element = input[index];
271  QGD_Complex16 element_pair = input[index_pair];
272 
273  QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
274  QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);
275 
276  input[index].real = tmp1.real + tmp2.real;
277  input[index].imag = tmp1.imag + tmp2.imag;
278 
279  tmp1 = mult(u3_1qbit2[2], element);
280  tmp2 = mult(u3_1qbit2[3], element_pair);
281 
282  input[index_pair].real = tmp1.real + tmp2.real;
283  input[index_pair].imag = tmp1.imag + tmp2.imag;
284  }
285  }
286 
287 
288  //});
289  }
290 
291 
292  current_idx = current_idx + (index_step_target << 1);
293 
294 
295  }
296 
297 
298 
299 }
300 
/**
@brief AVX/FMA variant of apply_crot_kernel_to_matrix_input: applies one of two 2x2 kernels
to every pair of rows of input coupled through target_qbit, selected per row by the
control-qubit bit of the row index (u3_1qbit1 when the bit is 1, u3_1qbit2 when it is 0).
@param u3_1qbit1 2x2 kernel used when the control-qubit bit of the row index is set.
@param u3_1qbit2 2x2 kernel used when the control-qubit bit of the row index is 0.
@param input The matrix to be transformed in place.
@param target_qbit Index of the target qubit (defines the row-pairing stride).
@param control_qbit Index of the control qubit (selects the kernel per row).
@param matrix_size The number of rows iterated over.
*/
void
apply_crot_kernel_to_matrix_input_AVX(Matrix& u3_1qbit1, Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
    // _mm256_load_pd/_mm256_store_pd below require 32-byte aligned addresses
    // (presumably guaranteed by ensure_aligned() together with the row stride --
    // verify stride*sizeof(QGD_Complex16) is a multiple of 32)
    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;
    int current_idx = 0;

    // load elements of the first U3 unitary into 256bit registers (8 registers)
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);
    // load elements of the second U3 unitary into 256bit registers (8 registers)
    __m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
    __m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
    __m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
    __m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
    __m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
    __m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
    __m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
    __m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);


    // blocks of 2*index_step_target rows; the first half of each block pairs with the second
    for ( int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {


        for (int idx = 0; idx < index_step_target; idx++) {


            int current_idx_loc = current_idx + idx;
            int current_idx_pair_loc = current_idx_pair + idx;

            int row_offset = current_idx_loc * input.stride;
            int row_offset_pair = current_idx_pair_loc * input.stride;
            // vectorized part: 4 complex elements (8 doubles) per iteration; the bound
            // 2*(cols-3) guarantees the last full group of 4 complexes stays in range,
            // the remaining cols%4 columns are handled by the scalar loop below
            for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {
                double* element = (double*)input.get_data() + 2 * row_offset;
                double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;
                // the control-qubit bit of the row index selects which kernel is applied
                if ((current_idx_loc >> control_qbit) & 1) {


                    // extract successive elements from arrays element, element_pair
                    // (de-interleave: element_vec = 4 real parts, element_vec2 = 4 imag parts)
                    __m256d element_vec = _mm256_load_pd(element + col_idx);
                    __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                    __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                    element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                    element_vec = tmp;

                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                    __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                    tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                    element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                    element_pair_vec = tmp;

                    // complex multiply-accumulate: vec3 = real part, vec5 = imag part of
                    // u3[0]*element + u3[1]*element_pair (fnmadd subtracts the imag*imag term)
                    __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                    vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                    __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                    vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                    vec3 = _mm256_add_pd(vec3, vec4);
                    __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                    vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                    __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                    vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                    vec5 = _mm256_add_pd(vec5, vec6);

                    // re-interleave real/imag and store the transformed elements
                    tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                    vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                    vec3 = tmp;
                    _mm256_store_pd(element + col_idx, vec3);
                    _mm256_store_pd(element + col_idx + 4, vec5);

                    // same for the paired row: u3[2]*element + u3[3]*element_pair
                    __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                    vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                    __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                    vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                    vec7 = _mm256_add_pd(vec7, vec8);
                    __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                    vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                    __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                    vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                    vec9 = _mm256_add_pd(vec9, vec10);

                    // re-interleave real/imag and store the transformed paired elements
                    tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                    vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                    vec7 = tmp;
                    _mm256_store_pd(element_pair + col_idx, vec7);
                    _mm256_store_pd(element_pair + col_idx + 4, vec9);



                }
                else {

                    // control bit is 0: identical arithmetic with the second kernel

                    // extract successive elements from arrays element, element_pair
                    __m256d element_vec = _mm256_load_pd(element + col_idx);
                    __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                    __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                    element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                    element_vec = tmp;

                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                    __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                    tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                    element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                    element_pair_vec = tmp;

                    __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                    vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                    __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                    vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                    vec3 = _mm256_add_pd(vec3, vec4);
                    __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                    vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                    __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                    vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                    vec5 = _mm256_add_pd(vec5, vec6);

                    // re-interleave real/imag and store the transformed elements
                    tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                    vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                    vec3 = tmp;
                    _mm256_store_pd(element + col_idx, vec3);
                    _mm256_store_pd(element + col_idx + 4, vec5);

                    __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                    vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                    __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                    vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                    vec7 = _mm256_add_pd(vec7, vec8);
                    __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                    vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                    __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                    vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                    vec9 = _mm256_add_pd(vec9, vec10);

                    // re-interleave real/imag and store the transformed paired elements
                    tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                    vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                    vec7 = tmp;
                    _mm256_store_pd(element_pair + col_idx, vec7);
                    _mm256_store_pd(element_pair + col_idx + 4, vec9);
                }
            }

            // scalar remainder: the last cols%4 columns, same math as the non-AVX kernel
            int remainder = input.cols % 4;
            if (remainder != 0) {

                for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
                    int index = row_offset+col_idx;
                    int index_pair = row_offset_pair+col_idx;
                    if ( (current_idx_loc >> control_qbit) & 1 ) {



                        QGD_Complex16 element = input[index];
                        QGD_Complex16 element_pair = input[index_pair];

                        QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                        QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                        input[index].real = tmp1.real + tmp2.real;
                        input[index].imag = tmp1.imag + tmp2.imag;

                        tmp1 = mult(u3_1qbit1[2], element);
                        tmp2 = mult(u3_1qbit1[3], element_pair);

                        input[index_pair].real = tmp1.real + tmp2.real;
                        input[index_pair].imag = tmp1.imag + tmp2.imag;

                    }

                    else {
                        QGD_Complex16 element = input[index];
                        QGD_Complex16 element_pair = input[index_pair];

                        QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                        QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                        input[index].real = tmp1.real + tmp2.real;
                        input[index].imag = tmp1.imag + tmp2.imag;

                        tmp1 = mult(u3_1qbit2[2], element);
                        tmp2 = mult(u3_1qbit2[3], element_pair);

                        input[index_pair].real = tmp1.real + tmp2.real;
                        input[index_pair].imag = tmp1.imag + tmp2.imag;
                    }
                }

            }
            //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;


        }



        current_idx = current_idx + (index_step_target << 1);

    }


}
int stride
The column stride of the array. (The array elements in one row are a_0, a_1, ... a_{cols-1}, 0, 0, 0, 0. The number of zeros is stride-cols)
Definition: matrix_base.hpp:46
void apply_large_kernel_to_input(Matrix &unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
void apply_crot_kernel_to_matrix_input(Matrix &u3_1qbit1, Matrix &u3_1qbit2, Matrix &input, const int &target_qbit, const int &control_qbit, const int &matrix_size)
void ensure_aligned()
QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
Call to calculate the product of two complex scalars.
Definition: common.cpp:259
scalar * get_data() const
Call to get the pointer to the stored data.
int rows
The number of rows.
Definition: matrix_base.hpp:42
int cols
The number of columns.
Definition: matrix_base.hpp:44
matrix_size
[load Umtx]
Definition: example.py:58
void apply_2qbit_kernel_to_state_vector_input(Matrix &two_qbit_unitary, Matrix &input, const int &inner_qbit, const int &outer_qbit, const int &matrix_size)
Structure type representing complex numbers in the SQUANDER package.
Definition: QGDTypes.h:38
Class to store data of complex arrays and its properties.
Definition: matrix.h:38
void apply_crot_kernel_to_matrix_input_AVX(Matrix &u3_1qbit1, Matrix &u3_1qbit2, Matrix &input, const int &target_qbit, const int &control_qbit, const int &matrix_size)
double real
the real part of a complex number
Definition: QGDTypes.h:40
int get_grain_size(int index_step)
void apply_3qbit_kernel_to_state_vector_input(Matrix &unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
void apply_2qbit_kernel_to_matrix_input(Matrix &two_qbit_unitary, Matrix &input, const int &inner_qbit, const int &outer_qbit, const int &matrix_size)
double imag
the imaginary part of a complex number
Definition: QGDTypes.h:42