Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitaries into one- and two-qubit gates
apply_kernel_to_input_AVX.cpp
/*
Created on Fri Jun 26 14:13:26 2020
Copyright 2020 Peter Rakyta, Ph.D.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@author: Peter Rakyta, Ph.D.
*/
#include <immintrin.h>
#include <cstring>   // memset
#include "tbb/tbb.h"

/**
@brief AVX kernel to apply a single-qubit gate on an input matrix (efficient for small inputs).
*/
void
apply_kernel_to_input_AVX_small(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {

    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;
    int current_idx = 0;

    // load elements of the U3 unitary into 256-bit registers (4 registers)
    __m128d* u3_1qubit_tmp = (__m128d*) & u3_1qbit[0];
    __m256d u3_1qbit_00_vec = _mm256_broadcast_pd(u3_1qubit_tmp);

    u3_1qubit_tmp = (__m128d*) & u3_1qbit[1];
    __m256d u3_1qbit_01_vec = _mm256_broadcast_pd(u3_1qubit_tmp);

    u3_1qubit_tmp = (__m128d*) & u3_1qbit[2];
    __m256d u3_1qbit_10_vec = _mm256_broadcast_pd(u3_1qubit_tmp);

    u3_1qubit_tmp = (__m128d*) & u3_1qbit[3];
    __m256d u3_1qbit_11_vec = _mm256_broadcast_pd(u3_1qubit_tmp);
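
    // Each QGD_Complex16 element is a (real, imag) pair of doubles, i.e. 128 bits;
    // _mm256_broadcast_pd copies one gate element into both 128-bit lanes, so every
    // 256-bit register above acts on two consecutive complex matrix elements at once.
    // Rows are then processed in pairs: current_idx walks the rows whose target-qubit
    // bit is 0, while current_idx_pair addresses the partner rows with that bit set.
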
    for ( int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {

        for (int idx = 0; idx < index_step_target; idx++) {
            //tbb::parallel_for(0, index_step_target, 1, [&](int idx) {

            int current_idx_loc = current_idx + idx;
            int current_idx_pair_loc = current_idx_pair + idx;

            int row_offset = current_idx_loc * input.stride;
            int row_offset_pair = current_idx_pair_loc * input.stride;
            if (control_qbit < 0 || ((current_idx_loc >> control_qbit) & 1)) {

                double* element = (double*)input.get_data() + 2 * row_offset;
                double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0); // 5th register
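
                // The 5-step pattern below evaluates complex products with hsub:
                // for u = ur + i*ui and x = xr + i*xi, u*x = (ur*xr - ui*xi) + i*(ur*xi + ui*xr).
                // With vec3 = [ur*xr, ui*xi, ...] and vec4 = [ur*xi, -ui*xr, ...],
                // _mm256_hsub_pd(vec3, vec4) = [ur*xr - ui*xi, ur*xi + ui*xr, ...],
                // i.e. real and imaginary parts interleaved exactly as in the input layout.
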
                for (int col_idx = 0; col_idx < 2 * (input.cols - 1); col_idx = col_idx + 4) {

                    // extract successive elements from arrays element, element_pair
                    __m256d element_vec = _mm256_load_pd(element + col_idx); // 6th register
                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx); // 7th register

                    // 1 calculate the multiplications u3_1qbit_00*element_vec
                    __m256d vec3 = _mm256_mul_pd(u3_1qbit_00_vec, element_vec); // 8th register

                    // 2 switch the real and imaginary elements of element_vec
                    __m256d element_vec_permuted = _mm256_permute_pd(element_vec, 0x5); // 9th register

                    // 3 negate the odd-position elements of element_vec_permuted (the original real parts)
                    element_vec_permuted = _mm256_mul_pd(element_vec_permuted, neg);

                    // 4 multiply the elements of u3_1qbit_00 and element_vec_permuted
                    __m256d vec4 = _mm256_mul_pd(u3_1qbit_00_vec, element_vec_permuted);

                    // 5 horizontally subtract the elements in vec3 and vec4
                    vec3 = _mm256_hsub_pd(vec3, vec4);

                    // 1 calculate the multiplications u3_1qbit_01*element_pair_vec
                    __m256d vec5 = _mm256_mul_pd(u3_1qbit_01_vec, element_pair_vec); // 10th register

                    // 2 switch the real and imaginary elements of element_pair_vec
                    __m256d element_pair_vec_permuted = _mm256_permute_pd(element_pair_vec, 0x5); // 11th register

                    // 3 negate the odd-position elements of element_pair_vec_permuted
                    element_pair_vec_permuted = _mm256_mul_pd(element_pair_vec_permuted, neg);

                    // 4 multiply the elements of u3_1qbit_01 and element_pair_vec_permuted
                    vec4 = _mm256_mul_pd(u3_1qbit_01_vec, element_pair_vec_permuted);

                    // 5 horizontally subtract the elements in vec5 and vec4
                    vec5 = _mm256_hsub_pd(vec5, vec4);

                    vec3 = _mm256_add_pd(vec3, vec5);

                    // 6 store the transformed elements of the first row
                    _mm256_store_pd(element + col_idx, vec3);

                    // 1 calculate the multiplications u3_1qbit_10*element_vec
                    vec3 = _mm256_mul_pd(u3_1qbit_10_vec, element_vec);

                    // 4 multiply the elements of u3_1qbit_10 and element_vec_permuted
                    vec4 = _mm256_mul_pd(u3_1qbit_10_vec, element_vec_permuted);

                    // 5 horizontally subtract the elements in vec3 and vec4
                    vec3 = _mm256_hsub_pd(vec3, vec4);

                    // 1 calculate the multiplications u3_1qbit_11*element_pair_vec
                    vec5 = _mm256_mul_pd(u3_1qbit_11_vec, element_pair_vec);

                    // 4 multiply the elements of u3_1qbit_11 and element_pair_vec_permuted
                    vec4 = _mm256_mul_pd(u3_1qbit_11_vec, element_pair_vec_permuted);

                    // 5 horizontally subtract the elements in vec5 and vec4
                    vec5 = _mm256_hsub_pd(vec5, vec4);

                    vec3 = _mm256_add_pd(vec3, vec5);

                    // 6 store the transformed elements of the pair row
                    _mm256_store_pd(element_pair + col_idx, vec3);

                }

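                // if the number of columns is odd, one column remains;
                // transform it with scalar complex arithmetic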
                if (input.cols % 2 == 1) {

                    int col_idx = input.cols - 1;

                    int index = row_offset + col_idx;
                    int index_pair = row_offset_pair + col_idx;

                    QGD_Complex16 element = input[index];
                    QGD_Complex16 element_pair = input[index_pair];

                    QGD_Complex16 tmp1 = mult(u3_1qbit[0], element);
                    QGD_Complex16 tmp2 = mult(u3_1qbit[1], element_pair);

                    input[index].real = tmp1.real + tmp2.real;
                    input[index].imag = tmp1.imag + tmp2.imag;

                    tmp1 = mult(u3_1qbit[2], element);
                    tmp2 = mult(u3_1qbit[3], element_pair);

                    input[index_pair].real = tmp1.real + tmp2.real;
                    input[index_pair].imag = tmp1.imag + tmp2.imag;

                }

            }
            else if (deriv) {
                // when calculating derivatives the gate matrix holds the derivative of the
                // gate kernel, so rows on which the controlled gate acts trivially must be
                // zeroed instead of being left unchanged
                memset(input.get_data() + row_offset, 0, input.cols * sizeof(QGD_Complex16));
                memset(input.get_data() + row_offset_pair, 0, input.cols * sizeof(QGD_Complex16));
            }
            else {
                // leave the state as it is
                continue;
            }

            //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;

            //});
        }

        current_idx = current_idx + (index_step_target << 1);

    }

}

/**
@brief AVX kernel to apply a single-qubit gate on an input matrix (single threaded).
*/
void
apply_kernel_to_input_AVX(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {

    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;
    int current_idx = 0;

    // load elements of the U3 unitary into 256-bit registers (8 registers)
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit[3].imag);
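
    // In contrast to the _small kernel, the real and imaginary parts of each gate
    // element are broadcast into separate registers: after de-interleaving the data,
    // each complex multiplication becomes plain mul/fmadd/fnmadd FMA arithmetic,
    // avoiding the permute and hsub steps used above.
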
    for ( int current_idx_pair=current_idx + index_step_target; current_idx_pair<matrix_size; current_idx_pair=current_idx_pair+(index_step_target << 1) ) {

        for (int idx = 0; idx < index_step_target; idx++) {

            int current_idx_loc = current_idx + idx;
            int current_idx_pair_loc = current_idx_pair + idx;

            int row_offset = current_idx_loc * input.stride;
            int row_offset_pair = current_idx_pair_loc * input.stride;

            if (control_qbit < 0 || ((current_idx_loc >> control_qbit) & 1)) {

                double* element = (double*)input.get_data() + 2 * row_offset;
                double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                    // extract successive elements from arrays element, element_pair
                    __m256d element_vec = _mm256_load_pd(element + col_idx);
                    __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                    __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                    element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                    element_vec = tmp;

                    __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                    __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                    tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                    element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                    element_pair_vec = tmp;

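                    // the shuffles above split four complex numbers into registers of
                    // real parts (element_vec, element_pair_vec) and imaginary parts
                    // (element_vec2, element_pair_vec2); the order is lane-interleaved,
                    // and the matching shuffles before the stores undo it

                    // complex FMA arithmetic: Re(u*x) = ur*xr - ui*xi (mul + fnmadd),
                    // Im(u*x) = ur*xi + ui*xr (mul + fmadd)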
                    __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                    vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                    __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                    vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                    vec3 = _mm256_add_pd(vec3, vec4);
                    __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                    vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                    __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                    vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                    vec5 = _mm256_add_pd(vec5, vec6);

                    // store the transformed elements of the first row (re-interleaving vec3 and vec5)
                    tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                    vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                    vec3 = tmp;
                    _mm256_store_pd(element + col_idx, vec3);
                    _mm256_store_pd(element + col_idx + 4, vec5);

                    __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                    vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                    __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                    vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                    vec7 = _mm256_add_pd(vec7, vec8);
                    __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                    vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                    __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                    vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                    vec9 = _mm256_add_pd(vec9, vec10);

                    // store the transformed elements of the pair row (re-interleaving vec7 and vec9)
                    tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                    vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                    vec7 = tmp;
                    _mm256_store_pd(element_pair + col_idx, vec7);
                    _mm256_store_pd(element_pair + col_idx + 4, vec9);
                }

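                // transform the up to three columns that do not fill a whole
                // 4-element vector with scalar complex arithmetic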
                int remainder = input.cols % 4;
                if (remainder != 0) {

                    for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
                        int index = row_offset + col_idx;
                        int index_pair = row_offset_pair + col_idx;

                        QGD_Complex16 element = input[index];
                        QGD_Complex16 element_pair = input[index_pair];

                        QGD_Complex16 tmp1 = mult(u3_1qbit[0], element);
                        QGD_Complex16 tmp2 = mult(u3_1qbit[1], element_pair);

                        input[index].real = tmp1.real + tmp2.real;
                        input[index].imag = tmp1.imag + tmp2.imag;

                        tmp1 = mult(u3_1qbit[2], element);
                        tmp2 = mult(u3_1qbit[3], element_pair);

                        input[index_pair].real = tmp1.real + tmp2.real;
                        input[index_pair].imag = tmp1.imag + tmp2.imag;
                    }

                }

            }
            else if (deriv) {
                // when calculating derivatives the gate matrix holds the derivative of the
                // gate kernel, so rows on which the controlled gate acts trivially must be
                // zeroed instead of being left unchanged
                memset(input.get_data() + row_offset, 0, input.cols * sizeof(QGD_Complex16));
                memset(input.get_data() + row_offset_pair, 0, input.cols * sizeof(QGD_Complex16));
            }
            else {
                // leave the state as it is
                continue;
            }

            //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;

        }

        current_idx = current_idx + (index_step_target << 1);

    }

}

/**
@brief Parallel AVX kernel to apply a single-qubit gate on an input matrix.
*/
void
apply_kernel_to_input_AVX_parallel(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {

    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;

    // load elements of the U3 unitary into 256-bit registers (8 registers)
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit[3].imag);

    int parallel_outer_cycles = matrix_size/(index_step_target << 1);
    int outer_grain_size;
    if ( index_step_target <= 2 ) {
        outer_grain_size = 32;
    }
    else if ( index_step_target <= 4 ) {
        outer_grain_size = 16;
    }
    else if ( index_step_target <= 8 ) {
        outer_grain_size = 8;
    }
    else if ( index_step_target <= 16 ) {
        outer_grain_size = 4;
    }
    else {
        outer_grain_size = 2;
    }
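
    // heuristic grain sizes: a small index_step_target implies many short outer
    // iterations, so a larger grain keeps the TBB scheduling overhead low
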
    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin()*(index_step_target << 1);
        int current_idx_pair = index_step_target + r.begin()*(index_step_target << 1);
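
        // two-level parallelism: the outer range walks blocks of row pairs, while
        // the inner parallel_for below splits the rows within one block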

        for (int rdx=r.begin(); rdx<r.end(); rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0,index_step_target,32), [&](tbb::blocked_range<int> r_inner) {
            for (int idx=r_inner.begin(); idx<r_inner.end(); ++idx) {

                int current_idx_loc = current_idx + idx;
                int current_idx_pair_loc = current_idx_pair + idx;

                int row_offset = current_idx_loc * input.stride;
                int row_offset_pair = current_idx_pair_loc * input.stride;

                if (control_qbit < 0 || ((current_idx_loc >> control_qbit) & 1)) {

                    double* element = (double*)input.get_data() + 2 * row_offset;
                    double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                    for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                        // extract successive elements from arrays element, element_pair
                        __m256d element_vec = _mm256_load_pd(element + col_idx);
                        __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                        __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                        element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                        element_vec = tmp;

                        __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                        __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                        tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                        element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                        element_pair_vec = tmp;

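                        // as in apply_kernel_to_input_AVX: the shuffles split four complex
                        // numbers into registers of real and imaginary parts, and the
                        // mul/fnmadd/fmadd sequences below compute Re(u*x) = ur*xr - ui*xi
                        // and Im(u*x) = ur*xi + ui*xr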
                        __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                        vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                        __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                        vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                        vec3 = _mm256_add_pd(vec3, vec4);
                        __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                        vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                        __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                        vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                        vec5 = _mm256_add_pd(vec5, vec6);

                        // store the transformed elements of the first row (re-interleaving vec3 and vec5)
                        tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                        vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                        vec3 = tmp;
                        _mm256_store_pd(element + col_idx, vec3);
                        _mm256_store_pd(element + col_idx + 4, vec5);

                        __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                        vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                        __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                        vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                        vec7 = _mm256_add_pd(vec7, vec8);
                        __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                        vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                        __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                        vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                        vec9 = _mm256_add_pd(vec9, vec10);

                        // store the transformed elements of the pair row (re-interleaving vec7 and vec9)
                        tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                        vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                        vec7 = tmp;
                        _mm256_store_pd(element_pair + col_idx, vec7);
                        _mm256_store_pd(element_pair + col_idx + 4, vec9);
                    }

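                    // transform the up to three leftover columns with scalar complex arithmetic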
                    int remainder = input.cols % 4;
                    if (remainder != 0) {

                        for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
                            int index = row_offset + col_idx;
                            int index_pair = row_offset_pair + col_idx;

                            QGD_Complex16 element = input[index];
                            QGD_Complex16 element_pair = input[index_pair];

                            QGD_Complex16 tmp1 = mult(u3_1qbit[0], element);
                            QGD_Complex16 tmp2 = mult(u3_1qbit[1], element_pair);

                            input[index].real = tmp1.real + tmp2.real;
                            input[index].imag = tmp1.imag + tmp2.imag;

                            tmp1 = mult(u3_1qbit[2], element);
                            tmp2 = mult(u3_1qbit[3], element_pair);

                            input[index_pair].real = tmp1.real + tmp2.real;
                            input[index_pair].imag = tmp1.imag + tmp2.imag;
                        }

                    }

                }
                else if (deriv) {
                    // when calculating derivatives the gate matrix holds the derivative of
                    // the gate kernel, so rows on which the controlled gate acts trivially
                    // must be zeroed instead of being left unchanged
                    memset(input.get_data() + row_offset, 0, input.cols * sizeof(QGD_Complex16));
                    memset(input.get_data() + row_offset_pair, 0, input.cols * sizeof(QGD_Complex16));
                }
                else {
                    // leave the state as it is
                    continue;
                }

                //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;

            }
            });

            current_idx = current_idx + (index_step_target << 1);
            current_idx_pair = current_idx_pair + (index_step_target << 1);

        }
    });

}

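A minimal calling sketch for orientation. The row-major gate layout (u3_1qbit[0..3] = u00, u01, u10, u11) follows the indexing used in the kernels above; the Matrix(rows, cols) constructor and contiguous storage of the 2x2 gate are assumptions based on matrix.h, and all values are purely illustrative:

// hypothetical usage sketch, not part of this file
Matrix u3_1qbit(2, 2);                              // 2x2 single-qubit gate
u3_1qbit[0].real = 0.0;  u3_1qbit[0].imag = 0.0;    // u00
u3_1qbit[1].real = 1.0;  u3_1qbit[1].imag = 0.0;    // u01
u3_1qbit[2].real = 1.0;  u3_1qbit[2].imag = 0.0;    // u10
u3_1qbit[3].real = 0.0;  u3_1qbit[3].imag = 0.0;    // u11  (a Pauli-X gate)

Matrix input(8, 8);                                 // 3-qubit operator, matrix_size = 8
// ... fill input with the matrix to be transformed ...

// apply X on target qubit 0, controlled by qubit 2, without derivative semantics
apply_kernel_to_input_AVX(u3_1qbit, input, false, 0, 2, 8);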