Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitarias into one- and two-qubit gates gates
apply_kernel_to_state_vector_input_AVX.cpp
Go to the documentation of this file.
1 /*
2 Created on Fri Jun 26 14:13:26 2020
3 Copyright 2020 Peter Rakyta, Ph.D.
4 
5 Licensed under the Apache License, Version 2.0 (the "License");
6 you may not use this file except in compliance with the License.
7 You may obtain a copy of the License at
8 
9  http://www.apache.org/licenses/LICENSE-2.0
10 
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 
17 @author: Peter Rakyta, Ph.D.
18 */
25 #include <immintrin.h>
26 #include "tbb/tbb.h"
27 
28 
29 
30 
40 void
41 apply_kernel_to_state_vector_input_AVX(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
42 
43 
44  int index_step_target = 1 << target_qbit;
45  int current_idx = 0;
46 
47  unsigned int bitmask_low = (1 << target_qbit) - 1;
48  unsigned int bitmask_high = ~bitmask_low;
49 
50  int control_qbit_step_index = (1<<control_qbit);
51 
52  if ( control_qbit == 0 ) {
53 
54  for (int idx=0; idx<matrix_size/2; idx++ ) {
55 
56  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
57  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
58 
59  // the index pair with target qubit state 1
60  int current_idx_pair = current_idx | (1<<target_qbit);
61 
62  if ( current_idx & control_qbit_step_index ) {
63 
64  QGD_Complex16 element = input[current_idx];
65  QGD_Complex16 element_pair = input[current_idx_pair];
66 
67 
68  QGD_Complex16&& tmp1 = mult(u3_1qbit[0], element);
69  QGD_Complex16&& tmp2 = mult(u3_1qbit[1], element_pair);
70 
71  input[current_idx].real = tmp1.real + tmp2.real;
72  input[current_idx].imag = tmp1.imag + tmp2.imag;
73 
74  QGD_Complex16&& tmp3 = mult(u3_1qbit[2], element);
75  QGD_Complex16&& tmp4 = mult(u3_1qbit[3], element_pair);
76 
77  input[current_idx_pair].real = tmp3.real + tmp4.real;
78  input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
79 
80 
81 
82  }
83  else if (deriv) {
84  // when calculating derivatives, the constant element should be zeros
85  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
86  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
87  }
88  else {
89  // leave the state as it is
90  continue;
91  }
92 
93 
94  }
95 
96  }
97  else if ( target_qbit == 0 ) {
98 
99 /*
100 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
101 
102 under MIT License
103 
104 Copyright (c) 2018 Qulacs Authors
105 
106 Permission is hereby granted, free of charge, to any person obtaining a copy
107 of this software and associated documentation files (the "Software"), to deal
108 in the Software without restriction, including without limitation the rights
109 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
110 copies of the Software, and to permit persons to whom the Software is
111 furnished to do so, subject to the following conditions:
112 
113 The above copyright notice and this permission notice shall be included in all
114 copies or substantial portions of the Software.
115 */
116 
117  __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
118  __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
119  __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
120  __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
121 
122  for (int idx=0; idx<matrix_size/2; idx++ ) {
123 
124  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
125  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
126 
127  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
128 
129 
130  double *ptr = (double*)input.get_data() + 2 * current_idx;
131  __m256d data = _mm256_loadu_pd(ptr);
132 
133  __m256d data_u0 = _mm256_mul_pd(data, mv00);
134  __m256d data_u1 = _mm256_mul_pd(data, mv01);
135  __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
136  data_u2 = _mm256_permute4x64_pd(data_u2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
137 
138  __m256d data_d0 = _mm256_mul_pd(data, mv20);
139  __m256d data_d1 = _mm256_mul_pd(data, mv21);
140  __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
141  data_d2 = _mm256_permute4x64_pd(data_d2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
142 
143  __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
144 
145  data_r = _mm256_permute4x64_pd(data_r, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
146  _mm256_storeu_pd(ptr, data_r);
147 
148  }
149  else if (deriv) {
150  // when calculating derivatives, the constant element should be zeros
151  memset(input.get_data() + current_idx, 0.0, input.cols * 2 *sizeof(QGD_Complex16));
152  }
153  else {
154  // leave the state as it is
155  continue;
156  }
157 
158 
159  }
160 
161  }
162  else {
163 
164 
165 /*
166 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
167 
168 under MIT License
169 
170 Copyright (c) 2018 Qulacs Authors
171 
172 Permission is hereby granted, free of charge, to any person obtaining a copy
173 of this software and associated documentation files (the "Software"), to deal
174 in the Software without restriction, including without limitation the rights
175 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
176 copies of the Software, and to permit persons to whom the Software is
177 furnished to do so, subject to the following conditions:
178 
179 The above copyright notice and this permission notice shall be included in all
180 copies or substantial portions of the Software.
181 */
182 
183  __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
184  __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
185  __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
186  __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[1].real, u3_1qbit[1].imag);
187  __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
188  __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
189  __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
190  __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[3].real, u3_1qbit[3].imag);
191 
192 
193  for (int idx=0; idx<matrix_size/2; idx=idx+2 ) {
194 
195  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
196  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
197 
198  // the index pair with target qubit state 1
199  int current_idx_pair = current_idx | (1<<target_qbit);
200 
201  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
202 
203 
204  double* element = (double*)input.get_data() + 2 * current_idx;
205  double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;
206 
207 
208  __m256d data0 = _mm256_loadu_pd(element);
209  __m256d data1 = _mm256_loadu_pd(element_pair);
210 
211  __m256d data_u2 = _mm256_mul_pd(data0, mv00);
212  __m256d data_u3 = _mm256_mul_pd(data1, mv10);
213  __m256d data_u4 = _mm256_mul_pd(data0, mv01);
214  __m256d data_u5 = _mm256_mul_pd(data1, mv11);
215 
216  __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
217  __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);
218 
219  __m256d data_d2 = _mm256_mul_pd(data0, mv20);
220  __m256d data_d3 = _mm256_mul_pd(data1, mv30);
221  __m256d data_d4 = _mm256_mul_pd(data0, mv21);
222  __m256d data_d5 = _mm256_mul_pd(data1, mv31);
223 
224  __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
225  __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);
226 
227  __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
228  __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);
229 
230  _mm256_storeu_pd(element, data_r0);
231  _mm256_storeu_pd(element_pair, data_r1);
232 
233 
234  }
235  else if (deriv) {
236  // when calculating derivatives, the constant element should be zeros
237  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
238  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
239  }
240  else {
241  // leave the state as it is
242  continue;
243  }
244 
245 
246  //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;
247 
248 
249 
250  }
251 
252 
253 
254  } // else
255 
256 }
257 
258 
259 
260 
261 
271 void
272 apply_kernel_to_state_vector_input_parallel_OpenMP_AVX(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
273 
274 
275  int index_step_target = 1 << target_qbit;
276  int current_idx = 0;
277 
278  unsigned int bitmask_low = (1 << target_qbit) - 1;
279  unsigned int bitmask_high = ~bitmask_low;
280 
281  int control_qbit_step_index = (1<<control_qbit);
282 
283  if ( control_qbit == 0 ) {
284 #pragma omp parallel for
285  for (int idx=0; idx<matrix_size/2; idx++ ) {
286 
287  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
288  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
289 
290  // the index pair with target qubit state 1
291  int current_idx_pair = current_idx | (1<<target_qbit);
292 
293  if ( current_idx & control_qbit_step_index ) {
294 
295  QGD_Complex16 element = input[current_idx];
296  QGD_Complex16 element_pair = input[current_idx_pair];
297 
298 
299  QGD_Complex16&& tmp1 = mult(u3_1qbit[0], element);
300  QGD_Complex16&& tmp2 = mult(u3_1qbit[1], element_pair);
301 
302  input[current_idx].real = tmp1.real + tmp2.real;
303  input[current_idx].imag = tmp1.imag + tmp2.imag;
304 
305  QGD_Complex16&& tmp3 = mult(u3_1qbit[2], element);
306  QGD_Complex16&& tmp4 = mult(u3_1qbit[3], element_pair);
307 
308  input[current_idx_pair].real = tmp3.real + tmp4.real;
309  input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
310 
311 
312 
313  }
314  else if (deriv) {
315  // when calculating derivatives, the constant element should be zeros
316  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
317  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
318  }
319  else {
320  // leave the state as it is
321  continue;
322  }
323 
324 
325  }
326 
327  }
328  else if ( target_qbit == 0 ) {
329 
330 /*
331 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
332 
333 under MIT License
334 
335 Copyright (c) 2018 Qulacs Authors
336 
337 Permission is hereby granted, free of charge, to any person obtaining a copy
338 of this software and associated documentation files (the "Software"), to deal
339 in the Software without restriction, including without limitation the rights
340 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
341 copies of the Software, and to permit persons to whom the Software is
342 furnished to do so, subject to the following conditions:
343 
344 The above copyright notice and this permission notice shall be included in all
345 copies or substantial portions of the Software.
346 */
347 
348  __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
349  __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
350  __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
351  __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
352 
353 #pragma omp parallel for
354  for (int idx=0; idx<matrix_size/2; idx++ ) {
355 
356  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
357  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
358 
359  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
360 
361 
362  double *ptr = (double*)input.get_data() + 2 * current_idx;
363  __m256d data = _mm256_loadu_pd(ptr);
364 
365  __m256d data_u0 = _mm256_mul_pd(data, mv00);
366  __m256d data_u1 = _mm256_mul_pd(data, mv01);
367  __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
368  data_u2 = _mm256_permute4x64_pd(data_u2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
369 
370  __m256d data_d0 = _mm256_mul_pd(data, mv20);
371  __m256d data_d1 = _mm256_mul_pd(data, mv21);
372  __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
373  data_d2 = _mm256_permute4x64_pd(data_d2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
374 
375  __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
376 
377  data_r = _mm256_permute4x64_pd(data_r, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
378  _mm256_storeu_pd(ptr, data_r);
379 
380  }
381  else if (deriv) {
382  // when calculating derivatives, the constant element should be zeros
383  memset(input.get_data() + current_idx, 0.0, input.cols * 2 *sizeof(QGD_Complex16));
384  }
385  else {
386  // leave the state as it is
387  continue;
388  }
389 
390 
391  }
392 
393  }
394  else {
395 
396 
397 /*
398 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
399 
400 under MIT License
401 
402 Copyright (c) 2018 Qulacs Authors
403 
404 Permission is hereby granted, free of charge, to any person obtaining a copy
405 of this software and associated documentation files (the "Software"), to deal
406 in the Software without restriction, including without limitation the rights
407 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
408 copies of the Software, and to permit persons to whom the Software is
409 furnished to do so, subject to the following conditions:
410 
411 The above copyright notice and this permission notice shall be included in all
412 copies or substantial portions of the Software.
413 */
414 
415  __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
416  __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
417  __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
418  __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[1].real, u3_1qbit[1].imag);
419  __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
420  __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
421  __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
422  __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[3].real, u3_1qbit[3].imag);
423 
424 #pragma omp parallel for
425  for (int idx=0; idx<matrix_size/2; idx=idx+2 ) {
426 
427  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
428  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
429 
430  // the index pair with target qubit state 1
431  int current_idx_pair = current_idx | (1<<target_qbit);
432 
433  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
434 
435 
436  double* element = (double*)input.get_data() + 2 * current_idx;
437  double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;
438 
439 
440  __m256d data0 = _mm256_loadu_pd(element);
441  __m256d data1 = _mm256_loadu_pd(element_pair);
442 
443  __m256d data_u2 = _mm256_mul_pd(data0, mv00);
444  __m256d data_u3 = _mm256_mul_pd(data1, mv10);
445  __m256d data_u4 = _mm256_mul_pd(data0, mv01);
446  __m256d data_u5 = _mm256_mul_pd(data1, mv11);
447 
448  __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
449  __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);
450 
451  __m256d data_d2 = _mm256_mul_pd(data0, mv20);
452  __m256d data_d3 = _mm256_mul_pd(data1, mv30);
453  __m256d data_d4 = _mm256_mul_pd(data0, mv21);
454  __m256d data_d5 = _mm256_mul_pd(data1, mv31);
455 
456  __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
457  __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);
458 
459  __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
460  __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);
461 
462  _mm256_storeu_pd(element, data_r0);
463  _mm256_storeu_pd(element_pair, data_r1);
464 
465 
466  }
467  else if (deriv) {
468  // when calculating derivatives, the constant element should be zeros
469  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
470  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
471  }
472  else {
473  // leave the state as it is
474  continue;
475  }
476 
477 
478  //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;
479 
480 
481 
482  }
483 
484 
485 
486  } // else
487 
488 }
489 
490 
491 
492 
493 
503 void
504 apply_kernel_to_state_vector_input_parallel_AVX(Matrix& u3_1qbit, Matrix& input, const bool& deriv, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
505 
506 
507  int grain_size = 64;
508 
509  unsigned int bitmask_low = (1 << target_qbit) - 1;
510  unsigned int bitmask_high = ~bitmask_low;
511 
512  int control_qbit_step_index = (1<<control_qbit);
513 
514  tbb::affinity_partitioner aff_p;
515 
516  if ( control_qbit == 0 ) {
517  tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {
518 
519  for (int idx=r.begin(); idx<r.end(); idx++) {
520 
521 
522  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
523  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
524 
525  // the index pair with target qubit state 1
526  int current_idx_pair = current_idx | (1<<target_qbit);
527 
528  if ( current_idx & control_qbit_step_index ) {
529 
530  QGD_Complex16 element = input[current_idx];
531  QGD_Complex16 element_pair = input[current_idx_pair];
532 
533 
534  QGD_Complex16&& tmp1 = mult(u3_1qbit[0], element);
535  QGD_Complex16&& tmp2 = mult(u3_1qbit[1], element_pair);
536 
537  input[current_idx].real = tmp1.real + tmp2.real;
538  input[current_idx].imag = tmp1.imag + tmp2.imag;
539 
540  QGD_Complex16&& tmp3 = mult(u3_1qbit[2], element);
541  QGD_Complex16&& tmp4 = mult(u3_1qbit[3], element_pair);
542 
543  input[current_idx_pair].real = tmp3.real + tmp4.real;
544  input[current_idx_pair].imag = tmp3.imag + tmp4.imag;
545 
546  }
547  else if (deriv) {
548  // when calculating derivatives, the constant element should be zeros
549  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
550  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
551  }
552  else {
553  // leave the state as it is
554  continue;
555  }
556 
557 
558  //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;
559 
560 
561  }
562  }, aff_p);
563 
564 
565  }
566  else if ( target_qbit == 0 ) {
567 
568 /*
569 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
570 
571 under MIT License
572 
573 Copyright (c) 2018 Qulacs Authors
574 
575 Permission is hereby granted, free of charge, to any person obtaining a copy
576 of this software and associated documentation files (the "Software"), to deal
577 in the Software without restriction, including without limitation the rights
578 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
579 copies of the Software, and to permit persons to whom the Software is
580 furnished to do so, subject to the following conditions:
581 
582 The above copyright notice and this permission notice shall be included in all
583 copies or substantial portions of the Software.
584 */
585 
586  __m256d mv00 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
587  __m256d mv01 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
588  __m256d mv20 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
589  __m256d mv21 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
590 
591  tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {
592 
593  for (int idx=r.begin(); idx<r.end(); idx++) {
594 
595  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
596  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
597 
598  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
599 
600 
601  double *ptr = (double*)input.get_data() + 2 * current_idx;
602  __m256d data = _mm256_loadu_pd(ptr);
603 
604  __m256d data_u0 = _mm256_mul_pd(data, mv00);
605  __m256d data_u1 = _mm256_mul_pd(data, mv01);
606  __m256d data_u2 = _mm256_hadd_pd(data_u0, data_u1);
607  data_u2 = _mm256_permute4x64_pd(data_u2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
608 
609  __m256d data_d0 = _mm256_mul_pd(data, mv20);
610  __m256d data_d1 = _mm256_mul_pd(data, mv21);
611  __m256d data_d2 = _mm256_hadd_pd(data_d0, data_d1);
612  data_d2 = _mm256_permute4x64_pd(data_d2, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
613 
614  __m256d data_r = _mm256_hadd_pd(data_u2, data_d2);
615 
616  data_r = _mm256_permute4x64_pd(data_r, 216); // (3210) -> (3120) : 1*0 + 4*2 + 16*1 + 64*3 = 216
617  _mm256_storeu_pd(ptr, data_r);
618 
619  }
620  else if (deriv) {
621  // when calculating derivatives, the constant element should be zeros
622  memset(input.get_data() + current_idx, 0.0, input.cols * 2 *sizeof(QGD_Complex16));
623  }
624  else {
625  // leave the state as it is
626  continue;
627  }
628 
629 
630  }
631  }, aff_p);
632 
633  }
634  else {
635 
636 /*
637 AVX kernel developed according to https://github.com/qulacs/qulacs/blob/main/src/csim/update_ops_matrix_dense_single.cpp
638 
639 under MIT License
640 
641 Copyright (c) 2018 Qulacs Authors
642 
643 Permission is hereby granted, free of charge, to any person obtaining a copy
644 of this software and associated documentation files (the "Software"), to deal
645 in the Software without restriction, including without limitation the rights
646 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
647 copies of the Software, and to permit persons to whom the Software is
648 furnished to do so, subject to the following conditions:
649 
650 The above copyright notice and this permission notice shall be included in all
651 copies or substantial portions of the Software.
652 */
653  __m256d mv00 = _mm256_set_pd(-u3_1qbit[0].imag, u3_1qbit[0].real, -u3_1qbit[0].imag, u3_1qbit[0].real);
654  __m256d mv01 = _mm256_set_pd( u3_1qbit[0].real, u3_1qbit[0].imag, u3_1qbit[0].real, u3_1qbit[0].imag);
655  __m256d mv10 = _mm256_set_pd(-u3_1qbit[1].imag, u3_1qbit[1].real, -u3_1qbit[1].imag, u3_1qbit[1].real);
656  __m256d mv11 = _mm256_set_pd( u3_1qbit[1].real, u3_1qbit[1].imag, u3_1qbit[1].real, u3_1qbit[1].imag);
657  __m256d mv20 = _mm256_set_pd(-u3_1qbit[2].imag, u3_1qbit[2].real, -u3_1qbit[2].imag, u3_1qbit[2].real);
658  __m256d mv21 = _mm256_set_pd( u3_1qbit[2].real, u3_1qbit[2].imag, u3_1qbit[2].real, u3_1qbit[2].imag);
659  __m256d mv30 = _mm256_set_pd(-u3_1qbit[3].imag, u3_1qbit[3].real, -u3_1qbit[3].imag, u3_1qbit[3].real);
660  __m256d mv31 = _mm256_set_pd( u3_1qbit[3].real, u3_1qbit[3].imag, u3_1qbit[3].real, u3_1qbit[3].imag);
661 
662 
663  tbb::parallel_for( tbb::blocked_range<int>(0,matrix_size/2,grain_size), [&](tbb::blocked_range<int> r) {
664 
665  for (int idx=r.begin(); idx<r.end(); idx=idx+2) {
666  // generate index by inserting state 0 into the place of the target qbit while pushing high bits left by one
667  int current_idx = ((idx & bitmask_high) << 1) | (idx & bitmask_low);
668 
669  // the index pair with target qubit state 1
670  int current_idx_pair = current_idx | (1<<target_qbit);
671 
672  if (control_qbit < 0 || (current_idx & control_qbit_step_index) ) {
673 
674 
675  double* element = (double*)input.get_data() + 2 * current_idx;
676  double* element_pair = (double*)input.get_data() + 2 * current_idx_pair;
677 
678 
679  __m256d data0 = _mm256_loadu_pd(element);
680  __m256d data1 = _mm256_loadu_pd(element_pair);
681 
682  __m256d data_u2 = _mm256_mul_pd(data0, mv00);
683  __m256d data_u3 = _mm256_mul_pd(data1, mv10);
684  __m256d data_u4 = _mm256_mul_pd(data0, mv01);
685  __m256d data_u5 = _mm256_mul_pd(data1, mv11);
686 
687  __m256d data_u6 = _mm256_hadd_pd(data_u2, data_u4);
688  __m256d data_u7 = _mm256_hadd_pd(data_u3, data_u5);
689 
690  __m256d data_d2 = _mm256_mul_pd(data0, mv20);
691  __m256d data_d3 = _mm256_mul_pd(data1, mv30);
692  __m256d data_d4 = _mm256_mul_pd(data0, mv21);
693  __m256d data_d5 = _mm256_mul_pd(data1, mv31);
694 
695  __m256d data_d6 = _mm256_hadd_pd(data_d2, data_d4);
696  __m256d data_d7 = _mm256_hadd_pd(data_d3, data_d5);
697 
698  __m256d data_r0 = _mm256_add_pd(data_u6, data_u7);
699  __m256d data_r1 = _mm256_add_pd(data_d6, data_d7);
700 
701  _mm256_storeu_pd(element, data_r0);
702  _mm256_storeu_pd(element_pair, data_r1);
703 
704 
705  }
706  else if (deriv) {
707  // when calculating derivatives, the constant element should be zeros
708  memset(input.get_data() + current_idx, 0.0, input.cols * sizeof(QGD_Complex16));
709  memset(input.get_data() + current_idx_pair, 0.0, input.cols * sizeof(QGD_Complex16));
710  }
711  else {
712  // leave the state as it is
713  continue;
714  }
715 
716 
717  //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;
718 
719 
720 
721  }
722  }, aff_p);
723 
724 
725 
726  } // else
727 
728 
729 
730 
731 }
732 
733 
734 
735 
736 
737 
738 
void apply_kernel_to_state_vector_input_parallel_OpenMP_AVX(Matrix &u3_1qbit, Matrix &input, const bool &deriv, const int &target_qbit, const int &control_qbit, const int &matrix_size)
Parallel AVX kernel on a state vector (parallelized with OpenMP)
void apply_kernel_to_state_vector_input_AVX(Matrix &u3_1qbit, Matrix &input, const bool &deriv, const int &target_qbit, const int &control_qbit, const int &matrix_size)
AVX kernel on a state vector.
data
load the unitary from file
Definition: example.py:51
QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
Call to calculate the product of two complex scalars.
Definition: common.cpp:259
void apply_kernel_to_state_vector_input_parallel_AVX(Matrix &u3_1qbit, Matrix &input, const bool &deriv, const int &target_qbit, const int &control_qbit, const int &matrix_size)
Parallel AVX kernel on a state vector (parallelized with Intel TBB)
scalar * get_data() const
Call to get the pointer to the stored data.
int cols
The number of columns.
Definition: matrix_base.hpp:44
matrix_size
[load Umtx]
Definition: example.py:58
Structure type representing complex numbers in the SQUANDER package.
Definition: QGDTypes.h:38
Class to store data of complex arrays and its properties.
Definition: matrix.h:38
double real
the real part of a complex number
Definition: QGDTypes.h:40
double imag
the imaginary part of a complex number
Definition: QGDTypes.h:42