Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitaries into one- and two-qubit gates
apply_large_kernel_to_input_AVX.cpp
1 /*
2 Created on Fri Jun 26 14:13:26 2020
3 Copyright 2020 Peter Rakyta, Ph.D.
4 
5 Licensed under the Apache License, Version 2.0 (the "License");
6 you may not use this file except in compliance with the License.
7 You may obtain a copy of the License at
8 
9  http://www.apache.org/licenses/LICENSE-2.0
10 
11 Unless required by applicable law or agreed to in writing, software
12 distributed under the License is distributed on an "AS IS" BASIS,
13 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 See the License for the specific language governing permissions and
15 limitations under the License.
16 
17 @author: Peter Rakyta, Ph.D.
18 */
25 #include "tbb/tbb.h"
26 
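// Gathers the complex value at element_outer into the lower 128-bit lane and the
// complex value at element_inner into the upper lane of a single __m256d, i.e.
// (outer.re, outer.im, inner.re, inner.im). Note that four doubles are loaded from
// each pointer, but only the first complex value of each load is kept.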
27 inline __m256d get_AVX_vector(double* element_outer, double* element_inner){
28 
29  __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
30  element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
31  __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
32  element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
33  __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
34  outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);
35 
36  return outer_inner_vec;
37 }
38 
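// Multiplies the two complex values packed in input_vec, (a.re, a.im, b.re, b.im), by
// the two complex values packed in unitary_row_vec, (u.re, u.im, v.re, v.im). The
// returned vector holds the real parts of a*u and b*v in its lower 128-bit lane and the
// imaginary parts in its upper lane, so a subsequent _mm256_hadd_pd reduces the two
// products to the real and imaginary part of a*u + b*v. The neg argument is expected
// to be (1, -1, 1, -1).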
39 inline __m256d complex_mult_AVX(__m256d input_vec, __m256d unitary_row_vec, __m256d neg){
40 
41  __m256d vec3 = _mm256_mul_pd(input_vec, unitary_row_vec);
42  __m256d unitary_row_switched = _mm256_permute_pd(unitary_row_vec, 0x5);
43  unitary_row_switched = _mm256_mul_pd(unitary_row_switched, neg);
44  __m256d vec4 = _mm256_mul_pd(input_vec, unitary_row_switched);
45  __m256d result_vec = _mm256_hsub_pd(vec3, vec4);
46  result_vec = _mm256_permute4x64_pd(result_vec,0b11011000);
47 
48  return result_vec;
49 }
50 
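// Dispatcher: a single-column input is treated as a state vector and handed to the
// state-vector kernel, otherwise the two-qubit kernel is applied to every column of
// the input matrix. (Only the two-qubit path is active; the 3- and 4-qubit branches
// below are commented out.)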
58 void apply_large_kernel_to_input_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
59  if (input.cols==1){
60  //switch((int)involved_qbits.size()){
61  //case 2:{
62  //apply_2qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
63  apply_2qbit_kernel_to_state_vector_input_AVX(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
64  //}
65  //case 3:{
66  // apply_3qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
67  //}
68  //case 4:{
69  // apply_4qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
70  //}
71  //}
72  }
73  else{
74  apply_2qbit_kernel_to_matrix_input_AVX(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
75  }
76 }
77 
78 
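// index_step_inner = 2^inner_qbit and index_step_outer = 2^outer_qbit are the strides
// between amplitudes that differ only in the inner or outer qubit. Each iteration
// gathers the four coupled amplitudes at base, base + index_step_inner,
// base + index_step_outer and base + index_step_inner + index_step_outer and multiplies
// them with the 4x4 gate matrix. The inner_qbit == 0 branch packs neighbouring
// amplitudes together with the gate rows; the general branch broadcasts each gate entry
// over two adjacent basis states and processes them simultaneously.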
87 void apply_2qbit_kernel_to_state_vector_input_AVX(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){
88 
89  int index_step_outer = 1 << outer_qbit;
90  int index_step_inner = 1 << inner_qbit;
91  int current_idx = 0;
92  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
93  for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){
94 
95  for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
96  if (inner_qbit==0){
97  for (int idx=0; idx<index_step_inner; idx++){
98 
99 
100  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
101  int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
102  int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
103  int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
104  double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};
105 
106  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
107  double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
108  double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
109  double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
110 
111  __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
112  element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
113  __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
114  element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
115  __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
116  outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);
117 
118 
119  __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
120  element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec,0b11011000);
121  __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
122  element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec,0b11011000);
123  __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec,element_inner_pair_vec,0b0000);
124  outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec,0b11011000);
125 
126 
127 
128 
129  for (int mult_idx=0; mult_idx<4; mult_idx++){
130  double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
131  double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;
132 
133  __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
134  __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
135 
136  __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
137 
138  __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);
139 
140  __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
141  result_vec = _mm256_hadd_pd(result_vec,result_vec);
142  double* result = (double*)&result_vec;
143  results[mult_idx*2] = result[0];
144  results[mult_idx*2+1] = result[2];
145  }
146  input[current_idx_outer_loc].real = results[0];
147  input[current_idx_outer_loc].imag = results[1];
148  input[current_idx_inner_loc].real = results[2];
149  input[current_idx_inner_loc].imag = results[3];
150  input[current_idx_outer_pair_loc].real = results[4];
151  input[current_idx_outer_pair_loc].imag = results[5];
152  input[current_idx_inner_pair_loc].real = results[6];
153  input[current_idx_inner_pair_loc].imag = results[7];
154  }
155  }
156  else {
157 
158  for (int idx=0; idx<index_step_inner; idx=idx+2){
159 
160  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
161 
162  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
163  double* element_inner = element_outer + 2 * index_step_inner;
164 
165  double* element_outer_pair = element_outer + 2 * index_step_outer;
166  double* element_inner_pair = element_outer_pair + 2 * index_step_inner;
167 
168  double* indices[4] = {element_outer,element_inner,element_outer_pair,element_inner_pair};
169 
170  __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
171  __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
172 
173  __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
174  __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
175 
176  __m256d result_vecs[4];
177 
178  for (int mult_idx=0; mult_idx<4; mult_idx++){
179  double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
180  double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;
181 
182  __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01); // a,b,c,d
183  unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec,0b11011000); // a, c, b, d
184  __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b0000); // a, a, b, b
185  __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b1111); // c, c, d, d
186  unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec,0b11011000); // a, b, a, b
187  unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec,0b11011000); // c, d, c, d
188 
189  __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23); // a,b,c,d
190  unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec,0b11011000); // a, c, b, d
191  __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b0000); // a, a, b, b
192  __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b1111); // c, c, d, d
193  unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec,0b11011000); // a, b, a, b
194  unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec,0b11011000); // c, d, c, d
195 
196  __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
197  __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
198  unitary_col_0_switched = _mm256_mul_pd( unitary_col_0_switched, neg);
199  __m256d outer_vec_4 = _mm256_mul_pd( element_outer_vec, unitary_col_0_switched);
200  __m256d result_outer_vec = _mm256_hsub_pd( outer_vec_3, outer_vec_4);
201 
202 
203  __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
204  __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
205  unitary_col_1_switched = _mm256_mul_pd( unitary_col_1_switched, neg);
206  __m256d inner_vec_4 = _mm256_mul_pd( element_inner_vec, unitary_col_1_switched);
207  __m256d result_inner_vec = _mm256_hsub_pd( inner_vec_3, inner_vec_4);
208 
209  __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
210  __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
211  unitary_col_2_switched = _mm256_mul_pd( unitary_col_2_switched, neg);
212  __m256d outer_pair_vec_4 = _mm256_mul_pd( element_outer_pair_vec, unitary_col_2_switched);
213  __m256d result_outer_pair_vec = _mm256_hsub_pd( outer_pair_vec_3, outer_pair_vec_4);
214 
215 
216  __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
217  __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
218  unitary_col_3_switched = _mm256_mul_pd( unitary_col_3_switched, neg);
219  __m256d inner_pair_vec_4 = _mm256_mul_pd( element_inner_pair_vec, unitary_col_3_switched);
220  __m256d result_inner_pair_vec = _mm256_hsub_pd( inner_pair_vec_3, inner_pair_vec_4);
221 
222  __m256d result_vec = _mm256_add_pd(result_outer_vec,result_inner_vec);
223  result_vec = _mm256_add_pd(result_vec,result_outer_pair_vec);
224  result_vec = _mm256_add_pd(result_vec,result_inner_pair_vec);
225  result_vecs[mult_idx] = result_vec;
226  }
227  for (int row_idx=0; row_idx<4;row_idx++){
228  _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
229  }
230 
231  }
232 
233  }
234  }
235  current_idx = current_idx + (index_step_outer << 1);
236  }
237 
238 }
239 
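// Same index decomposition as the state-vector kernel above, but the four coupled
// indices now select whole rows of the input matrix; the 4x4 gate is applied to each
// column independently (the column loop is parallelised with OpenMP).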
248 void apply_2qbit_kernel_to_matrix_input_AVX(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){
249  int index_step_outer = 1 << outer_qbit;
250  int index_step_inner = 1 << inner_qbit;
251  int current_idx = 0;
252  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
253  for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){
254 
255  for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
256  for (int idx=0; idx<index_step_inner; idx++){
257 
258  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
259  int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
260  int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
261  int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
262 
263  int row_offset_outer = current_idx_outer_loc*input.cols;
264  int row_offset_inner = current_idx_inner_loc*input.cols;
265  int row_offset_outer_pair = current_idx_outer_pair_loc*input.cols;
266  int row_offset_inner_pair = current_idx_inner_pair_loc*input.cols;
267  #pragma omp parallel for
268  for (int col_idx=0; col_idx<input.cols;col_idx++){
269 
270  int current_idx_outer = row_offset_outer + col_idx;
271  int current_idx_inner = row_offset_inner + col_idx;
272  int current_idx_outer_pair = row_offset_outer_pair + col_idx;
273  int current_idx_inner_pair = row_offset_inner_pair + col_idx;
274 
275  double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};
276 
277  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer;
278  double* element_inner = (double*)input.get_data() + 2 * current_idx_inner;
279  double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair;
280  double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair;
281 
282  __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
283  element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
284  __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
285  element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
286  __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
287  outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);
288 
289 
290  __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
291  element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec,0b11011000);
292  __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
293  element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec,0b11011000);
294  __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec,element_inner_pair_vec,0b0000);
295  outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec,0b11011000);
296 
297 
298 
299 
300  for (int mult_idx=0; mult_idx<4; mult_idx++){
301  double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
302  double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;
303 
304  __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
305  __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
306 
307  __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
308 
309  __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);
310 
311  __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
312  result_vec = _mm256_hadd_pd(result_vec,result_vec);
313  double* result = (double*)&result_vec;
314  results[mult_idx*2] = result[0];
315  results[mult_idx*2+1] = result[2];
316  }
317  input[current_idx_outer].real = results[0];
318  input[current_idx_outer].imag = results[1];
319  input[current_idx_inner].real = results[2];
320  input[current_idx_inner].imag = results[3];
321  input[current_idx_outer_pair].real = results[4];
322  input[current_idx_outer_pair].imag = results[5];
323  input[current_idx_inner_pair].real = results[6];
324  input[current_idx_inner_pair].imag = results[7];
325  }
326  }
327 
328  }
329  current_idx = current_idx + (index_step_outer << 1);
330  }
331 }
332 
333 
334 
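// TBB-parallelised variant of the two-qubit state-vector kernel: the outer and inner
// index ranges are split into tbb::blocked_range tasks. For inner_qbit < 2 the four
// coupled amplitudes are gathered with get_AVX_vector and multiplied with the gate
// rows; otherwise each gate entry is broadcast over two adjacent basis states, as in
// the serial kernel.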
343 void apply_2qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& two_qbit_unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
344  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
345 
346  int inner_qbit = involved_qbits[0];
347 
348  int outer_qbit = involved_qbits[1];
349 
350  int index_step_outer = 1 << outer_qbit;
351 
352  int index_step_inner = 1 << inner_qbit;
353 
354 
355  int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
356 
357  int parallel_inner_cycles = index_step_outer/(index_step_inner << 1);
358 
359  //int outer_grain_size = get_grain_size(index_step_outer);
360  //int inner_grain_size = get_grain_size(index_step_inner);
361 
362  int outer_grain_size;
363  if ( index_step_outer <= 2 ) {
364  outer_grain_size = 64;
365  }
366  else if ( index_step_outer <= 4 ) {
367  outer_grain_size = 32;
368  }
369  else if ( index_step_outer <= 8 ) {
370  outer_grain_size = 16;
371  }
372  else if ( index_step_outer <= 16 ) {
373  outer_grain_size = 8;
374  }
375  else {
376  outer_grain_size = 2;
377  }
378 outer_grain_size = 6000000;
379  int inner_grain_size = 64000000;
380 
381 
382  tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {
383 
384  int current_idx = r.begin()*(index_step_outer<<1);
385  int current_idx_pair_outer = current_idx + index_step_outer;
386 
387 
388  for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){
389 
390  tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {
391 
392  int current_idx_inner = r.begin()*(index_step_inner<<1);
393 
394  for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){
395 
396  if (inner_qbit<2){
397  for (int idx=0; idx<index_step_inner; ++idx){
398 
399  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
400  int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
401  int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
402  int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
403  double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};
404 
405  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
406  double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
407  double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
408  double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
409 
410  __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
411 
412 
413 
414  __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
415 
416 
417  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
418 
419  for (int mult_idx=0; mult_idx<4; mult_idx++){
420  double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
421  double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;
422 
423  __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
424  __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
425 
426  __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
427 
428  __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);
429 
430 
431  __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
432  result_vec = _mm256_hadd_pd(result_vec,result_vec);
433  double* result = (double*)&result_vec;
434  results[mult_idx*2] = result[0];
435  results[mult_idx*2+1] = result[2];
436  }
437 
438  input[current_idx_outer_loc].real = results[0];
439  input[current_idx_outer_loc].imag = results[1];
440  input[current_idx_inner_loc].real = results[2];
441  input[current_idx_inner_loc].imag = results[3];
442  input[current_idx_outer_pair_loc].real = results[4];
443  input[current_idx_outer_pair_loc].imag = results[5];
444  input[current_idx_inner_pair_loc].real = results[6];
445  input[current_idx_inner_pair_loc].imag = results[7];
446  }
447 
448  }
449  else{
450 
451  for (int alt_idx=0; alt_idx<index_step_inner/2; ++alt_idx){
452 
453  int idx = alt_idx*2;
454  int current_idx_outer_loc = current_idx + current_idx_inner + idx;
455 
456  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
457  double* element_inner = element_outer + 2 * index_step_inner;
458 
459  double* element_outer_pair = element_outer + 2 * index_step_outer;
460  double* element_inner_pair = element_outer_pair + 2 * index_step_inner;
461 
462  double* indices[4] = {element_outer,element_inner,element_outer_pair,element_inner_pair};
463 
464  __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
465  __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
466 
467  __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
468  __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
469 
470  __m256d result_vecs[4];
471 
472  for (int mult_idx=0; mult_idx<4; mult_idx++){
473  double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
474  double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;
475 
476  __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01); // a,b,c,d
477  unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec,0b11011000); // a, c, b, d
478  __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b0000); // a, a, b, b
479  __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b1111); // c, c, d, d
480  unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec,0b11011000); // a, b, a, b
481  unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec,0b11011000); // c, d, c, d
482 
483  __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23); // a,b,c,d
484  unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec,0b11011000); // a, c, b, d
485  __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b0000); // a, a, b, b
486  __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b1111); // c, c, d, d
487  unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec,0b11011000); // a, b, a, b
488  unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec,0b11011000); // c, d, c, d
489 
490  __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
491  __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
492  unitary_col_0_switched = _mm256_mul_pd( unitary_col_0_switched, neg);
493  __m256d outer_vec_4 = _mm256_mul_pd( element_outer_vec, unitary_col_0_switched);
494  __m256d result_outer_vec = _mm256_hsub_pd( outer_vec_3, outer_vec_4);
495 
496 
497  __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
498  __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
499  unitary_col_1_switched = _mm256_mul_pd( unitary_col_1_switched, neg);
500  __m256d inner_vec_4 = _mm256_mul_pd( element_inner_vec, unitary_col_1_switched);
501  __m256d result_inner_vec = _mm256_hsub_pd( inner_vec_3, inner_vec_4);
502 
503  __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
504  __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
505  unitary_col_2_switched = _mm256_mul_pd( unitary_col_2_switched, neg);
506  __m256d outer_pair_vec_4 = _mm256_mul_pd( element_outer_pair_vec, unitary_col_2_switched);
507  __m256d result_outer_pair_vec = _mm256_hsub_pd( outer_pair_vec_3, outer_pair_vec_4);
508 
509 
510  __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
511  __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
512  unitary_col_3_switched = _mm256_mul_pd( unitary_col_3_switched, neg);
513  __m256d inner_pair_vec_4 = _mm256_mul_pd( element_inner_pair_vec, unitary_col_3_switched);
514  __m256d result_inner_pair_vec = _mm256_hsub_pd( inner_pair_vec_3, inner_pair_vec_4);
515 
516  __m256d result_vec = _mm256_add_pd(result_outer_vec,result_inner_vec);
517  result_vec = _mm256_add_pd(result_vec,result_outer_pair_vec);
518  result_vec = _mm256_add_pd(result_vec,result_inner_pair_vec);
519  result_vecs[mult_idx] = result_vec;
520  }
521 
522  for (int row_idx=0; row_idx<4;row_idx++){
523  _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
524  }
525 
526  }
527  }
528 
529  current_idx_inner = current_idx_inner +(index_step_inner << 1);
530 
531  }
532  });
533 
534  current_idx = current_idx + (index_step_outer << 1);
535  current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
536 
537  }
538  });
539 
540 }
541 
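// Three-qubit kernel: the eight coupled amplitudes (all combinations of the three
// involved qubits) are gathered into four packed vectors and multiplied with the 8x8
// gate matrix row by row. The outer, middle and inner index ranges, as well as the
// innermost idx loop, are parallelised with nested tbb::parallel_for calls.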
549 void apply_3qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
550 
551  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
552 
553  int index_step_inner = 1 << involved_qbits[0];
554 
555  int index_step_middle = 1 << involved_qbits[1];
556 
557  int index_step_outer = 1 << involved_qbits[2];
558 
559 
560  int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
561 
562  int parallel_inner_cycles = index_step_middle/(index_step_inner << 1);
563 
564  int parallel_middle_cycles = index_step_outer/(index_step_middle << 1);
565 
566 
567  int outer_grain_size = get_grain_size(index_step_outer);
568  int inner_grain_size = get_grain_size(index_step_outer);
569  int middle_grain_size = get_grain_size(index_step_middle);
570 
571  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {
572 
573  int current_idx = r.begin()*(index_step_outer<<1);
574 
575  int current_idx_pair_outer = current_idx + index_step_outer;
576 
577 
578  for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){
579 
580  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle_cycles,middle_grain_size), [&](tbb::blocked_range<int> r) {
581 
582  int current_idx_middle = r.begin()*(index_step_middle<<1);
583  for (int middle_rdx=r.begin(); middle_rdx<r.end(); middle_rdx++){
584 
585  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_inner_cycles,inner_grain_size), [&](tbb::blocked_range<int> r) {
586  int current_idx_inner = r.begin()*(index_step_inner<<1);
587 
588  for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){
589 
590  tbb::parallel_for(tbb::blocked_range<int>(0,index_step_inner,64),[&](tbb::blocked_range<int> r){
591 
592  for (int idx=r.begin(); idx<r.end(); ++idx){
593 
594  int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
595  int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;
596 
597  int current_idx_outer_loc = current_idx_loc;
598  int current_idx_inner_loc = current_idx_loc + index_step_inner;
599 
600  int current_idx_middle_loc = current_idx_loc + index_step_middle;
601  int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;
602 
603  int current_idx_outer_pair_loc = current_idx_pair_loc;
604  int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;
605 
606  int current_idx_middle_pair_loc =current_idx_pair_loc + index_step_middle;
607  int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;
608 
609  QGD_Complex16 results[8];
610 
611  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
612  double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
613 
614  double* element_middle = (double*)input.get_data() + 2 * current_idx_middle_loc;
615  double* element_middle_inner = (double*)input.get_data() + 2 * current_idx_middle_inner_loc;
616 
617  double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
618  double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
619 
620  double* element_middle_pair = (double*)input.get_data() + 2 * current_idx_middle_pair_loc;
621  double* element_middle_inner_pair = (double*)input.get_data() + 2 * current_idx_middle_inner_pair_loc;
622 
623  __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
624 
625  __m256d middle_inner_vec = get_AVX_vector(element_middle,element_middle_inner);
626 
627  __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
628 
629  __m256d middle_inner_pair_vec = get_AVX_vector(element_middle_pair, element_middle_inner_pair);
630 
631 
632 
633  for (int mult_idx=0; mult_idx<8; mult_idx++){
634 
635  double* unitary_row_01 = (double*)unitary.get_data() + 16*mult_idx;
636  double* unitary_row_23 = (double*)unitary.get_data() + 16*mult_idx + 4;
637  double* unitary_row_45 = (double*)unitary.get_data() + 16*mult_idx + 8;
638  double* unitary_row_67 = (double*)unitary.get_data() + 16*mult_idx + 12;
639 
640  __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
641  __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
642  __m256d unitary_row_45_vec = _mm256_loadu_pd(unitary_row_45);
643  __m256d unitary_row_67_vec = _mm256_loadu_pd(unitary_row_67);
644 
645 
646  __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
647 
648  __m256d result_upper_middle_vec = complex_mult_AVX(middle_inner_vec,unitary_row_23_vec,neg);
649 
650  __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_45_vec,neg);
651 
652  __m256d result_lower_middle_vec = complex_mult_AVX(middle_inner_pair_vec,unitary_row_67_vec,neg);
653 
654 
655  __m256d result1_vec = _mm256_hadd_pd(result_upper_vec,result_upper_middle_vec);
656  __m256d result2_vec = _mm256_hadd_pd(result_lower_vec,result_lower_middle_vec);
657  __m256d result_vec = _mm256_hadd_pd(result1_vec,result2_vec);
658  result_vec = _mm256_hadd_pd(result_vec,result_vec);
659  double* result = (double*)&result_vec;
660  results[mult_idx].real = result[0];
661  results[mult_idx].imag = result[2];
662  }
663  input[current_idx_outer_loc] = results[0];
664  input[current_idx_inner_loc] = results[1];
665  input[current_idx_middle_loc] = results[2];
666  input[current_idx_middle_inner_loc] = results[3];
667  input[current_idx_outer_pair_loc] = results[4];
668  input[current_idx_inner_pair_loc] = results[5];
669  input[current_idx_middle_pair_loc] = results[6];
670  input[current_idx_middle_inner_pair_loc] = results[7];
671  }
672  });
673 
674 
675  current_idx_inner = current_idx_inner +(index_step_inner << 1);
676  }
677  });
678  current_idx_middle = current_idx_middle +(index_step_middle << 1);
679  }
680  });
681  current_idx = current_idx + (index_step_outer << 1);
682  current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
683 
684  }
685  });
686 }
687 
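// Four-qubit kernel: the sixteen coupled amplitudes are gathered into eight packed
// vectors and multiplied with the 16x16 gate matrix row by row. Nested
// tbb::parallel_for levels cover the outer, middle2, middle1 and inner index ranges
// and the innermost idx loop.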
695 void apply_4qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
696 
697  __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
698 
699  int index_step_inner = 1 << involved_qbits[0];
700 
701  int index_step_middle1 = 1 << involved_qbits[1];
702 
703  int index_step_middle2 = 1 << involved_qbits[2];
704 
705  int index_step_outer = 1 << involved_qbits[3];
706 
707  int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
708 
709  int parallel_inner_cycles = index_step_middle1/(index_step_inner << 1);
710 
711  int parallel_middle1_cycles = index_step_middle2/(index_step_middle1 << 1);
712 
713  int parallel_middle2_cycles = index_step_outer/(index_step_middle2 << 1);
714 
715  int outer_grain_size = get_grain_size(index_step_outer);
716  int inner_grain_size = get_grain_size(index_step_outer);
717  int middle1_grain_size = get_grain_size(index_step_middle1);
718  int middle2_grain_size = get_grain_size(index_step_middle2);
719 
720  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {
721 
722  int current_idx = r.begin()*(index_step_outer<<1);
723 
724  int current_idx_pair_outer = current_idx + index_step_outer;
725 
726 
727  for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){
728 
729  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle2_cycles,middle2_grain_size), [&](tbb::blocked_range<int> r) {
730 
731  int current_idx_middle2 = r.begin()*(index_step_middle2<<1);
732 
733  for (int middle2_rdx=r.begin(); middle2_rdx<r.end(); middle2_rdx++){
734 
735  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle1_cycles,middle1_grain_size), [&](tbb::blocked_range<int> r) {
736 
737  int current_idx_middle1 = r.begin()*(index_step_middle1<<1);
738 
739  for (int middle1_rdx=r.begin(); middle1_rdx<r.end(); middle1_rdx++){
740 
741  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_inner_cycles,inner_grain_size), [&](tbb::blocked_range<int> r) {
742 
743  int current_idx_inner = r.begin()*(index_step_inner<<1);
744 
745  for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){
746 
747  tbb::parallel_for(tbb::blocked_range<int>(0,index_step_inner,64),[&](tbb::blocked_range<int> r){
748 
749  for (int idx=r.begin(); idx<r.end(); ++idx){
750 
751  int current_idx_loc = current_idx + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2 ;
752  int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;
753 
754  int current_idx_outer_loc = current_idx_loc;
755  int current_idx_inner_loc = current_idx_loc + index_step_inner;
756 
757  int current_idx_middle1_loc = current_idx_loc + index_step_middle1;
758  int current_idx_middle1_inner_loc = current_idx_loc + index_step_middle1 + index_step_inner;
759 
760  int current_idx_middle2_loc = current_idx_loc + index_step_middle2;
761  int current_idx_middle2_inner_loc = current_idx_loc + index_step_middle2 + index_step_inner;
762 
763  int current_idx_middle12_loc = current_idx_loc + index_step_middle1 + index_step_middle2;
764  int current_idx_middle12_inner_loc = current_idx_loc + index_step_middle1 + index_step_middle2 + index_step_inner;
765 
766  int current_idx_outer_pair_loc = current_idx_pair_loc;
767  int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;
768 
769  int current_idx_middle1_pair_loc =current_idx_pair_loc + index_step_middle1;
770  int current_idx_middle1_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_inner;
771 
772  int current_idx_middle2_pair_loc = current_idx_pair_loc + index_step_middle2;
773  int current_idx_middle2_inner_pair_loc = current_idx_pair_loc + index_step_middle2 + index_step_inner;
774 
775  int current_idx_middle12_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2;
776  int current_idx_middle12_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2 + index_step_inner;
777 
778  QGD_Complex16 results[16];
779 
780  double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
781  double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
782 
783  double* element_middle1 = (double*)input.get_data() + 2 * current_idx_middle1_loc;
784  double* element_middle1_inner = (double*)input.get_data() + 2 * current_idx_middle1_inner_loc;
785 
786  double* element_middle2 = (double*)input.get_data() + 2 * current_idx_middle2_loc;
787  double* element_middle2_inner = (double*)input.get_data() + 2 * current_idx_middle2_inner_loc;
788 
789  double* element_middle12 = (double*)input.get_data() + 2 * current_idx_middle12_loc;
790  double* element_middle12_inner = (double*)input.get_data() + 2 * current_idx_middle12_inner_loc;
791 
792  double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
793  double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;
794 
795  double* element_middle1_pair = (double*)input.get_data() + 2 * current_idx_middle1_pair_loc;
796  double* element_middle1_inner_pair = (double*)input.get_data() + 2 * current_idx_middle1_inner_pair_loc;
797 
798  double* element_middle2_pair = (double*)input.get_data() + 2 * current_idx_middle2_pair_loc;
799  double* element_middle2_inner_pair = (double*)input.get_data() + 2 * current_idx_middle2_inner_pair_loc;
800 
801  double* element_middle12_pair = (double*)input.get_data() + 2 * current_idx_middle12_pair_loc;
802  double* element_middle12_inner_pair = (double*)input.get_data() + 2 * current_idx_middle12_inner_pair_loc;
803 
804 
805  __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
806 
807  __m256d middle1_inner_vec = get_AVX_vector(element_middle1,element_middle1_inner);
808 
809  __m256d middle2_inner_vec = get_AVX_vector(element_middle2, element_middle2_inner);
810 
811  __m256d middle12_inner_vec = get_AVX_vector(element_middle12,element_middle12_inner);
812 
813  __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
814 
815  __m256d middle1_inner_pair_vec = get_AVX_vector(element_middle1_pair, element_middle1_inner_pair);
816 
817  __m256d middle2_inner_pair_vec = get_AVX_vector(element_middle2_pair, element_middle2_inner_pair);
818 
819  __m256d middle12_inner_pair_vec = get_AVX_vector(element_middle12_pair, element_middle12_inner_pair);
820 
821 
822  for (int mult_idx=0; mult_idx<16; mult_idx++){
823 
824  double* unitary_row_1 = (double*)unitary.get_data() + 32*mult_idx;
825  double* unitary_row_2 = (double*)unitary.get_data() + 32*mult_idx + 4;
826  double* unitary_row_3 = (double*)unitary.get_data() + 32*mult_idx + 8;
827  double* unitary_row_4 = (double*)unitary.get_data() + 32*mult_idx + 12;
828  double* unitary_row_5 = (double*)unitary.get_data() + 32*mult_idx + 16;
829  double* unitary_row_6 = (double*)unitary.get_data() + 32*mult_idx + 20;
830  double* unitary_row_7 = (double*)unitary.get_data() + 32*mult_idx + 24;
831  double* unitary_row_8 = (double*)unitary.get_data() + 32*mult_idx + 28;
832 
833  __m256d unitary_row_1_vec = _mm256_loadu_pd(unitary_row_1);
834  __m256d unitary_row_2_vec = _mm256_loadu_pd(unitary_row_2);
835  __m256d unitary_row_3_vec = _mm256_loadu_pd(unitary_row_3);
836  __m256d unitary_row_4_vec = _mm256_loadu_pd(unitary_row_4);
837  __m256d unitary_row_5_vec = _mm256_loadu_pd(unitary_row_5);
838  __m256d unitary_row_6_vec = _mm256_loadu_pd(unitary_row_6);
839  __m256d unitary_row_7_vec = _mm256_loadu_pd(unitary_row_7);
840  __m256d unitary_row_8_vec = _mm256_loadu_pd(unitary_row_8);
841 
842  __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_1_vec,neg);
843 
844  __m256d result_upper_middle1_vec = complex_mult_AVX(middle1_inner_vec,unitary_row_2_vec,neg);
845 
846  __m256d result_upper_middle2_vec = complex_mult_AVX(middle2_inner_vec,unitary_row_3_vec,neg);
847 
848  __m256d result_upper_middle12_vec = complex_mult_AVX(middle12_inner_vec,unitary_row_4_vec,neg);
849 
850  __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_5_vec,neg);
851 
852  __m256d result_lower_middle1_vec = complex_mult_AVX(middle1_inner_pair_vec,unitary_row_6_vec,neg);
853 
854  __m256d result_lower_middle2_vec = complex_mult_AVX(middle2_inner_pair_vec,unitary_row_7_vec,neg);
855 
856  __m256d result_lower_middle12_vec = complex_mult_AVX(middle12_inner_pair_vec,unitary_row_8_vec,neg);
857 
858 
859  __m256d result1_vec = _mm256_hadd_pd(result_upper_vec,result_upper_middle1_vec);
860  __m256d result2_vec = _mm256_hadd_pd(result_upper_middle2_vec,result_upper_middle12_vec);
861  __m256d result3_vec = _mm256_hadd_pd(result_lower_vec,result_lower_middle1_vec);
862  __m256d result4_vec = _mm256_hadd_pd(result_lower_middle2_vec,result_lower_middle12_vec);
863  __m256d result5_vec = _mm256_hadd_pd(result1_vec,result2_vec);
864  __m256d result6_vec = _mm256_hadd_pd(result3_vec,result4_vec);
865  __m256d result_vec = _mm256_hadd_pd(result5_vec,result6_vec);
866  result_vec = _mm256_hadd_pd(result_vec,result_vec);
867  double* result = (double*)&result_vec;
868  results[mult_idx].real = result[0];
869  results[mult_idx].imag = result[2];
870  }
871  input[current_idx_outer_loc] = results[0];
872  input[current_idx_inner_loc] = results[1];
873  input[current_idx_middle1_loc] = results[2];
874  input[current_idx_middle1_inner_loc] = results[3];
875  input[current_idx_middle2_loc] = results[4];
876  input[current_idx_middle2_inner_loc] = results[5];
877  input[current_idx_middle12_loc] = results[6];
878  input[current_idx_middle12_inner_loc] = results[7];
879  input[current_idx_outer_pair_loc] = results[8];
880  input[current_idx_inner_pair_loc] = results[9];
881  input[current_idx_middle1_pair_loc] = results[10];
882  input[current_idx_middle1_inner_pair_loc] = results[11];
883  input[current_idx_middle2_pair_loc] = results[12];
884  input[current_idx_middle2_inner_pair_loc] = results[13];
885  input[current_idx_middle12_pair_loc] = results[14];
886  input[current_idx_middle12_inner_pair_loc] = results[15];
887  }
888  });
889 
890 
891  current_idx_inner = current_idx_inner +(index_step_inner << 1);
892  }
893  });
894  current_idx_middle1 = current_idx_middle1 +(index_step_middle1 << 1);
895  }
896  });
897  current_idx_middle2 = current_idx_middle2 +(index_step_middle2 << 1);
898  }
899  });
900  current_idx = current_idx + (index_step_outer << 1);
901  current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);
902 
903  }
904  });
905 }
906 
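// Controlled-rotation kernel: for every row pair selected by target_qbit, rows whose
// control_qbit bit is set are transformed with u3_1qbit1 and the remaining rows with
// u3_1qbit2. Four complex matrix elements are processed per AVX iteration; the scalar
// remainder loop handles trailing columns when input.cols is not a multiple of four.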
916 void
917 apply_crot_kernel_to_matrix_input_AVX_parallel(Matrix& u3_1qbit1,Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {
918 
919 
920  input.ensure_aligned();
921 
922  int index_step_target = 1 << target_qbit;
923 
924  // load elements of the U3 unitary into 256bit registers (8 registers)
925  __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
926  __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
927  __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
928  __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
929  __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
930  __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
931  __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
932  __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);
933 
934  __m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
935  __m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
936  __m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
937  __m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
938  __m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
939  __m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
940  __m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
941  __m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);
942 
943 
944  int parallel_outer_cycles = matrix_size/(index_step_target << 1);
945  int outer_grain_size;
946  if ( index_step_target <= 2 ) {
947  outer_grain_size = 32;
948  }
949  else if ( index_step_target <= 4 ) {
950  outer_grain_size = 16;
951  }
952  else if ( index_step_target <= 8 ) {
953  outer_grain_size = 8;
954  }
955  else if ( index_step_target <= 16 ) {
956  outer_grain_size = 4;
957  }
958  else {
959  outer_grain_size = 2;
960  }
961 
962 
963  tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {
964 
965  int current_idx = r.begin()*(index_step_target << 1);
966  int current_idx_pair = index_step_target + r.begin()*(index_step_target << 1);
967 
968  for (int rdx=r.begin(); rdx<r.end(); rdx++) {
969 
970 
971  tbb::parallel_for( tbb::blocked_range<int>(0,index_step_target,32), [&](tbb::blocked_range<int> r) {
972  for (int idx=r.begin(); idx<r.end(); ++idx) {
973 
974 
975  int current_idx_loc = current_idx + idx;
976  int current_idx_pair_loc = current_idx_pair + idx;
977 
978  int row_offset = current_idx_loc * input.stride;
979  int row_offset_pair = current_idx_pair_loc * input.stride;
980 
981  if ((current_idx_loc >> control_qbit) & 1) {
982 
983 
984  double* element = (double*)input.get_data() + 2 * row_offset;
985  double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;
986 
987 
988  for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {
989 
990  // extract successive elements from arrays element, element_pair
991  __m256d element_vec = _mm256_load_pd(element + col_idx);
992  __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
993  __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
994  element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
995  element_vec = tmp;
996 
997  __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
998  __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
999  tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
1000  element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
1001  element_pair_vec = tmp;
1002 
1003  __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
1004  vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
1005  __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
1006  vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
1007  vec3 = _mm256_add_pd(vec3, vec4);
1008  __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
1009  vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
1010  __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
1011  vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
1012  vec5 = _mm256_add_pd(vec5, vec6);
1013 
1014  // store the transformed elements from vec3 and vec5
1015  tmp = _mm256_shuffle_pd(vec3, vec5, 0);
1016  vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
1017  vec3 = tmp;
1018  _mm256_store_pd(element + col_idx, vec3);
1019  _mm256_store_pd(element + col_idx + 4, vec5);
1020 
1021  __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
1022  vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
1023  __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
1024  vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
1025  vec7 = _mm256_add_pd(vec7, vec8);
1026  __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
1027  vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
1028  __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
1029  vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
1030  vec9 = _mm256_add_pd(vec9, vec10);
1031 
1032  // store the transformed elements from vec7 and vec9
1033  tmp = _mm256_shuffle_pd(vec7, vec9, 0);
1034  vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
1035  vec7 = tmp;
1036  _mm256_store_pd(element_pair + col_idx, vec7);
1037  _mm256_store_pd(element_pair + col_idx + 4, vec9);
1038  }
1039 
1040  int remainder = input.cols % 4;
1041  if (remainder != 0) {
1042 
1043  for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
1044  int index = row_offset + col_idx;
1045  int index_pair = row_offset_pair + col_idx;
1046 
1047  QGD_Complex16 element = input[index];
1048  QGD_Complex16 element_pair = input[index_pair];
1049 
1050  QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
1051  QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);
1052 
1053  input[index].real = tmp1.real + tmp2.real;
1054  input[index].imag = tmp1.imag + tmp2.imag;
1055 
1056  tmp1 = mult(u3_1qbit1[2], element);
1057  tmp2 = mult(u3_1qbit1[3], element_pair);
1058 
1059  input[index_pair].real = tmp1.real + tmp2.real;
1060  input[index_pair].imag = tmp1.imag + tmp2.imag;
1061  }
1062 
1063  }
1064 
1065  }
1066 
1067  else {
1068 
1069  double* element = (double*)input.get_data() + 2 * row_offset;
1070  double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;
1071 
1072 
1073  for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {
1074 
1075  // extract successive elements from arrays element, element_pair
1076  __m256d element_vec = _mm256_load_pd(element + col_idx);
1077  __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
1078  __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
1079  element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
1080  element_vec = tmp;
1081 
1082  __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
1083  __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
1084  tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
1085  element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
1086  element_pair_vec = tmp;
1087 
1088  __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
1089  vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
1090  __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
1091  vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
1092  vec3 = _mm256_add_pd(vec3, vec4);
1093  __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
1094  vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
1095  __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
1096  vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
1097  vec5 = _mm256_add_pd(vec5, vec6);
1098 
1099  // store the transformed elements from vec3 and vec5
1100  tmp = _mm256_shuffle_pd(vec3, vec5, 0);
1101  vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
1102  vec3 = tmp;
1103  _mm256_store_pd(element + col_idx, vec3);
1104  _mm256_store_pd(element + col_idx + 4, vec5);
1105 
1106  __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
1107  vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
1108  __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
1109  vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
1110  vec7 = _mm256_add_pd(vec7, vec8);
1111  __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
1112  vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
1113  __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
1114  vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
1115  vec9 = _mm256_add_pd(vec9, vec10);
1116 
1117  // store the transformed elements from vec7 and vec9
1118  tmp = _mm256_shuffle_pd(vec7, vec9, 0);
1119  vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
1120  vec7 = tmp;
1121  _mm256_store_pd(element_pair + col_idx, vec7);
1122  _mm256_store_pd(element_pair + col_idx + 4, vec9);
1123  }
1124 
1125  int remainder = input.cols % 4;
1126  if (remainder != 0) {
1127 
1128  for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
1129  int index = row_offset + col_idx;
1130  int index_pair = row_offset_pair + col_idx;
1131 
1132  QGD_Complex16 element = input[index];
1133  QGD_Complex16 element_pair = input[index_pair];
1134 
1135  QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
1136  QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);
1137 
1138  input[index].real = tmp1.real + tmp2.real;
1139  input[index].imag = tmp1.imag + tmp2.imag;
1140 
1141  tmp1 = mult(u3_1qbit2[2], element);
1142  tmp2 = mult(u3_1qbit2[3], element_pair);
1143 
1144  input[index_pair].real = tmp1.real + tmp2.real;
1145  input[index_pair].imag = tmp1.imag + tmp2.imag;
1146  }
1147 
1148  }
1149  }
1150 
1151 
1152  //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;
1153 
1154 
1155  }
1156  });
1157 
1158 
1159 
1160  current_idx = current_idx + (index_step_target << 1);
1161  current_idx_pair = current_idx_pair + (index_step_target << 1);
1162 
1163  }
1164  });
1165 
1166 
1167 }
1168 
int stride
The column stride of the array. (The array elements in one row are a_0, a_1, ... a_{cols-1}, 0, 0, 0, 0. The number of zeros is stride-cols)
Definition: matrix_base.hpp:46
void apply_2qbit_kernel_to_matrix_input_AVX(Matrix &two_qbit_unitary, Matrix &input, const int &inner_qbit, const int &outer_qbit, const int &matrix_size)
Call to apply kernel to apply two qubit gate kernel on an input matrix using AVX. ...
void ensure_aligned()
QGD_Complex16 mult(QGD_Complex16 &a, QGD_Complex16 &b)
Call to calculate the product of two complex scalars.
Definition: common.cpp:259
scalar * get_data() const
Call to get the pointer to the stored data.
void apply_large_kernel_to_input_AVX(Matrix &unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
Call to apply kernel to apply multi qubit gate kernel on an input matrix.
void apply_2qbit_kernel_to_state_vector_input_parallel_AVX(Matrix &two_qbit_unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
Call to apply kernel to apply two qubit gate kernel on a state vector using AVX and TBB...
int rows
The number of rows.
Definition: matrix_base.hpp:42
int cols
The number of columns.
Definition: matrix_base.hpp:44
Structure type representing complex numbers in the SQUANDER package.
Definition: QGDTypes.h:38
Class to store data of complex arrays and its properties.
Definition: matrix.h:38
void apply_crot_kernel_to_matrix_input_AVX_parallel(Matrix &u3_1qbit1, Matrix &u3_1qbit2, Matrix &input, const int &target_qbit, const int &control_qbit, const int &matrix_size)
Call to apply crot gate kernel on an input matrix using AVX and TBB.
__m256d complex_mult_AVX(__m256d input_vec, __m256d unitary_row_vec, __m256d neg)
void apply_3qbit_kernel_to_state_vector_input_parallel_AVX(Matrix &unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
Call to apply kernel to apply three qubit gate kernel on a state vector using AVX and TBB...
double real
the real part of a complex number
Definition: QGDTypes.h:40
__m256d get_AVX_vector(double *element_outer, double *element_inner)
int get_grain_size(int index_step)
void apply_2qbit_kernel_to_state_vector_input_AVX(Matrix &two_qbit_unitary, Matrix &input, const int &inner_qbit, const int &outer_qbit, const int &matrix_size)
Call to apply kernel to apply two qubit gate kernel on a state vector using AVX.
void apply_4qbit_kernel_to_state_vector_input_parallel_AVX(Matrix &unitary, Matrix &input, std::vector< int > involved_qbits, const int &matrix_size)
Call to apply kernel to apply four qubit gate kernel on a state vector using AVX and TBB...
double imag
the imaginary part of a complex number
Definition: QGDTypes.h:42
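A minimal usage sketch (illustrative, not part of the original source; it assumes the SQUANDER Matrix container can be constructed as Matrix(rows, cols) and that the kernels above are declared in an included header): a 4x4 two-qubit unitary is applied to qubits 0 and 1 of a 3-qubit state vector stored as a single-column Matrix.

#include <vector>
#include "matrix.h"   // SQUANDER complex matrix container (assumed include)

void example_apply_two_qbit_gate() {
    int n_qbits = 3;
    int matrix_size = 1 << n_qbits;              // 8 amplitudes
    Matrix state(matrix_size, 1);                // state vector as a single-column matrix
    Matrix gate(4, 4);                           // row-major 4x4 two-qubit unitary
    // ... fill state and gate with QGD_Complex16 values ...
    std::vector<int> involved_qbits = { 0, 1 };  // inner qubit first, then outer qubit
    apply_large_kernel_to_input_AVX(gate, state, involved_qbits, matrix_size);
}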