Sequential Quantum Gate Decomposer  v1.9.3
Powerful decomposition of general unitaries into one- and two-qubit gates
apply_large_kernel_to_input_AVX.cpp
/*
Created on Fri Jun 26 14:13:26 2020
Copyright 2020 Peter Rakyta, Ph.D.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

@author: Peter Rakyta, Ph.D.
*/
#include "tbb/tbb.h"

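// Helper: packs one complex amplitude from each of the two source addresses into a single 256-bit
// register laid out as [Re(outer), Im(outer), Re(inner), Im(inner)]. (The second complex value
// loaded from each address is discarded by the shuffle; only the first one is kept.) This layout
// lets complex_mult_AVX() evaluate two complex products per call.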
inline __m256d get_AVX_vector(double* element_outer, double* element_inner){

    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
    element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
    element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
    __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
    outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);

    return outer_inner_vec;
}

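// Helper: multiplies the two complex numbers held in input_vec ([a_re, a_im, b_re, b_im]) by the
// two complex numbers in unitary_row_vec using the usual multiply / swapped-multiply / hsub trick.
// The neg mask is expected to be (1, -1, 1, -1). After the final permute the result holds
// [Re(a*u0), Re(b*u1), Im(a*u0), Im(b*u1)], i.e. the real parts in the lower 128-bit lane and the
// imaginary parts in the upper lane, which is convenient for the horizontal adds done by the callers.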
inline __m256d complex_mult_AVX(__m256d input_vec, __m256d unitary_row_vec, __m256d neg){

    __m256d vec3 = _mm256_mul_pd(input_vec, unitary_row_vec);
    __m256d unitary_row_switched = _mm256_permute_pd(unitary_row_vec, 0x5);
    unitary_row_switched = _mm256_mul_pd(unitary_row_switched, neg);
    __m256d vec4 = _mm256_mul_pd(input_vec, unitary_row_switched);
    __m256d result_vec = _mm256_hsub_pd(vec3, vec4);
    result_vec = _mm256_permute4x64_pd(result_vec,0b11011000);

    return result_vec;
}

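// Entry point for applying a multi-qubit gate kernel. For a state-vector input (a single column)
// the two-qubit AVX kernel is called directly; the commented-out switch suggests the 3- and
// 4-qubit parallel kernels below are intended to be dispatched from here as well. For a matrix
// input the OpenMP-parallel two-qubit kernel is used. A minimal call sketch (illustrative only;
// `gate` and `state` are hypothetical Matrix objects holding a 4x4 unitary and a 2^n x 1 state):
//
//     std::vector<int> involved_qbits = {0, 1};   // inner qubit first, then outer qubit
//     apply_large_kernel_to_input_AVX(gate, state, involved_qbits, state.rows);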
void apply_large_kernel_to_input_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
    if (input.cols==1){
        //switch((int)involved_qbits.size()){
        //case 2:{
            //apply_2qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
            apply_2qbit_kernel_to_state_vector_input_AVX(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
        //}
        //case 3:{
        //    apply_3qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
        //}
        //case 4:{
        //    apply_4qbit_kernel_to_state_vector_input_parallel_AVX(unitary,input,involved_qbits,matrix_size);
        //}
        //}
    }
    else{
        apply_2qbit_kernel_to_matrix_input_AVX(unitary, input, involved_qbits[0], involved_qbits[1], matrix_size);
    }
}

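// Applies a 4x4 two-qubit gate to a state vector in place. For every basis index i whose inner and
// outer target-qubit bits are both 0, the amplitude quadruple
//     v = { psi[i], psi[i + 2^inner_qbit], psi[i + 2^outer_qbit], psi[i + 2^inner_qbit + 2^outer_qbit] }
// is gathered and overwritten with U*v. In scalar form (illustrative sketch only, not part of the
// build; U is the row-major 4x4 two_qbit_unitary):
//
//     QGD_Complex16 new_v[4];
//     for (int r = 0; r < 4; r++) {
//         new_v[r].real = 0.0;  new_v[r].imag = 0.0;
//         for (int c = 0; c < 4; c++) {
//             QGD_Complex16 prod = mult(U[r*4 + c], v[c]);
//             new_v[r].real += prod.real;
//             new_v[r].imag += prod.imag;
//         }
//     }
//
// The AVX code below performs this 4x4 complex matrix-vector product with two complex
// multiplications per 256-bit register: the inner_qbit==0 branch gathers the four amplitudes
// explicitly, while the other branch (see the comment inside it) works on two amplitude
// quadruples at a time.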
void apply_2qbit_kernel_to_state_vector_input_AVX(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){

    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;
    int current_idx = 0;
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
    for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
            if (inner_qbit==0){
                for (int idx=0; idx<index_step_inner; idx++){

                    int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                    int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                    int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                    int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
                    double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};

                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                    element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
                    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
                    element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
                    __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
                    outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);

                    __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                    element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec,0b11011000);
                    __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
                    element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec,0b11011000);
                    __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec,element_inner_pair_vec,0b0000);
                    outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec,0b11011000);

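                    // 4x4 complex matrix-vector product: for each row of the unitary the four
                    // complex products are formed with complex_mult_AVX and then summed with two
                    // horizontal adds; lanes 0 and 2 of result_vec hold the real and imaginary
                    // parts of the transformed amplitude.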
                    for (int mult_idx=0; mult_idx<4; mult_idx++){
                        double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
                        double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;

                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);

                        __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
                        result_vec = _mm256_hadd_pd(result_vec,result_vec);
                        double* result = (double*)&result_vec;
                        results[mult_idx*2] = result[0];
                        results[mult_idx*2+1] = result[2];
                    }
                    input[current_idx_outer_loc].real = results[0];
                    input[current_idx_outer_loc].imag = results[1];
                    input[current_idx_inner_loc].real = results[2];
                    input[current_idx_inner_loc].imag = results[3];
                    input[current_idx_outer_pair_loc].real = results[4];
                    input[current_idx_outer_pair_loc].imag = results[5];
                    input[current_idx_inner_pair_loc].real = results[6];
                    input[current_idx_inner_pair_loc].imag = results[7];
                }
            }
            else {

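                // inner_qbit > 0: two neighbouring basis-index groups are handled per iteration.
                // Each 256-bit register holds the same amplitude class (00, 01, 10 or 11 on the two
                // involved qubits) for both groups, the unitary entry U[r][c] is broadcast as
                // [re, im, re, im], and result_vecs[r] accumulates sum_c U[r][c]*v_c for both
                // groups at once before being stored back.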
                for (int idx=0; idx<index_step_inner; idx=idx+2){

                    int current_idx_outer_loc = current_idx + current_idx_inner + idx;

                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                    double* element_inner = element_outer + 2 * index_step_inner;

                    double* element_outer_pair = element_outer + 2 * index_step_outer;
                    double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

                    double* indices[4] = {element_outer,element_inner,element_outer_pair,element_inner_pair};

                    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

                    __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                    __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

                    __m256d result_vecs[4];

                    for (int mult_idx=0; mult_idx<4; mult_idx++){
                        double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
                        double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;

                        __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);                                // a,b,c,d
                        unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec,0b11011000);                   // a, c, b, d
                        __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b0000); // a, a, b, b
                        __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b1111); // c, c, d, d
                        unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec,0b11011000);                     // a, b, a, b
                        unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec,0b11011000);                     // c, d, c, d

                        __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);                                // a,b,c,d
                        unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec,0b11011000);                   // a, c, b, d
                        __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b0000); // a, a, b, b
                        __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b1111); // c, c, d, d
                        unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec,0b11011000);                     // a, b, a, b
                        unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec,0b11011000);                     // c, d, c, d

                        __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                        __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                        unitary_col_0_switched = _mm256_mul_pd( unitary_col_0_switched, neg);
                        __m256d outer_vec_4 = _mm256_mul_pd( element_outer_vec, unitary_col_0_switched);
                        __m256d result_outer_vec = _mm256_hsub_pd( outer_vec_3, outer_vec_4);

                        __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                        __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                        unitary_col_1_switched = _mm256_mul_pd( unitary_col_1_switched, neg);
                        __m256d inner_vec_4 = _mm256_mul_pd( element_inner_vec, unitary_col_1_switched);
                        __m256d result_inner_vec = _mm256_hsub_pd( inner_vec_3, inner_vec_4);

                        __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                        __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                        unitary_col_2_switched = _mm256_mul_pd( unitary_col_2_switched, neg);
                        __m256d outer_pair_vec_4 = _mm256_mul_pd( element_outer_pair_vec, unitary_col_2_switched);
                        __m256d result_outer_pair_vec = _mm256_hsub_pd( outer_pair_vec_3, outer_pair_vec_4);

                        __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                        __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                        unitary_col_3_switched = _mm256_mul_pd( unitary_col_3_switched, neg);
                        __m256d inner_pair_vec_4 = _mm256_mul_pd( element_inner_pair_vec, unitary_col_3_switched);
                        __m256d result_inner_pair_vec = _mm256_hsub_pd( inner_pair_vec_3, inner_pair_vec_4);

                        __m256d result_vec = _mm256_add_pd(result_outer_vec,result_inner_vec);
                        result_vec = _mm256_add_pd(result_vec,result_outer_pair_vec);
                        result_vec = _mm256_add_pd(result_vec,result_inner_pair_vec);
                        result_vecs[mult_idx] = result_vec;
                    }
                    for (int row_idx=0; row_idx<4; row_idx++){
                        _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
                    }

                }

            }
        }
        current_idx = current_idx + (index_step_outer << 1);
    }

}

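// Same two-qubit update as above, but for a matrix input: every selected row quadruple is
// transformed column by column (the columns are distributed over threads with OpenMP), using the
// row-wise AVX scheme of the inner_qbit==0 branch of the state-vector kernel.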
void apply_2qbit_kernel_to_matrix_input_AVX(Matrix& two_qbit_unitary, Matrix& input, const int& inner_qbit, const int& outer_qbit, const int& matrix_size){
    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;
    int current_idx = 0;
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);
    for (int current_idx_pair_outer=current_idx + index_step_outer; current_idx_pair_outer<input.rows; current_idx_pair_outer=current_idx_pair_outer+(index_step_outer << 1)){

        for (int current_idx_inner = 0; current_idx_inner < index_step_outer; current_idx_inner=current_idx_inner+(index_step_inner<<1)){
            for (int idx=0; idx<index_step_inner; idx++){

                int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;

                int row_offset_outer = current_idx_outer_loc*input.cols;
                int row_offset_inner = current_idx_inner_loc*input.cols;
                int row_offset_outer_pair = current_idx_outer_pair_loc*input.cols;
                int row_offset_inner_pair = current_idx_inner_pair_loc*input.cols;
                #pragma omp parallel for
                for (int col_idx=0; col_idx<input.cols; col_idx++){

                    int current_idx_outer = row_offset_outer + col_idx;
                    int current_idx_inner = row_offset_inner + col_idx;
                    int current_idx_outer_pair = row_offset_outer_pair + col_idx;
                    int current_idx_inner_pair = row_offset_inner_pair + col_idx;

                    double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};

                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer;
                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner;
                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair;
                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair;

                    __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                    element_outer_vec = _mm256_permute4x64_pd(element_outer_vec,0b11011000);
                    __m256d element_inner_vec = _mm256_loadu_pd(element_inner);
                    element_inner_vec = _mm256_permute4x64_pd(element_inner_vec,0b11011000);
                    __m256d outer_inner_vec = _mm256_shuffle_pd(element_outer_vec,element_inner_vec,0b0000);
                    outer_inner_vec = _mm256_permute4x64_pd(outer_inner_vec,0b11011000);

                    __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                    element_outer_pair_vec = _mm256_permute4x64_pd(element_outer_pair_vec,0b11011000);
                    __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);
                    element_inner_pair_vec = _mm256_permute4x64_pd(element_inner_pair_vec,0b11011000);
                    __m256d outer_inner_pair_vec = _mm256_shuffle_pd(element_outer_pair_vec,element_inner_pair_vec,0b0000);
                    outer_inner_pair_vec = _mm256_permute4x64_pd(outer_inner_pair_vec,0b11011000);

                    for (int mult_idx=0; mult_idx<4; mult_idx++){
                        double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
                        double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;

                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);

                        __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
                        result_vec = _mm256_hadd_pd(result_vec,result_vec);
                        double* result = (double*)&result_vec;
                        results[mult_idx*2] = result[0];
                        results[mult_idx*2+1] = result[2];
                    }
                    input[current_idx_outer].real = results[0];
                    input[current_idx_outer].imag = results[1];
                    input[current_idx_inner].real = results[2];
                    input[current_idx_inner].imag = results[3];
                    input[current_idx_outer_pair].real = results[4];
                    input[current_idx_outer_pair].imag = results[5];
                    input[current_idx_inner_pair].real = results[6];
                    input[current_idx_inner_pair].imag = results[7];
                }
            }

        }
        current_idx = current_idx + (index_step_outer << 1);
    }
}

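// Threaded (TBB) variant of the two-qubit state-vector kernel. The work is split into nested
// blocked ranges over the outer- and inner-qubit index blocks; within each block the same two data
// layouts as in the serial kernel are used, selected by the value of inner_qbit.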
void apply_2qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& two_qbit_unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){
    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int inner_qbit = involved_qbits[0];
    int outer_qbit = involved_qbits[1];

    int index_step_outer = 1 << outer_qbit;
    int index_step_inner = 1 << inner_qbit;

    int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
    int parallel_inner_cycles = index_step_outer/(index_step_inner << 1);

    //int outer_grain_size = get_grain_size(index_step_outer);
    //int inner_grain_size = get_grain_size(index_step_inner);

    int outer_grain_size;
    if ( index_step_outer <= 2 ) {
        outer_grain_size = 64;
    }
    else if ( index_step_outer <= 4 ) {
        outer_grain_size = 32;
    }
    else if ( index_step_outer <= 8 ) {
        outer_grain_size = 16;
    }
    else if ( index_step_outer <= 16 ) {
        outer_grain_size = 8;
    }
    else {
        outer_grain_size = 2;
    }
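    // Note: the hard-coded values below override the grain sizes selected above; with grains this
    // large the blocked ranges are subdivided only for very large inputs, so the loops typically
    // run serially within this call.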
    outer_grain_size = 6000000;
    int inner_grain_size = 64000000;

    tbb::parallel_for( tbb::blocked_range<int>(0, parallel_outer_cycles, outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin()*(index_step_outer<<1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){

            tbb::parallel_for( tbb::blocked_range<int>(0, parallel_inner_cycles, inner_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_inner = r.begin()*(index_step_inner<<1);

                for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){

                    if (inner_qbit<2){
                        for (int idx=0; idx<index_step_inner; ++idx){

                            int current_idx_outer_loc = current_idx + current_idx_inner + idx;
                            int current_idx_inner_loc = current_idx + current_idx_inner + idx + index_step_inner;
                            int current_idx_outer_pair_loc = current_idx_pair_outer + idx + current_idx_inner;
                            int current_idx_inner_pair_loc = current_idx_pair_outer + idx + current_idx_inner + index_step_inner;
                            double results[8] = {0.,0.,0.,0.,0.,0.,0.,0.};

                            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                            double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;
                            double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                            double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                            __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                            __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);

                            __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

                            for (int mult_idx=0; mult_idx<4; mult_idx++){
                                double* unitary_row_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
                                double* unitary_row_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;

                                __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                                __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);

                                __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
                                __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_23_vec,neg);

                                __m256d result_vec = _mm256_hadd_pd(result_upper_vec,result_lower_vec);
                                result_vec = _mm256_hadd_pd(result_vec,result_vec);
                                double* result = (double*)&result_vec;
                                results[mult_idx*2] = result[0];
                                results[mult_idx*2+1] = result[2];
                            }

                            input[current_idx_outer_loc].real = results[0];
                            input[current_idx_outer_loc].imag = results[1];
                            input[current_idx_inner_loc].real = results[2];
                            input[current_idx_inner_loc].imag = results[3];
                            input[current_idx_outer_pair_loc].real = results[4];
                            input[current_idx_outer_pair_loc].imag = results[5];
                            input[current_idx_inner_pair_loc].real = results[6];
                            input[current_idx_inner_pair_loc].imag = results[7];
                        }

                    }
                    else{

                        for (int alt_idx=0; alt_idx<index_step_inner/2; ++alt_idx){

                            int idx = alt_idx*2;
                            int current_idx_outer_loc = current_idx + current_idx_inner + idx;

                            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                            double* element_inner = element_outer + 2 * index_step_inner;

                            double* element_outer_pair = element_outer + 2 * index_step_outer;
                            double* element_inner_pair = element_outer_pair + 2 * index_step_inner;

                            double* indices[4] = {element_outer,element_inner,element_outer_pair,element_inner_pair};

                            __m256d element_outer_vec = _mm256_loadu_pd(element_outer);
                            __m256d element_inner_vec = _mm256_loadu_pd(element_inner);

                            __m256d element_outer_pair_vec = _mm256_loadu_pd(element_outer_pair);
                            __m256d element_inner_pair_vec = _mm256_loadu_pd(element_inner_pair);

                            __m256d result_vecs[4];

                            for (int mult_idx=0; mult_idx<4; mult_idx++){
                                double* unitary_col_01 = (double*)two_qbit_unitary.get_data() + 8*mult_idx;
                                double* unitary_col_23 = (double*)two_qbit_unitary.get_data() + 8*mult_idx + 4;

                                __m256d unitary_col_01_vec = _mm256_loadu_pd(unitary_col_01);                                // a,b,c,d
                                unitary_col_01_vec = _mm256_permute4x64_pd(unitary_col_01_vec,0b11011000);                   // a, c, b, d
                                __m256d unitary_col_0_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b0000); // a, a, b, b
                                __m256d unitary_col_1_vec = _mm256_shuffle_pd(unitary_col_01_vec,unitary_col_01_vec,0b1111); // c, c, d, d
                                unitary_col_0_vec = _mm256_permute4x64_pd(unitary_col_0_vec,0b11011000);                     // a, b, a, b
                                unitary_col_1_vec = _mm256_permute4x64_pd(unitary_col_1_vec,0b11011000);                     // c, d, c, d

                                __m256d unitary_col_23_vec = _mm256_loadu_pd(unitary_col_23);                                // a,b,c,d
                                unitary_col_23_vec = _mm256_permute4x64_pd(unitary_col_23_vec,0b11011000);                   // a, c, b, d
                                __m256d unitary_col_2_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b0000); // a, a, b, b
                                __m256d unitary_col_3_vec = _mm256_shuffle_pd(unitary_col_23_vec,unitary_col_23_vec,0b1111); // c, c, d, d
                                unitary_col_2_vec = _mm256_permute4x64_pd(unitary_col_2_vec,0b11011000);                     // a, b, a, b
                                unitary_col_3_vec = _mm256_permute4x64_pd(unitary_col_3_vec,0b11011000);                     // c, d, c, d

                                __m256d outer_vec_3 = _mm256_mul_pd(element_outer_vec, unitary_col_0_vec);
                                __m256d unitary_col_0_switched = _mm256_permute_pd(unitary_col_0_vec, 0x5);
                                unitary_col_0_switched = _mm256_mul_pd( unitary_col_0_switched, neg);
                                __m256d outer_vec_4 = _mm256_mul_pd( element_outer_vec, unitary_col_0_switched);
                                __m256d result_outer_vec = _mm256_hsub_pd( outer_vec_3, outer_vec_4);

                                __m256d inner_vec_3 = _mm256_mul_pd(element_inner_vec, unitary_col_1_vec);
                                __m256d unitary_col_1_switched = _mm256_permute_pd(unitary_col_1_vec, 0x5);
                                unitary_col_1_switched = _mm256_mul_pd( unitary_col_1_switched, neg);
                                __m256d inner_vec_4 = _mm256_mul_pd( element_inner_vec, unitary_col_1_switched);
                                __m256d result_inner_vec = _mm256_hsub_pd( inner_vec_3, inner_vec_4);

                                __m256d outer_pair_vec_3 = _mm256_mul_pd(element_outer_pair_vec, unitary_col_2_vec);
                                __m256d unitary_col_2_switched = _mm256_permute_pd(unitary_col_2_vec, 0x5);
                                unitary_col_2_switched = _mm256_mul_pd( unitary_col_2_switched, neg);
                                __m256d outer_pair_vec_4 = _mm256_mul_pd( element_outer_pair_vec, unitary_col_2_switched);
                                __m256d result_outer_pair_vec = _mm256_hsub_pd( outer_pair_vec_3, outer_pair_vec_4);

                                __m256d inner_pair_vec_3 = _mm256_mul_pd(element_inner_pair_vec, unitary_col_3_vec);
                                __m256d unitary_col_3_switched = _mm256_permute_pd(unitary_col_3_vec, 0x5);
                                unitary_col_3_switched = _mm256_mul_pd( unitary_col_3_switched, neg);
                                __m256d inner_pair_vec_4 = _mm256_mul_pd( element_inner_pair_vec, unitary_col_3_switched);
                                __m256d result_inner_pair_vec = _mm256_hsub_pd( inner_pair_vec_3, inner_pair_vec_4);

                                __m256d result_vec = _mm256_add_pd(result_outer_vec,result_inner_vec);
                                result_vec = _mm256_add_pd(result_vec,result_outer_pair_vec);
                                result_vec = _mm256_add_pd(result_vec,result_inner_pair_vec);
                                result_vecs[mult_idx] = result_vec;
                            }

                            for (int row_idx=0; row_idx<4; row_idx++){
                                _mm256_storeu_pd(indices[row_idx], result_vecs[row_idx]);
                            }

                        }
                    }

                    current_idx_inner = current_idx_inner + (index_step_inner << 1);

                }
            });

            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);

        }
    });

}

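// Threaded (TBB) 3-qubit kernel: the 8x8 unitary acts on the amplitude octet selected by the three
// involved qubits. Indices are decomposed into outer / middle / inner blocks, each handled by its
// own nested blocked range, and every amplitude octet is transformed with the same
// complex_mult_AVX + horizontal-add scheme as in the two-qubit kernels.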
void apply_3qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){

    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int index_step_inner = 1 << involved_qbits[0];
    int index_step_middle = 1 << involved_qbits[1];
    int index_step_outer = 1 << involved_qbits[2];

    int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
    int parallel_inner_cycles = index_step_middle/(index_step_inner << 1);
    int parallel_middle_cycles = index_step_outer/(index_step_middle << 1);

    int outer_grain_size = get_grain_size(index_step_outer);
    int inner_grain_size = get_grain_size(index_step_outer);
    int middle_grain_size = get_grain_size(index_step_middle);

    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin()*(index_step_outer<<1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){

            tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle_cycles,middle_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_middle = r.begin()*(index_step_middle<<1);

                for (int middle_rdx=r.begin(); middle_rdx<r.end(); middle_rdx++){

                    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_inner_cycles,inner_grain_size), [&](tbb::blocked_range<int> r) {

                        int current_idx_inner = r.begin()*(index_step_inner<<1);

                        for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){

                            tbb::parallel_for(tbb::blocked_range<int>(0,index_step_inner,64),[&](tbb::blocked_range<int> r){

                                for (int idx=r.begin(); idx<r.end(); ++idx){

                                    int current_idx_loc = current_idx + current_idx_middle + current_idx_inner + idx;
                                    int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle;

                                    int current_idx_outer_loc = current_idx_loc;
                                    int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                    int current_idx_middle_loc = current_idx_loc + index_step_middle;
                                    int current_idx_middle_inner_loc = current_idx_loc + index_step_middle + index_step_inner;

                                    int current_idx_outer_pair_loc = current_idx_pair_loc;
                                    int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                    int current_idx_middle_pair_loc = current_idx_pair_loc + index_step_middle;
                                    int current_idx_middle_inner_pair_loc = current_idx_pair_loc + index_step_middle + index_step_inner;

                                    QGD_Complex16 results[8];

                                    double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                    double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                    double* element_middle = (double*)input.get_data() + 2 * current_idx_middle_loc;
                                    double* element_middle_inner = (double*)input.get_data() + 2 * current_idx_middle_inner_loc;

                                    double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                    double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                    double* element_middle_pair = (double*)input.get_data() + 2 * current_idx_middle_pair_loc;
                                    double* element_middle_inner_pair = (double*)input.get_data() + 2 * current_idx_middle_inner_pair_loc;

                                    __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                    __m256d middle_inner_vec = get_AVX_vector(element_middle, element_middle_inner);
                                    __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                    __m256d middle_inner_pair_vec = get_AVX_vector(element_middle_pair, element_middle_inner_pair);

                                    for (int mult_idx=0; mult_idx<8; mult_idx++){

                                        double* unitary_row_01 = (double*)unitary.get_data() + 16*mult_idx;
                                        double* unitary_row_23 = (double*)unitary.get_data() + 16*mult_idx + 4;
                                        double* unitary_row_45 = (double*)unitary.get_data() + 16*mult_idx + 8;
                                        double* unitary_row_67 = (double*)unitary.get_data() + 16*mult_idx + 12;

                                        __m256d unitary_row_01_vec = _mm256_loadu_pd(unitary_row_01);
                                        __m256d unitary_row_23_vec = _mm256_loadu_pd(unitary_row_23);
                                        __m256d unitary_row_45_vec = _mm256_loadu_pd(unitary_row_45);
                                        __m256d unitary_row_67_vec = _mm256_loadu_pd(unitary_row_67);

                                        __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_01_vec,neg);
                                        __m256d result_upper_middle_vec = complex_mult_AVX(middle_inner_vec,unitary_row_23_vec,neg);
                                        __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_45_vec,neg);
                                        __m256d result_lower_middle_vec = complex_mult_AVX(middle_inner_pair_vec,unitary_row_67_vec,neg);

                                        __m256d result1_vec = _mm256_hadd_pd(result_upper_vec,result_upper_middle_vec);
                                        __m256d result2_vec = _mm256_hadd_pd(result_lower_vec,result_lower_middle_vec);
                                        __m256d result_vec = _mm256_hadd_pd(result1_vec,result2_vec);
                                        result_vec = _mm256_hadd_pd(result_vec,result_vec);
                                        double* result = (double*)&result_vec;
                                        results[mult_idx].real = result[0];
                                        results[mult_idx].imag = result[2];
                                    }
                                    input[current_idx_outer_loc] = results[0];
                                    input[current_idx_inner_loc] = results[1];
                                    input[current_idx_middle_loc] = results[2];
                                    input[current_idx_middle_inner_loc] = results[3];
                                    input[current_idx_outer_pair_loc] = results[4];
                                    input[current_idx_inner_pair_loc] = results[5];
                                    input[current_idx_middle_pair_loc] = results[6];
                                    input[current_idx_middle_inner_pair_loc] = results[7];
                                }
                            });

                            current_idx_inner = current_idx_inner + (index_step_inner << 1);
                        }
                    });
                    current_idx_middle = current_idx_middle + (index_step_middle << 1);
                }
            });
            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);

        }
    });
}

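// Threaded (TBB) 4-qubit kernel: same structure as the 3-qubit version with one more level of
// nesting; the 16x16 unitary is applied to the 16 amplitudes selected by the four involved qubits.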
void apply_4qbit_kernel_to_state_vector_input_parallel_AVX(Matrix& unitary, Matrix& input, std::vector<int> involved_qbits, const int& matrix_size){

    __m256d neg = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

    int index_step_inner = 1 << involved_qbits[0];
    int index_step_middle1 = 1 << involved_qbits[1];
    int index_step_middle2 = 1 << involved_qbits[2];
    int index_step_outer = 1 << involved_qbits[3];

    int parallel_outer_cycles = matrix_size/(index_step_outer << 1);
    int parallel_inner_cycles = index_step_middle1/(index_step_inner << 1);
    int parallel_middle1_cycles = index_step_middle2/(index_step_middle1 << 1);
    int parallel_middle2_cycles = index_step_outer/(index_step_middle2 << 1);

    int outer_grain_size = get_grain_size(index_step_outer);
    int inner_grain_size = get_grain_size(index_step_outer);
    int middle1_grain_size = get_grain_size(index_step_middle1);
    int middle2_grain_size = get_grain_size(index_step_middle2);

    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin()*(index_step_outer<<1);
        int current_idx_pair_outer = current_idx + index_step_outer;

        for (int outer_rdx=r.begin(); outer_rdx<r.end(); outer_rdx++){

            tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle2_cycles,middle2_grain_size), [&](tbb::blocked_range<int> r) {

                int current_idx_middle2 = r.begin()*(index_step_middle2<<1);

                for (int middle2_rdx=r.begin(); middle2_rdx<r.end(); middle2_rdx++){

                    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_middle1_cycles,middle1_grain_size), [&](tbb::blocked_range<int> r) {

                        int current_idx_middle1 = r.begin()*(index_step_middle1<<1);

                        for (int middle1_rdx=r.begin(); middle1_rdx<r.end(); middle1_rdx++){

                            tbb::parallel_for( tbb::blocked_range<int>(0,parallel_inner_cycles,inner_grain_size), [&](tbb::blocked_range<int> r) {

                                int current_idx_inner = r.begin()*(index_step_inner<<1);

                                for (int inner_rdx=r.begin(); inner_rdx<r.end(); inner_rdx++){

                                    tbb::parallel_for(tbb::blocked_range<int>(0,index_step_inner,64),[&](tbb::blocked_range<int> r){

                                        for (int idx=r.begin(); idx<r.end(); ++idx){

                                            int current_idx_loc = current_idx + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;
                                            int current_idx_pair_loc = current_idx_pair_outer + idx + current_idx_inner + current_idx_middle1 + current_idx_middle2;

                                            int current_idx_outer_loc = current_idx_loc;
                                            int current_idx_inner_loc = current_idx_loc + index_step_inner;

                                            int current_idx_middle1_loc = current_idx_loc + index_step_middle1;
                                            int current_idx_middle1_inner_loc = current_idx_loc + index_step_middle1 + index_step_inner;

                                            int current_idx_middle2_loc = current_idx_loc + index_step_middle2;
                                            int current_idx_middle2_inner_loc = current_idx_loc + index_step_middle2 + index_step_inner;

                                            int current_idx_middle12_loc = current_idx_loc + index_step_middle1 + index_step_middle2;
                                            int current_idx_middle12_inner_loc = current_idx_loc + index_step_middle1 + index_step_middle2 + index_step_inner;

                                            int current_idx_outer_pair_loc = current_idx_pair_loc;
                                            int current_idx_inner_pair_loc = current_idx_pair_loc + index_step_inner;

                                            int current_idx_middle1_pair_loc = current_idx_pair_loc + index_step_middle1;
                                            int current_idx_middle1_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_inner;

                                            int current_idx_middle2_pair_loc = current_idx_pair_loc + index_step_middle2;
                                            int current_idx_middle2_inner_pair_loc = current_idx_pair_loc + index_step_middle2 + index_step_inner;

                                            int current_idx_middle12_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2;
                                            int current_idx_middle12_inner_pair_loc = current_idx_pair_loc + index_step_middle1 + index_step_middle2 + index_step_inner;

                                            QGD_Complex16 results[16];

                                            double* element_outer = (double*)input.get_data() + 2 * current_idx_outer_loc;
                                            double* element_inner = (double*)input.get_data() + 2 * current_idx_inner_loc;

                                            double* element_middle1 = (double*)input.get_data() + 2 * current_idx_middle1_loc;
                                            double* element_middle1_inner = (double*)input.get_data() + 2 * current_idx_middle1_inner_loc;

                                            double* element_middle2 = (double*)input.get_data() + 2 * current_idx_middle2_loc;
                                            double* element_middle2_inner = (double*)input.get_data() + 2 * current_idx_middle2_inner_loc;

                                            double* element_middle12 = (double*)input.get_data() + 2 * current_idx_middle12_loc;
                                            double* element_middle12_inner = (double*)input.get_data() + 2 * current_idx_middle12_inner_loc;

                                            double* element_outer_pair = (double*)input.get_data() + 2 * current_idx_outer_pair_loc;
                                            double* element_inner_pair = (double*)input.get_data() + 2 * current_idx_inner_pair_loc;

                                            double* element_middle1_pair = (double*)input.get_data() + 2 * current_idx_middle1_pair_loc;
                                            double* element_middle1_inner_pair = (double*)input.get_data() + 2 * current_idx_middle1_inner_pair_loc;

                                            double* element_middle2_pair = (double*)input.get_data() + 2 * current_idx_middle2_pair_loc;
                                            double* element_middle2_inner_pair = (double*)input.get_data() + 2 * current_idx_middle2_inner_pair_loc;

                                            double* element_middle12_pair = (double*)input.get_data() + 2 * current_idx_middle12_pair_loc;
                                            double* element_middle12_inner_pair = (double*)input.get_data() + 2 * current_idx_middle12_inner_pair_loc;

                                            __m256d outer_inner_vec = get_AVX_vector(element_outer, element_inner);
                                            __m256d middle1_inner_vec = get_AVX_vector(element_middle1, element_middle1_inner);
                                            __m256d middle2_inner_vec = get_AVX_vector(element_middle2, element_middle2_inner);
                                            __m256d middle12_inner_vec = get_AVX_vector(element_middle12, element_middle12_inner);
                                            __m256d outer_inner_pair_vec = get_AVX_vector(element_outer_pair, element_inner_pair);
                                            __m256d middle1_inner_pair_vec = get_AVX_vector(element_middle1_pair, element_middle1_inner_pair);
                                            __m256d middle2_inner_pair_vec = get_AVX_vector(element_middle2_pair, element_middle2_inner_pair);
                                            __m256d middle12_inner_pair_vec = get_AVX_vector(element_middle12_pair, element_middle12_inner_pair);

                                            for (int mult_idx=0; mult_idx<16; mult_idx++){

                                                double* unitary_row_1 = (double*)unitary.get_data() + 32*mult_idx;
                                                double* unitary_row_2 = (double*)unitary.get_data() + 32*mult_idx + 4;
                                                double* unitary_row_3 = (double*)unitary.get_data() + 32*mult_idx + 8;
                                                double* unitary_row_4 = (double*)unitary.get_data() + 32*mult_idx + 12;
                                                double* unitary_row_5 = (double*)unitary.get_data() + 32*mult_idx + 16;
                                                double* unitary_row_6 = (double*)unitary.get_data() + 32*mult_idx + 20;
                                                double* unitary_row_7 = (double*)unitary.get_data() + 32*mult_idx + 24;
                                                double* unitary_row_8 = (double*)unitary.get_data() + 32*mult_idx + 28;

                                                __m256d unitary_row_1_vec = _mm256_loadu_pd(unitary_row_1);
                                                __m256d unitary_row_2_vec = _mm256_loadu_pd(unitary_row_2);
                                                __m256d unitary_row_3_vec = _mm256_loadu_pd(unitary_row_3);
                                                __m256d unitary_row_4_vec = _mm256_loadu_pd(unitary_row_4);
                                                __m256d unitary_row_5_vec = _mm256_loadu_pd(unitary_row_5);
                                                __m256d unitary_row_6_vec = _mm256_loadu_pd(unitary_row_6);
                                                __m256d unitary_row_7_vec = _mm256_loadu_pd(unitary_row_7);
                                                __m256d unitary_row_8_vec = _mm256_loadu_pd(unitary_row_8);

                                                __m256d result_upper_vec = complex_mult_AVX(outer_inner_vec,unitary_row_1_vec,neg);
                                                __m256d result_upper_middle1_vec = complex_mult_AVX(middle1_inner_vec,unitary_row_2_vec,neg);
                                                __m256d result_upper_middle2_vec = complex_mult_AVX(middle2_inner_vec,unitary_row_3_vec,neg);
                                                __m256d result_upper_middle12_vec = complex_mult_AVX(middle12_inner_vec,unitary_row_4_vec,neg);
                                                __m256d result_lower_vec = complex_mult_AVX(outer_inner_pair_vec,unitary_row_5_vec,neg);
                                                __m256d result_lower_middle1_vec = complex_mult_AVX(middle1_inner_pair_vec,unitary_row_6_vec,neg);
                                                __m256d result_lower_middle2_vec = complex_mult_AVX(middle2_inner_pair_vec,unitary_row_7_vec,neg);
                                                __m256d result_lower_middle12_vec = complex_mult_AVX(middle12_inner_pair_vec,unitary_row_8_vec,neg);

                                                __m256d result1_vec = _mm256_hadd_pd(result_upper_vec,result_upper_middle1_vec);
                                                __m256d result2_vec = _mm256_hadd_pd(result_upper_middle2_vec,result_upper_middle12_vec);
                                                __m256d result3_vec = _mm256_hadd_pd(result_lower_vec,result_lower_middle1_vec);
                                                __m256d result4_vec = _mm256_hadd_pd(result_lower_middle2_vec,result_lower_middle12_vec);
                                                __m256d result5_vec = _mm256_hadd_pd(result1_vec,result2_vec);
                                                __m256d result6_vec = _mm256_hadd_pd(result3_vec,result4_vec);
                                                __m256d result_vec = _mm256_hadd_pd(result5_vec,result6_vec);
                                                result_vec = _mm256_hadd_pd(result_vec,result_vec);
                                                double* result = (double*)&result_vec;
                                                results[mult_idx].real = result[0];
                                                results[mult_idx].imag = result[2];
                                            }
                                            input[current_idx_outer_loc] = results[0];
                                            input[current_idx_inner_loc] = results[1];
                                            input[current_idx_middle1_loc] = results[2];
                                            input[current_idx_middle1_inner_loc] = results[3];
                                            input[current_idx_middle2_loc] = results[4];
                                            input[current_idx_middle2_inner_loc] = results[5];
                                            input[current_idx_middle12_loc] = results[6];
                                            input[current_idx_middle12_inner_loc] = results[7];
                                            input[current_idx_outer_pair_loc] = results[8];
                                            input[current_idx_inner_pair_loc] = results[9];
                                            input[current_idx_middle1_pair_loc] = results[10];
                                            input[current_idx_middle1_inner_pair_loc] = results[11];
                                            input[current_idx_middle2_pair_loc] = results[12];
                                            input[current_idx_middle2_inner_pair_loc] = results[13];
                                            input[current_idx_middle12_pair_loc] = results[14];
                                            input[current_idx_middle12_inner_pair_loc] = results[15];
                                        }
                                    });

                                    current_idx_inner = current_idx_inner + (index_step_inner << 1);
                                }
                            });
                            current_idx_middle1 = current_idx_middle1 + (index_step_middle1 << 1);
                        }
                    });
                    current_idx_middle2 = current_idx_middle2 + (index_step_middle2 << 1);
                }
            });
            current_idx = current_idx + (index_step_outer << 1);
            current_idx_pair_outer = current_idx_pair_outer + (index_step_outer << 1);

        }
    });
}

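// Applies a controlled rotation to a matrix input: rows where the control qubit is 1 are
// transformed with u3_1qbit1, rows where it is 0 with u3_1qbit2, pairing each row with the partner
// row that differs in the target qubit. Four complex matrix elements are processed per AVX
// iteration; leftover columns are handled by the scalar remainder loops.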
void
apply_crot_kernel_to_matrix_input_AVX_parallel(Matrix& u3_1qbit1, Matrix& u3_1qbit2, Matrix& input, const int& target_qbit, const int& control_qbit, const int& matrix_size) {

    input.ensure_aligned();

    int index_step_target = 1 << target_qbit;

    // load elements of the U3 unitary into 256bit registers (8 registers)
    __m256d u3_1bit_00r_vec = _mm256_broadcast_sd(&u3_1qbit1[0].real);
    __m256d u3_1bit_00i_vec = _mm256_broadcast_sd(&u3_1qbit1[0].imag);
    __m256d u3_1bit_01r_vec = _mm256_broadcast_sd(&u3_1qbit1[1].real);
    __m256d u3_1bit_01i_vec = _mm256_broadcast_sd(&u3_1qbit1[1].imag);
    __m256d u3_1bit_10r_vec = _mm256_broadcast_sd(&u3_1qbit1[2].real);
    __m256d u3_1bit_10i_vec = _mm256_broadcast_sd(&u3_1qbit1[2].imag);
    __m256d u3_1bit_11r_vec = _mm256_broadcast_sd(&u3_1qbit1[3].real);
    __m256d u3_1bit_11i_vec = _mm256_broadcast_sd(&u3_1qbit1[3].imag);

    __m256d u3_1bit2_00r_vec = _mm256_broadcast_sd(&u3_1qbit2[0].real);
    __m256d u3_1bit2_00i_vec = _mm256_broadcast_sd(&u3_1qbit2[0].imag);
    __m256d u3_1bit2_01r_vec = _mm256_broadcast_sd(&u3_1qbit2[1].real);
    __m256d u3_1bit2_01i_vec = _mm256_broadcast_sd(&u3_1qbit2[1].imag);
    __m256d u3_1bit2_10r_vec = _mm256_broadcast_sd(&u3_1qbit2[2].real);
    __m256d u3_1bit2_10i_vec = _mm256_broadcast_sd(&u3_1qbit2[2].imag);
    __m256d u3_1bit2_11r_vec = _mm256_broadcast_sd(&u3_1qbit2[3].real);
    __m256d u3_1bit2_11i_vec = _mm256_broadcast_sd(&u3_1qbit2[3].imag);

    int parallel_outer_cycles = matrix_size/(index_step_target << 1);
    int outer_grain_size;
    if ( index_step_target <= 2 ) {
        outer_grain_size = 32;
    }
    else if ( index_step_target <= 4 ) {
        outer_grain_size = 16;
    }
    else if ( index_step_target <= 8 ) {
        outer_grain_size = 8;
    }
    else if ( index_step_target <= 16 ) {
        outer_grain_size = 4;
    }
    else {
        outer_grain_size = 2;
    }

    tbb::parallel_for( tbb::blocked_range<int>(0,parallel_outer_cycles,outer_grain_size), [&](tbb::blocked_range<int> r) {

        int current_idx = r.begin()*(index_step_target << 1);
        int current_idx_pair = index_step_target + r.begin()*(index_step_target << 1);

        for (int rdx=r.begin(); rdx<r.end(); rdx++) {

            tbb::parallel_for( tbb::blocked_range<int>(0,index_step_target,32), [&](tbb::blocked_range<int> r) {
                for (int idx=r.begin(); idx<r.end(); ++idx) {

                    int current_idx_loc = current_idx + idx;
                    int current_idx_pair_loc = current_idx_pair + idx;

                    int row_offset = current_idx_loc * input.stride;
                    int row_offset_pair = current_idx_pair_loc * input.stride;

                    if ((current_idx_loc >> control_qbit) & 1) {

                        double* element = (double*)input.get_data() + 2 * row_offset;
                        double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

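                        // Vectorized column sweep: each iteration loads four complex matrix
                        // elements from this row and from its target-qubit pair row,
                        // de-interleaves them into real and imaginary vectors, applies the 2x2
                        // complex rotation with FMA instructions, re-interleaves and stores the
                        // result. The bound 2*(input.cols - 3) leaves at most three trailing
                        // columns for the scalar remainder loop below.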
                        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                            // extract successive elements from arrays element, element_pair
                            __m256d element_vec = _mm256_load_pd(element + col_idx);
                            __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                            __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                            element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                            element_vec = tmp;

                            __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                            __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                            tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                            element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                            element_pair_vec = tmp;

                            __m256d vec3 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec);
                            vec3 = _mm256_fnmadd_pd(u3_1bit_00i_vec, element_vec2, vec3);
                            __m256d vec4 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec);
                            vec4 = _mm256_fnmadd_pd(u3_1bit_01i_vec, element_pair_vec2, vec4);
                            vec3 = _mm256_add_pd(vec3, vec4);
                            __m256d vec5 = _mm256_mul_pd(u3_1bit_00r_vec, element_vec2);
                            vec5 = _mm256_fmadd_pd(u3_1bit_00i_vec, element_vec, vec5);
                            __m256d vec6 = _mm256_mul_pd(u3_1bit_01r_vec, element_pair_vec2);
                            vec6 = _mm256_fmadd_pd(u3_1bit_01i_vec, element_pair_vec, vec6);
                            vec5 = _mm256_add_pd(vec5, vec6);

                            // store the transformed elements in vec3
                            tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                            vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                            vec3 = tmp;
                            _mm256_store_pd(element + col_idx, vec3);
                            _mm256_store_pd(element + col_idx + 4, vec5);

                            __m256d vec7 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec);
                            vec7 = _mm256_fnmadd_pd(u3_1bit_10i_vec, element_vec2, vec7);
                            __m256d vec8 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec);
                            vec8 = _mm256_fnmadd_pd(u3_1bit_11i_vec, element_pair_vec2, vec8);
                            vec7 = _mm256_add_pd(vec7, vec8);
                            __m256d vec9 = _mm256_mul_pd(u3_1bit_10r_vec, element_vec2);
                            vec9 = _mm256_fmadd_pd(u3_1bit_10i_vec, element_vec, vec9);
                            __m256d vec10 = _mm256_mul_pd(u3_1bit_11r_vec, element_pair_vec2);
                            vec10 = _mm256_fmadd_pd(u3_1bit_11i_vec, element_pair_vec, vec10);
                            vec9 = _mm256_add_pd(vec9, vec10);

                            // store the transformed elements in vec7
                            tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                            vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                            vec7 = tmp;
                            _mm256_store_pd(element_pair + col_idx, vec7);
                            _mm256_store_pd(element_pair + col_idx + 4, vec9);
                        }

                        int remainder = input.cols % 4;
                        if (remainder != 0) {

                            for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
                                int index = row_offset + col_idx;
                                int index_pair = row_offset_pair + col_idx;

                                QGD_Complex16 element = input[index];
                                QGD_Complex16 element_pair = input[index_pair];

                                QGD_Complex16 tmp1 = mult(u3_1qbit1[0], element);
                                QGD_Complex16 tmp2 = mult(u3_1qbit1[1], element_pair);

                                input[index].real = tmp1.real + tmp2.real;
                                input[index].imag = tmp1.imag + tmp2.imag;

                                tmp1 = mult(u3_1qbit1[2], element);
                                tmp2 = mult(u3_1qbit1[3], element_pair);

                                input[index_pair].real = tmp1.real + tmp2.real;
                                input[index_pair].imag = tmp1.imag + tmp2.imag;
                            }

                        }

                    }

                    else {

                        double* element = (double*)input.get_data() + 2 * row_offset;
                        double* element_pair = (double*)input.get_data() + 2 * row_offset_pair;

                        for (int col_idx = 0; col_idx < 2 * (input.cols - 3); col_idx = col_idx + 8) {

                            // extract successive elements from arrays element, element_pair
                            __m256d element_vec = _mm256_load_pd(element + col_idx);
                            __m256d element_vec2 = _mm256_load_pd(element + col_idx + 4);
                            __m256d tmp = _mm256_shuffle_pd(element_vec, element_vec2, 0);
                            element_vec2 = _mm256_shuffle_pd(element_vec, element_vec2, 0xf);
                            element_vec = tmp;

                            __m256d element_pair_vec = _mm256_load_pd(element_pair + col_idx);
                            __m256d element_pair_vec2 = _mm256_load_pd(element_pair + col_idx + 4);
                            tmp = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0);
                            element_pair_vec2 = _mm256_shuffle_pd(element_pair_vec, element_pair_vec2, 0xf);
                            element_pair_vec = tmp;

                            __m256d vec3 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec);
                            vec3 = _mm256_fnmadd_pd(u3_1bit2_00i_vec, element_vec2, vec3);
                            __m256d vec4 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec);
                            vec4 = _mm256_fnmadd_pd(u3_1bit2_01i_vec, element_pair_vec2, vec4);
                            vec3 = _mm256_add_pd(vec3, vec4);
                            __m256d vec5 = _mm256_mul_pd(u3_1bit2_00r_vec, element_vec2);
                            vec5 = _mm256_fmadd_pd(u3_1bit2_00i_vec, element_vec, vec5);
                            __m256d vec6 = _mm256_mul_pd(u3_1bit2_01r_vec, element_pair_vec2);
                            vec6 = _mm256_fmadd_pd(u3_1bit2_01i_vec, element_pair_vec, vec6);
                            vec5 = _mm256_add_pd(vec5, vec6);

                            // store the transformed elements in vec3
                            tmp = _mm256_shuffle_pd(vec3, vec5, 0);
                            vec5 = _mm256_shuffle_pd(vec3, vec5, 0xf);
                            vec3 = tmp;
                            _mm256_store_pd(element + col_idx, vec3);
                            _mm256_store_pd(element + col_idx + 4, vec5);

                            __m256d vec7 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec);
                            vec7 = _mm256_fnmadd_pd(u3_1bit2_10i_vec, element_vec2, vec7);
                            __m256d vec8 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec);
                            vec8 = _mm256_fnmadd_pd(u3_1bit2_11i_vec, element_pair_vec2, vec8);
                            vec7 = _mm256_add_pd(vec7, vec8);
                            __m256d vec9 = _mm256_mul_pd(u3_1bit2_10r_vec, element_vec2);
                            vec9 = _mm256_fmadd_pd(u3_1bit2_10i_vec, element_vec, vec9);
                            __m256d vec10 = _mm256_mul_pd(u3_1bit2_11r_vec, element_pair_vec2);
                            vec10 = _mm256_fmadd_pd(u3_1bit2_11i_vec, element_pair_vec, vec10);
                            vec9 = _mm256_add_pd(vec9, vec10);

                            // store the transformed elements in vec7
                            tmp = _mm256_shuffle_pd(vec7, vec9, 0);
                            vec9 = _mm256_shuffle_pd(vec7, vec9, 0xf);
                            vec7 = tmp;
                            _mm256_store_pd(element_pair + col_idx, vec7);
                            _mm256_store_pd(element_pair + col_idx + 4, vec9);
                        }

                        int remainder = input.cols % 4;
                        if (remainder != 0) {

                            for (int col_idx = input.cols-remainder; col_idx < input.cols; col_idx++) {
                                int index = row_offset + col_idx;
                                int index_pair = row_offset_pair + col_idx;

                                QGD_Complex16 element = input[index];
                                QGD_Complex16 element_pair = input[index_pair];

                                QGD_Complex16 tmp1 = mult(u3_1qbit2[0], element);
                                QGD_Complex16 tmp2 = mult(u3_1qbit2[1], element_pair);

                                input[index].real = tmp1.real + tmp2.real;
                                input[index].imag = tmp1.imag + tmp2.imag;

                                tmp1 = mult(u3_1qbit2[2], element);
                                tmp2 = mult(u3_1qbit2[3], element_pair);

                                input[index_pair].real = tmp1.real + tmp2.real;
                                input[index_pair].imag = tmp1.imag + tmp2.imag;
                            }

                        }
                    }

                    //std::cout << current_idx_target << " " << current_idx_target_pair << std::endl;

                }
            });

            current_idx = current_idx + (index_step_target << 1);
            current_idx_pair = current_idx_pair + (index_step_target << 1);

        }
    });

}