Expression Templates Library (ETL)
conv_4d_select.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
13 #pragma once
14 
15 namespace etl::detail {
16 
/*!
 * \brief Pick the default implementation for the 4D valid convolution of I and K in C.
 *
 * Decision order visible in this listing:
 *  1. Any column-major operand (input, kernel or output) selects STD.
 *  2. A CUDNN availability check gated by !no_gpu (branch body not visible here).
 *  3. A heuristic for small square kernels (k1 == k2 and k1 <= 5).
 *  4. VEC when impl::vec::conv2_possible holds, otherwise a cblas_enabled check.
 *  5. STD as the last resort.
 *
 * NOTE(review): several listing lines (42, 51, 53, 61) are absent from this
 * extract; the statements inside those branches cannot be documented from here.
 *
 * \tparam I The type whose storage order is read as the input order
 * \tparam K The type whose storage order is read as the kernel order
 * \tparam C The type whose storage order is read as the output order
 * \param no_gpu When true, the CUDNN branch is skipped
 * \param i1 Compared against i2 and 100 by the heuristic (presumably the first input dimension -- TODO confirm)
 * \param i2 Compared against i1 by the heuristic (presumably the second input dimension -- TODO confirm)
 * \param k1 Compared against k2 and 5 by the heuristic (presumably the first kernel dimension -- TODO confirm)
 * \param k2 Compared against k1 by the heuristic (presumably the second kernel dimension -- TODO confirm)
 * \return The selected etl::conv4_impl value
 */
27 template <typename I, typename K, typename C>
28 constexpr etl::conv4_impl select_default_conv4_valid_impl(bool no_gpu, size_t i1, size_t i2, size_t k1, size_t k2) {
29  //Note: since the constexpr values will be known at compile time, the
30  //conditions will be a lot simplified
31 
32  constexpr order input_order = decay_traits<I>::storage_order;
33  constexpr order kernel_order = decay_traits<K>::storage_order;
34  constexpr order output_order = decay_traits<C>::storage_order;
35 
36  //Only the standard implementation is able to handle column major
37  if (input_order == order::ColumnMajor || kernel_order == order::ColumnMajor || output_order == order::ColumnMajor) {
38  return etl::conv4_impl::STD;
39  }
40 
  // CUDNN is only considered when it is possible for these types and the GPU is not disabled
  // NOTE(review): listing line 42 is missing from this extract; the branch body is not visible
41  if (impl::cudnn::conv_possible<I, K, C> && !no_gpu) {
43  }
44 
45  // Small kernels
46  if (k1 == k2 && k1 <= 5) {
  // Square inputs larger than 100 go to VEC when it is possible
47  if (impl::vec::conv2_possible<vector_mode, I, K, C> && i1 == i2 && i1 > 100) {
48  return etl::conv4_impl::VEC;
49  } else {
  // NOTE(review): listing lines 51 and 53 are missing; what each branch selects is not visible
50  if (cblas_enabled) {
52  } else if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
54  }
55  }
56  }
57 
  // General case: VEC first, then the BLAS-enabled check
58  if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
59  return etl::conv4_impl::VEC;
  // NOTE(review): listing line 61 is missing; the body of the cblas_enabled branch is not visible
60  } else if (cblas_enabled) {
62  }
63 
  // Last resort
64  return etl::conv4_impl::STD;
65 }
66 
/*!
 * \brief Pick the default implementation for the 4D valid_filter convolution of I and K in C.
 *
 * Decision order visible in this listing:
 *  1. Any column-major operand (input, kernel or output) selects STD.
 *  2. A heuristic for small square kernels (k1 == k2 and k1 <= 5).
 *  3. VEC when impl::vec::conv2_possible holds, otherwise a cblas_enabled check.
 *  4. STD as the last resort.
 *
 * Unlike select_default_conv4_valid_impl, this function has no no_gpu
 * parameter and performs no CUDNN check.
 *
 * NOTE(review): listing lines 97, 99 and 107 are absent from this extract;
 * the statements inside those branches cannot be documented from here.
 *
 * \tparam I The type whose storage order is read as the input order
 * \tparam K The type whose storage order is read as the kernel order
 * \tparam C The type whose storage order is read as the output order
 * \param i1 Compared against i2 and 100 by the heuristic (presumably the first input dimension -- TODO confirm)
 * \param i2 Compared against i1 by the heuristic (presumably the second input dimension -- TODO confirm)
 * \param k1 Compared against k2 and 5 by the heuristic (presumably the first kernel dimension -- TODO confirm)
 * \param k2 Compared against k1 by the heuristic (presumably the second kernel dimension -- TODO confirm)
 * \return The selected etl::conv4_impl value
 */
77 template <typename I, typename K, typename C>
78 constexpr etl::conv4_impl select_default_conv4_valid_filter_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
79  //Note: since the constexpr values will be known at compile time, the
80  //conditions will be a lot simplified
81 
82  constexpr order input_order = decay_traits<I>::storage_order;
83  constexpr order kernel_order = decay_traits<K>::storage_order;
84  constexpr order output_order = decay_traits<C>::storage_order;
85 
86  //Only the standard implementation is able to handle column major
87  if (input_order == order::ColumnMajor || kernel_order == order::ColumnMajor || output_order == order::ColumnMajor) {
88  return etl::conv4_impl::STD;
89  }
90 
91  // Small kernels
92  if (k1 == k2 && k1 <= 5) {
  // Square inputs larger than 100 go to VEC when it is possible
93  if (impl::vec::conv2_possible<vector_mode, I, K, C> && i1 == i2 && i1 > 100) {
94  return etl::conv4_impl::VEC;
95  } else {
  // NOTE(review): listing lines 97 and 99 are missing; what each branch selects is not visible
96  if (cblas_enabled) {
98  } else if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
100  }
101  }
102  }
103 
  // General case: VEC first, then the BLAS-enabled check
104  if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
105  return etl::conv4_impl::VEC;
  // NOTE(review): listing line 107 is missing; the body of the cblas_enabled branch is not visible
106  } else if (cblas_enabled) {
108  }
109 
  // Last resort
110  return etl::conv4_impl::STD;
111 }
112 
/*!
 * \brief Pick the default implementation for the 4D valid_back convolution of I and K in C.
 *
 * Decision order visible in this listing:
 *  1. Any column-major operand (input, kernel or output) selects STD.
 *  2. A heuristic for small square kernels (k1 == k2 and k1 <= 5).
 *  3. VEC when impl::vec::conv2_possible holds, otherwise a cblas_enabled check.
 *  4. STD as the last resort.
 *
 * NOTE(review): listing lines 145, 147 and 155 are absent from this extract;
 * the statements inside those branches cannot be documented from here.
 *
 * \tparam I The type whose storage order is read as the input order
 * \tparam K The type whose storage order is read as the kernel order
 * \tparam C The type whose storage order is read as the output order
 * \param i1 Compared against i2 and 100 by the heuristic (presumably the first input dimension -- TODO confirm)
 * \param i2 Compared against i1 by the heuristic (presumably the second input dimension -- TODO confirm)
 * \param k1 Compared against k2 and 5 by the heuristic (presumably the first kernel dimension -- TODO confirm)
 * \param k2 Compared against k1 by the heuristic (presumably the second kernel dimension -- TODO confirm)
 * \return The selected etl::conv4_impl value
 */
123 template <typename I, typename K, typename C>
124 constexpr etl::conv4_impl select_default_conv4_valid_back_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
125  //Note: since the constexpr values will be known at compile time, the
126  //conditions will be a lot simplified
127 
128  constexpr order input_order = decay_traits<I>::storage_order;
129  constexpr order kernel_order = decay_traits<K>::storage_order;
130  constexpr order output_order = decay_traits<C>::storage_order;
131 
132  //Only the standard implementation is able to handle column major
133  if (input_order == order::ColumnMajor || kernel_order == order::ColumnMajor || output_order == order::ColumnMajor) {
134  return etl::conv4_impl::STD;
135  }
136 
137  // Small kernels
138  if (k1 == k2 && k1 <= 5) {
  // Here the input-size test is the outer condition: when it holds but VEC
  // is not possible, nothing is returned in this branch and control continues
  // with the general checks after the small-kernel block.
139  if (i1 == i2 && i1 > 100) {
140  if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
141  return etl::conv4_impl::VEC;
142  }
143  } else {
  // Only reached when the input-size test above fails
  // NOTE(review): listing lines 145 and 147 are missing; what each branch selects is not visible
144  if (cblas_enabled) {
146  } else if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
148  }
149  }
150  }
151 
  // General case: VEC first, then the BLAS-enabled check
152  if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
153  return etl::conv4_impl::VEC;
  // NOTE(review): listing line 155 is missing; the body of the cblas_enabled branch is not visible
154  } else if (cblas_enabled) {
156  }
157 
  // Last resort
158  return etl::conv4_impl::STD;
159 }
160 
/*!
 * \brief Pick the default implementation for the 4D full convolution of I and K in C.
 *
 * Decision order visible in this listing:
 *  1. Any column-major operand (input, kernel or output) selects STD.
 *  2. CUDNN when impl::cudnn::conv_possible holds and no_gpu is false.
 *  3. A CUFFT availability check gated by !no_gpu (branch body not visible here).
 *  4. When impl::blas::conv2_possible holds: VEC for square kernels of size 3 or 5
 *     if VEC is possible; the remainder of that branch is not visible here.
 *  5. VEC when impl::vec::conv2_possible holds.
 *  6. A final fallback whose statement is not visible here.
 *
 * NOTE(review): listing lines 192, 202 and 211 are absent from this extract.
 *
 * \tparam I The type whose storage order is read as the input order
 * \tparam K The type whose storage order is read as the kernel order
 * \tparam C The type whose storage order is read as the output order
 * \param no_gpu When true, the CUDNN and CUFFT branches are skipped
 * \param k1 Compared against k2 (presumably the first kernel dimension -- TODO confirm)
 * \param k2 Compared against k1, 3 and 5 (presumably the second kernel dimension -- TODO confirm)
 * \return The selected etl::conv4_impl value
 */
171 template <typename I, typename K, typename C>
172 constexpr etl::conv4_impl select_default_conv4_full_impl(bool no_gpu, size_t k1, size_t k2) {
173  //Note: since the constexpr values will be known at compile time, the
174  //conditions will be a lot simplified
175 
176  constexpr order input_order = decay_traits<I>::storage_order;
177  constexpr order kernel_order = decay_traits<K>::storage_order;
178  constexpr order output_order = decay_traits<C>::storage_order;
179 
180  //Only the standard implementation is able to handle column major
181  if (input_order == order::ColumnMajor || kernel_order == order::ColumnMajor || output_order == order::ColumnMajor) {
182  return etl::conv4_impl::STD;
183  }
184 
185  // CUDNN is always faster than the others
186  if (impl::cudnn::conv_possible<I, K, C> && !no_gpu) {
187  return etl::conv4_impl::CUDNN;
188  }
189 
190  // CUFFT is generally faster than the other, but anyway in GPU mode, CUDNN should be available
  // NOTE(review): listing line 192 is missing from this extract; the branch body is not visible
191  if (impl::cufft::conv2_possible<I, K, C> && !no_gpu) {
193  }
194 
195  // MKL is generally faster than VEC
196  // This could be improved for small batch size where VEC is interesting
197  if (impl::blas::conv2_possible<I, K, C>) {
  // Exception to the rule above: square kernels of size 3 or 5 go to VEC when it is possible
198  if (impl::vec::conv2_possible<vector_mode, I, K, C> && k1 == k2 && (k2 == 3 || k2 == 5)) {
199  return etl::conv4_impl::VEC;
200  }
201 
  // NOTE(review): listing line 202 is missing; the statement closing this branch is not visible
203  }
204 
205  // If possible, use vectorized implementations
206  if (impl::vec::conv2_possible<vector_mode, I, K, C>) {
207  return etl::conv4_impl::VEC;
208  }
209 
210  // If nothing else is available
  // NOTE(review): listing line 211 is missing; the final fallback statement is not visible
212 }
213 
214 #ifdef ETL_MANUAL_SELECT
215 
/*!
 * \brief Select the implementation of the 4D valid convolution of I and K in C,
 * honouring a forced selection from the thread-local context.
 *
 * This variant is compiled when ETL_MANUAL_SELECT is defined. If
 * local_context().conv4_selector.forced is set, the forced implementation is
 * returned, except when its availability check fails: a message is then written
 * to std::cerr and the default selection is used instead. Without a forced
 * selection the default selection is used directly. In both cases
 * local_context().cpu is passed as the no_gpu argument of the default selector.
 *
 * NOTE(review): the case labels of the switch (listing lines 230-231, 240, 249)
 * are absent from this extract; only the existing comments and the checks hint
 * at which implementation each section guards.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded to the default selector
 * \param i2 Forwarded to the default selector
 * \param k1 Forwarded to the default selector
 * \param k2 Forwarded to the default selector
 * \return The etl::conv4_impl value to use
 */
223 template <typename I, typename K, typename C>
224 inline etl::conv4_impl select_conv4_valid_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
225  if (local_context().conv4_selector.forced) {
226  auto forced = local_context().conv4_selector.impl;
227 
228  switch (forced) {
229  //VEC cannot always be used
  // NOTE(review): case label(s) at listing lines 230-231 are missing from this extract
232  if (!impl::vec::conv2_possible<vector_mode, I, K, C>) { // COVERAGE_EXCLUDE_LINE
233  std::cerr << "Forced selection to VEC conv4 implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
234  return select_default_conv4_valid_impl<I, K, C>(local_context().cpu, i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
235  } // COVERAGE_EXCLUDE_LINE
236 
237  return forced;
238 
239  //BLAS cannot always be used
  // NOTE(review): case label at listing line 240 is missing from this extract
241  if (!cblas_enabled) { // COVERAGE_EXCLUDE_LINE
242  std::cerr << "Forced selection to BLAS conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
243  return select_default_conv4_valid_impl<I, K, C>(local_context().cpu, i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
244  } // COVERAGE_EXCLUDE_LINE
245 
246  return forced;
247 
248  //CUDNN cannot always be used
  // NOTE(review): case label at listing line 249 is missing from this extract
  // CUDNN is rejected both when it is not possible and when CPU evaluation is forced
250  if (!impl::cudnn::conv_possible<I, K, C> || local_context().cpu) { // COVERAGE_EXCLUDE_LINE
251  std::cerr << "Forced selection to CUDNN conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
252  return select_default_conv4_valid_impl<I, K, C>(local_context().cpu, i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
253  } // COVERAGE_EXCLUDE_LINE
254 
255  return forced;
256 
  // Any other forced value is returned without an availability check
257  default:
258  return forced;
259  }
260  }
261 
  // No forced selection: use the default heuristics
262  return select_default_conv4_valid_impl<I, K, C>(local_context().cpu, i1, i2, k1, k2);
263 }
264 
/*!
 * \brief Select the implementation of the 4D valid_filter convolution of I and K in C,
 * honouring a forced selection from the thread-local context.
 *
 * This variant is compiled when ETL_MANUAL_SELECT is defined. If
 * local_context().conv4_selector.forced is set, the forced implementation is
 * returned, except when its availability check fails: a message is then written
 * to std::cerr and the default selection is used instead. Without a forced
 * selection the default selection is used directly.
 *
 * NOTE(review): the case labels of the switch (listing lines 279-280, 290)
 * are absent from this extract.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded to the default selector
 * \param i2 Forwarded to the default selector
 * \param k1 Forwarded to the default selector
 * \param k2 Forwarded to the default selector
 * \return The etl::conv4_impl value to use
 */
272 template <typename I, typename K, typename C>
273 inline etl::conv4_impl select_conv4_valid_filter_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
274  if (local_context().conv4_selector.forced) {
275  auto forced = local_context().conv4_selector.impl;
276 
277  switch (forced) {
278  //VEC cannot always be used
  // NOTE(review): case label(s) at listing lines 279-280 are missing from this extract
281  if (!impl::vec::conv2_possible<vector_mode, I, K, C>) { // COVERAGE_EXCLUDE_LINE
282  std::cerr << "Forced selection to VEC conv4_valid_filter implementation, but not possible for this expression"
283  << std::endl; // COVERAGE_EXCLUDE_LINE
284  return select_default_conv4_valid_filter_impl<I, K, C>(i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
285  } // COVERAGE_EXCLUDE_LINE
286 
287  return forced;
288 
289  //BLAS cannot always be used
  // NOTE(review): case label at listing line 290 is missing from this extract
291  if (!cblas_enabled) { // COVERAGE_EXCLUDE_LINE
292  std::cerr << "Forced selection to BLAS conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
293  return select_default_conv4_valid_filter_impl<I, K, C>(i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
294  } // COVERAGE_EXCLUDE_LINE
295 
296  return forced;
297 
  // Any other forced value is returned without an availability check
298  default:
299  return forced;
300  }
301  }
302 
  // No forced selection: use the default heuristics
303  return select_default_conv4_valid_filter_impl<I, K, C>(i1, i2, k1, k2);
304 }
305 
/*!
 * \brief Select the implementation of the 4D valid_back convolution of I and K in C,
 * honouring a forced selection from the thread-local context.
 *
 * This variant is compiled when ETL_MANUAL_SELECT is defined. If
 * local_context().conv4_selector.forced is set, the forced implementation is
 * returned, except when its availability check fails: a message is then written
 * to std::cerr and the default selection is used instead. Without a forced
 * selection the default selection is used directly.
 *
 * NOTE(review): the case labels of the switch (listing lines 320-321, 331)
 * are absent from this extract.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded to the default selector
 * \param i2 Forwarded to the default selector
 * \param k1 Forwarded to the default selector
 * \param k2 Forwarded to the default selector
 * \return The etl::conv4_impl value to use
 */
313 template <typename I, typename K, typename C>
314 inline etl::conv4_impl select_conv4_valid_back_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
315  if (local_context().conv4_selector.forced) {
316  auto forced = local_context().conv4_selector.impl;
317 
318  switch (forced) {
319  //VEC cannot always be used
  // NOTE(review): case label(s) at listing lines 320-321 are missing from this extract
322  if (!impl::vec::conv2_possible<vector_mode, I, K, C>) { // COVERAGE_EXCLUDE_LINE
323  std::cerr << "Forced selection to VEC conv4_valid_back implementation, but not possible for this expression"
324  << std::endl; // COVERAGE_EXCLUDE_LINE
325  return select_default_conv4_valid_back_impl<I, K, C>(i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
326  } // COVERAGE_EXCLUDE_LINE
327 
328  return forced;
329 
330  //BLAS cannot always be used
  // NOTE(review): case label at listing line 331 is missing from this extract
332  if (!cblas_enabled) { // COVERAGE_EXCLUDE_LINE
333  std::cerr << "Forced selection to BLAS conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
334  return select_default_conv4_valid_back_impl<I, K, C>(i1, i2, k1, k2); // COVERAGE_EXCLUDE_LINE
335  } // COVERAGE_EXCLUDE_LINE
336 
337  return forced;
338 
  // Any other forced value is returned without an availability check
339  default:
340  return forced;
341  }
342  }
343 
  // No forced selection: use the default heuristics
344  return select_default_conv4_valid_back_impl<I, K, C>(i1, i2, k1, k2);
345 }
346 
/*!
 * \brief Select the implementation of the 4D full convolution of I and K in C,
 * honouring a forced selection from the thread-local context.
 *
 * This variant is compiled when ETL_MANUAL_SELECT is defined. If
 * local_context().conv4_selector.forced is set, the forced implementation is
 * returned, except when its availability check fails: a message is then written
 * to std::cerr and the default selection is used instead. Without a forced
 * selection the default selection is used directly. In both cases
 * local_context().cpu is passed as the no_gpu argument of the default selector.
 *
 * NOTE(review): the case labels of the switch (listing lines 361, 371, 380, 390)
 * are absent from this extract.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param k1 Forwarded to the default selector
 * \param k2 Forwarded to the default selector
 * \return The etl::conv4_impl value to use
 */
354 template <typename I, typename K, typename C>
355 inline etl::conv4_impl select_conv4_full_impl(size_t k1, size_t k2) {
356  if (local_context().conv4_selector.forced) {
357  auto forced = local_context().conv4_selector.impl;
358 
359  switch (forced) {
360  //VEC cannot always be used
  // NOTE(review): case label at listing line 361 is missing from this extract
362  if (!impl::vec::conv2_possible<vector_mode, I, K, C>) { // COVERAGE_EXCLUDE_LINE
363  std::cerr << "Forced selection to VEC conv4_full implementation, but not possible for this expression"
364  << std::endl; // COVERAGE_EXCLUDE_LINE
365  return select_default_conv4_full_impl<I, K, C>(local_context().cpu, k1, k2); // COVERAGE_EXCLUDE_LINE
366  } // COVERAGE_EXCLUDE_LINE
367 
368  return forced;
369 
370  //CUDNN cannot always be used
  // NOTE(review): case label at listing line 371 is missing from this extract
  // Rejected both when CUDNN is not possible and when CPU evaluation is forced
372  if (!impl::cudnn::conv_possible<I, K, C> || local_context().cpu) { // COVERAGE_EXCLUDE_LINE
373  std::cerr << "Forced selection to CUDNN conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
374  return select_default_conv4_full_impl<I, K, C>(local_context().cpu, k1, k2); // COVERAGE_EXCLUDE_LINE
375  } // COVERAGE_EXCLUDE_LINE
376 
377  return forced;
378 
379  //CUFFT cannot always be used
  // NOTE(review): case label at listing line 380 is missing from this extract
  // Rejected both when CUFFT is not possible and when CPU evaluation is forced
381  if (!impl::cufft::conv2_possible<I, K, C> || local_context().cpu) { // COVERAGE_EXCLUDE_LINE
382  std::cerr << "Forced selection to FFT_CUFFT conv implementation, but not possible for this expression"
383  << std::endl; // COVERAGE_EXCLUDE_LINE
384  return select_default_conv4_full_impl<I, K, C>(local_context().cpu, k1, k2); // COVERAGE_EXCLUDE_LINE
385  } // COVERAGE_EXCLUDE_LINE
386 
387  return forced;
388 
389  //MKL cannot always be used
  // NOTE(review): case label at listing line 390 is missing from this extract
391  if (!impl::blas::conv2_possible<I, K, C>) { // COVERAGE_EXCLUDE_LINE
392  std::cerr << "Forced selection to FFT_MKL conv implementation, but not possible for this expression" << std::endl; // COVERAGE_EXCLUDE_LINE
393  return select_default_conv4_full_impl<I, K, C>(local_context().cpu, k1, k2); // COVERAGE_EXCLUDE_LINE
394  } // COVERAGE_EXCLUDE_LINE
395 
396  return forced;
397 
  // Any other forced value is returned without an availability check
398  default:
399  return forced;
400  }
401  }
402 
  // No forced selection: use the default heuristics
403  return select_default_conv4_full_impl<I, K, C>(local_context().cpu, k1, k2);
404 }
405 
406 #else
407 
/*!
 * \brief Select the implementation of the 4D valid convolution of I and K in C.
 *
 * Variant compiled when ETL_MANUAL_SELECT is not defined: it forwards to
 * select_default_conv4_valid_impl with no_gpu set to false.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded unchanged to the default selector
 * \param i2 Forwarded unchanged to the default selector
 * \param k1 Forwarded unchanged to the default selector
 * \param k2 Forwarded unchanged to the default selector
 * \return The etl::conv4_impl value chosen by the default selector
 */
418 template <typename I, typename K, typename C>
419 constexpr etl::conv4_impl select_conv4_valid_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
420  return select_default_conv4_valid_impl<I, K, C>(false, i1, i2, k1, k2);
421 }
422 
/*!
 * \brief Select the implementation of the 4D valid_filter convolution of I and K in C.
 *
 * Variant compiled when ETL_MANUAL_SELECT is not defined: it forwards all
 * arguments to select_default_conv4_valid_filter_impl.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded unchanged to the default selector
 * \param i2 Forwarded unchanged to the default selector
 * \param k1 Forwarded unchanged to the default selector
 * \param k2 Forwarded unchanged to the default selector
 * \return The etl::conv4_impl value chosen by the default selector
 */
433 template <typename I, typename K, typename C>
434 constexpr etl::conv4_impl select_conv4_valid_filter_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
435  return select_default_conv4_valid_filter_impl<I, K, C>(i1, i2, k1, k2);
436 }
437 
/*!
 * \brief Select the implementation of the 4D valid_back convolution of I and K in C.
 *
 * Variant compiled when ETL_MANUAL_SELECT is not defined: it forwards all
 * arguments to select_default_conv4_valid_back_impl.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param i1 Forwarded unchanged to the default selector
 * \param i2 Forwarded unchanged to the default selector
 * \param k1 Forwarded unchanged to the default selector
 * \param k2 Forwarded unchanged to the default selector
 * \return The etl::conv4_impl value chosen by the default selector
 */
448 template <typename I, typename K, typename C>
449 constexpr etl::conv4_impl select_conv4_valid_back_impl(size_t i1, size_t i2, size_t k1, size_t k2) {
450  return select_default_conv4_valid_back_impl<I, K, C>(i1, i2, k1, k2);
451 }
452 
/*!
 * \brief Select the implementation of the 4D full convolution of I and K in C.
 *
 * Variant compiled when ETL_MANUAL_SELECT is not defined: it forwards to
 * select_default_conv4_full_impl with no_gpu set to false.
 *
 * \tparam I The input type
 * \tparam K The kernel type
 * \tparam C The output type
 * \param k1 Forwarded unchanged to the default selector
 * \param k2 Forwarded unchanged to the default selector
 * \return The etl::conv4_impl value chosen by the default selector
 */
463 template <typename I, typename K, typename C>
464 constexpr etl::conv4_impl select_conv4_full_impl(size_t k1, size_t k2) {
465  return select_default_conv4_full_impl<I, K, C>(false, k1, k2);
466 }
467 
468 #endif
469 
470 } //end of namespace etl::detail
FFT reduction (with MKL impl)
Standard implementation.
order
Storage order of a matrix.
Definition: order.hpp:15
constexpr etl::conv4_impl select_default_conv4_full_impl(bool no_gpu, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:172
constexpr etl::conv4_impl select_conv4_valid_impl(size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:419
VEC implementation.
Definition: expression_builder.hpp:699
constexpr etl::conv4_impl select_conv4_valid_back_impl(size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:449
Traits to get information about ETL types.
Definition: tmp.hpp:68
constexpr etl::conv4_impl select_conv4_valid_filter_impl(size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:434
context & local_context()
Return the configuration context of the current thread.
Definition: context.hpp:50
FFT reduction (with STD impl)
bool cpu
Force CPU evaluation.
Definition: context.hpp:29
GPU implementation.
Column-Major storage.
constexpr etl::conv4_impl select_conv4_full_impl(size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:464
constexpr bool cblas_enabled
Indicates if a BLAS library is available for ETL.
Definition: config.hpp:76
constexpr etl::conv4_impl select_default_conv4_valid_filter_impl(size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:78
constexpr etl::conv4_impl select_default_conv4_valid_impl(bool no_gpu, size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:28
constexpr etl::conv4_impl select_default_conv4_valid_back_impl(size_t i1, size_t i2, size_t k1, size_t k2)
Select the implementation of the 4D conv of I and K in C.
Definition: conv_4d_select.hpp:124
conv4_impl
Enumeration describing the different convolution implementations.
Definition: conv_impl.hpp:33
FFT reduction (with CUFFT impl)