wichtounet/etl/cudnn_2conv_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 #pragma once

 #define ETL_EXPERIMENTAL_TENSOR_CORES

 #ifdef ETL_CUDNN_MODE

 #include "etl/impl/cublas/cuda.hpp"
 #include "etl/impl/cudnn/cudnn.hpp"

 #endif

 namespace etl::impl::cudnn {

 template <typename I, typename K, typename C>
 constexpr bool conv_possible = cudnn_enabled&& all_homogeneous<I, K, C>&& all_row_major<I, K, C>&& all_dma<I, K, C>;

 template <typename I, typename K>
 constexpr bool conv_possible_ = cudnn_enabled&& all_homogeneous<I, K>&& all_row_major<I, K>&& all_dma<I, K>;

 #ifdef ETL_CUDNN_MODE

 template <typename I, typename K, typename C>
 void conv2_valid_set(I&& input, K&& kernel, C&& conv, size_t s1, size_t s2, size_t p1, size_t p2, cudnnConvolutionMode_t mode) {
     using type = std::remove_const_t<value_t<I>>;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     decltype(auto) handle = start_cudnn();

     // Prepare the tensors
     auto input_tensor  = create_tensor_wrapper(input);
     auto output_tensor = create_tensor_wrapper(conv);
     auto filter        = create_filter_wrapper(kernel);

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionFwdAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), *input_tensor, *filter, convolution, *output_tensor,
                                                     1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionForward(handle.get(), alpha, *input_tensor, input.gpu_memory(), *filter, kernel.gpu_memory(), convolution, algo.algo,
                                         workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
 }

 template <typename I, typename K, typename C>
 void conv2_valid([[maybe_unused]] I&& input,
                  [[maybe_unused]] K&& kernel,
                  [[maybe_unused]] C&& conv,
                  [[maybe_unused]] size_t s1,
                  [[maybe_unused]] size_t s2,
                  [[maybe_unused]] size_t p1,
                  [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_valid_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv2_valid_flipped([[maybe_unused]] I&& input,
                          [[maybe_unused]] K&& kernel,
                          [[maybe_unused]] C&& conv,
                          [[maybe_unused]] size_t s1,
                          [[maybe_unused]] size_t s2,
                          [[maybe_unused]] size_t p1,
                          [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_valid_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_forward_set(I&& input, K&& kernel, C&& conv, size_t s1, size_t s2, size_t p1, size_t p2, cudnnConvolutionMode_t mode) {
     using type = value_t<I>;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     decltype(auto) handle = start_cudnn();

     // Prepare the tensors
     auto input_tensor  = create_tensor_wrapper(input);
     auto output_tensor = create_tensor_wrapper(conv);
     auto filter        = create_filter_wrapper(kernel);

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionFwdAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), *input_tensor, *filter, convolution, *output_tensor,
                                                     1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionForward(handle.get(), alpha, *input_tensor, input.gpu_memory(), *filter, kernel.gpu_memory(), convolution, algo.algo,
                                         workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
 }

 template <typename I, typename K, typename C>
 void conv4_forward([[maybe_unused]] I&& input,
                    [[maybe_unused]] K&& kernel,
                    [[maybe_unused]] C&& conv,
                    [[maybe_unused]] size_t s1,
                    [[maybe_unused]] size_t s2,
                    [[maybe_unused]] size_t p1,
                    [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_forward_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_forward_flipped([[maybe_unused]] I&& input,
                            [[maybe_unused]] K&& kernel,
                            [[maybe_unused]] C&& conv,
                            [[maybe_unused]] size_t s1,
                            [[maybe_unused]] size_t s2,
                            [[maybe_unused]] size_t p1,
                            [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_forward_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_filter_set(I&& input, K&& kernel, C&& conv, size_t s1, size_t s2, size_t p1, size_t p2, cudnnConvolutionMode_t mode) {
     using type = value_t<I>;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     decltype(auto) handle = start_cudnn();

     // Prepare the tensors
     auto input_tensor  = create_tensor_wrapper(input);
     auto output_tensor = create_tensor_wrapper(kernel);
     auto filter        = create_filter_wrapper(conv);

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionBwdFilterAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7(handle.get(), *input_tensor, *output_tensor, convolution, *filter,
                                                            1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionBackwardFilter(handle.get(), alpha, *input_tensor, input.gpu_memory(), *output_tensor, kernel.gpu_memory(), convolution,
                                                algo.algo, workspace.get(), algo.memory, beta, *filter, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
 }

 template <typename I, typename K, typename C>
 void conv4_backward_filter([[maybe_unused]] I&& input,
                            [[maybe_unused]] K&& kernel,
                            [[maybe_unused]] C&& conv,
                            [[maybe_unused]] size_t s1,
                            [[maybe_unused]] size_t s2,
                            [[maybe_unused]] size_t p1,
                            [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_filter_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_filter_flipped([[maybe_unused]] I&& input,
                                    [[maybe_unused]] K&& kernel,
                                    [[maybe_unused]] C&& conv,
                                    [[maybe_unused]] size_t s1,
                                    [[maybe_unused]] size_t s2,
                                    [[maybe_unused]] size_t p1,
                                    [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_filter_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv2_full_set(I&& input, K&& kernel, C&& conv, cudnnConvolutionMode_t mode) {
     using type = std::remove_const_t<value_t<I>>;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     decltype(auto) handle = start_cudnn();

     // Prepare the tensors
     auto input_tensor  = create_tensor_wrapper(input);
     auto output_tensor = create_tensor_wrapper(conv);
     auto filter        = create_filter_wrapper(kernel);

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, 0, 0, 1, 1, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionBwdDataAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle.get(), *filter, *input_tensor, convolution, *output_tensor,
                                                          1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionBackwardData(handle.get(), alpha, *filter, kernel.gpu_memory(), *input_tensor, input.gpu_memory(), convolution, algo.algo,
                                              workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
 }

 template <typename I, typename K, typename C>
 void conv2_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_full_set(input, kernel, conv, CUDNN_CROSS_CORRELATION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv2_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_full_set(input, kernel, conv, CUDNN_CONVOLUTION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv2_valid_multi_set(I& input, K&& kernel, C&& conv, size_t s1, size_t s2, size_t p1, size_t p2, cudnnConvolutionMode_t mode) {
     using type = std::remove_const_t<value_t<I>>;

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     decltype(auto) handle = start_cudnn();

     // Prepare the input tensor
     cudnnTensorDescriptor_t input_tensor;
     cudnn_check(cudnnCreateTensorDescriptor(&input_tensor));
     cudnn_check(cudnnSetTensor4dDescriptor(input_tensor, CUDNN_TENSOR_NCHW, data_type, 1, 1, etl::dim<0>(input), etl::dim<1>(input)));

     // Prepare the output tensor
     cudnnTensorDescriptor_t output_tensor;
     cudnn_check(cudnnCreateTensorDescriptor(&output_tensor));
     cudnn_check(cudnnSetTensor4dDescriptor(output_tensor, CUDNN_TENSOR_NCHW, data_type, 1, etl::dim<0>(conv), etl::dim<1>(conv), etl::dim<2>(conv)));

     // Prepare the filter
     cudnnFilterDescriptor_t filter;
     cudnn_check(cudnnCreateFilterDescriptor(&filter));
     cudnn_check(cudnnSetFilter4dDescriptor(filter, data_type, CUDNN_TENSOR_NCHW, etl::dim<0>(kernel), 1, etl::dim<1>(kernel), etl::dim<2>(kernel)));

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionFwdAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), input_tensor, filter, convolution, output_tensor,
                                                     1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionForward(handle.get(), alpha, input_tensor, input.gpu_memory(), filter, kernel.gpu_memory(), convolution, algo.algo,
                                         workspace.get(), algo.memory, beta, output_tensor, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
     cudnn_check(cudnnDestroyFilterDescriptor(filter));
     cudnn_check(cudnnDestroyTensorDescriptor(output_tensor));
     cudnn_check(cudnnDestroyTensorDescriptor(input_tensor));
 }

 template <typename I, typename K, typename C>
 void conv2_valid_multi([[maybe_unused]] I&& input,
                        [[maybe_unused]] K&& kernel,
                        [[maybe_unused]] C&& conv,
                        [[maybe_unused]] size_t s1,
                        [[maybe_unused]] size_t s2,
                        [[maybe_unused]] size_t p1,
                        [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_valid_multi_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv2_valid_multi_flipped([[maybe_unused]] I&& input,
                                [[maybe_unused]] K&& kernel,
                                [[maybe_unused]] C&& conv,
                                [[maybe_unused]] size_t s1,
                                [[maybe_unused]] size_t s2,
                                [[maybe_unused]] size_t p1,
                                [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv2_valid_multi_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_set(I&& input, K&& kernel, C&& conv, cudnnConvolutionMode_t mode, size_t s1, size_t s2, size_t p1, size_t p2) {
     using type = value_t<I>;

     type alpha[] = {1.0f};
     type beta[]  = {0.0f};

     auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;

     decltype(auto) handle = start_cudnn();

     // Prepare the tensors
     auto input_tensor  = create_tensor_wrapper(input);
     auto output_tensor = create_tensor_wrapper(conv);
     auto filter        = create_filter_wrapper(kernel);

     // Prepare the convolution
     cudnnConvolutionDescriptor_t convolution;
     cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
     cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES
     cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
 #endif

     // Find the algorithm to use
     cudnnConvolutionBwdDataAlgoPerf_t algo;
     int algos = 0;
     cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle.get(), *filter, *input_tensor, convolution, *output_tensor,
                                                          1, &algos, &algo));

     // Prepare the workspace
     impl::cuda::cuda_memory<type> workspace;
     if (algo.memory) {
         workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
     }

     // Allocate GPU memory, if necessary

     input.ensure_gpu_up_to_date();
     kernel.ensure_gpu_up_to_date();
     conv.ensure_gpu_allocated();

     // Perform the convolution

     cudnn_check(cudnnConvolutionBackwardData(handle.get(), alpha, *filter, kernel.gpu_memory(), *input_tensor, input.gpu_memory(), convolution, algo.algo,
                                              workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));

     conv.validate_gpu();
     conv.invalidate_cpu();

     // Release the resources
     cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data([[maybe_unused]] I&& input,
                          [[maybe_unused]] K&& kernel,
                          [[maybe_unused]] C&& conv,
                          [[maybe_unused]] size_t s1,
                          [[maybe_unused]] size_t s2,
                          [[maybe_unused]] size_t p1,
                          [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_data_set(input, kernel, conv, CUDNN_CROSS_CORRELATION, s1, s2, p1, p2);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_flipped([[maybe_unused]] I&& input,
                                  [[maybe_unused]] K&& kernel,
                                  [[maybe_unused]] C&& conv,
                                  [[maybe_unused]] size_t s1,
                                  [[maybe_unused]] size_t s2,
                                  [[maybe_unused]] size_t p1,
                                  [[maybe_unused]] size_t p2) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_data_set(input, kernel, conv, CUDNN_CONVOLUTION, s1, s2, p1, p2);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_data_set(input, kernel, conv, CUDNN_CROSS_CORRELATION, 1, 1, 0, 0);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     if constexpr (conv_possible<I, K, C>) {
         conv4_backward_data_set(input, kernel, conv, CUDNN_CONVOLUTION, 1, 1, 0, 0);
     } else {
         cpp_unreachable("CUDNN not available/enabled");
     }
 }

 #else

 //COVERAGE_EXCLUDE_BEGIN

 template <typename I, typename K, typename C>
 void conv2_valid([[maybe_unused]] I&& input,
                  [[maybe_unused]] K&& kernel,
                  [[maybe_unused]] C&& conv,
                  [[maybe_unused]] size_t s1,
                  [[maybe_unused]] size_t s2,
                  [[maybe_unused]] size_t p1,
                  [[maybe_unused]] size_t p2) {
     cpp_unreachable("CUDNN not available/enabled");
 }

 template <typename I, typename K, typename C>
 void conv2_valid_flipped([[maybe_unused]] I&& input,
                          [[maybe_unused]] K&& kernel,
                          [[maybe_unused]] C&& conv,
                          [[maybe_unused]] size_t s1,
                          [[maybe_unused]] size_t s2,
                          [[maybe_unused]] size_t p1,
                          [[maybe_unused]] size_t p2) {
     cpp_unreachable("CUDNN not available/enabled");
 }

 template <typename I, typename K, typename C>
 void conv4_forward([[maybe_unused]] I&& input,
                    [[maybe_unused]] K&& kernel,
                    [[maybe_unused]] C&& conv,
                    [[maybe_unused]] size_t s1,
                    [[maybe_unused]] size_t s2,
                    [[maybe_unused]] size_t p1,
                    [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_valid");
 }

 template <typename I, typename K, typename C>
 void conv4_forward_flipped([[maybe_unused]] I&& input,
                            [[maybe_unused]] K&& kernel,
                            [[maybe_unused]] C&& conv,
                            [[maybe_unused]] size_t s1,
                            [[maybe_unused]] size_t s2,
                            [[maybe_unused]] size_t p1,
                            [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_valid_flipped");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_filter([[maybe_unused]] I&& input,
                            [[maybe_unused]] K&& kernel,
                            [[maybe_unused]] C&& conv,
                            [[maybe_unused]] size_t s1,
                            [[maybe_unused]] size_t s2,
                            [[maybe_unused]] size_t p1,
                            [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_valid_filter");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_filter_flipped([[maybe_unused]] I&& input,
                                    [[maybe_unused]] K&& kernel,
                                    [[maybe_unused]] C&& conv,
                                    [[maybe_unused]] size_t s1,
                                    [[maybe_unused]] size_t s2,
                                    [[maybe_unused]] size_t p1,
                                    [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_backward_filter_flipped");
 }

 template <typename I, typename K, typename C>
 void conv2_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     cpp_unreachable("Unsupported feature called: cudnn conv2_full");
 }

 template <typename I, typename K, typename C>
 void conv2_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     cpp_unreachable("Unsupported feature called: cudnn conv2_full_flipped");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_full");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_ful_flippedl");
 }

 template <typename I, typename K, typename C>
 void conv2_valid_multi([[maybe_unused]] I&& input,
                        [[maybe_unused]] K&& kernel,
                        [[maybe_unused]] C&& conv,
                        [[maybe_unused]] size_t s1,
                        [[maybe_unused]] size_t s2,
                        [[maybe_unused]] size_t p1,
                        [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv2_valid_multi");
 }

 template <typename I, typename K, typename C>
 void conv2_valid_multi_flipped([[maybe_unused]] I&& input,
                                [[maybe_unused]] K&& kernel,
                                [[maybe_unused]] C&& conv,
                                [[maybe_unused]] size_t s1,
                                [[maybe_unused]] size_t s2,
                                [[maybe_unused]] size_t p1,
                                [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv2_valid_multi_flipped");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data([[maybe_unused]] I&& input,
                          [[maybe_unused]] K&& kernel,
                          [[maybe_unused]] C&& conv,
                          [[maybe_unused]] size_t s1,
                          [[maybe_unused]] size_t s2,
                          [[maybe_unused]] size_t p1,
                          [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_backward_data");
 }

 template <typename I, typename K, typename C>
 void conv4_backward_data_flipped([[maybe_unused]] I&& input,
                                  [[maybe_unused]] K&& kernel,
                                  [[maybe_unused]] C&& conv,
                                  [[maybe_unused]] size_t s1,
                                  [[maybe_unused]] size_t s2,
                                  [[maybe_unused]] size_t p1,
                                  [[maybe_unused]] size_t p2) {
     cpp_unreachable("Unsupported feature called: cudnn conv4_backward_data_flipped");
 }

     //COVERAGE_EXCLUDE_END

 #endif

 } //end of namespace etl::impl::cudnn
etl::impl::cudnn::conv4_backward_data_full
void conv4_backward_data_full([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv)
cudnn implementation of a 4D &#39;full&#39; convolution C = I * K
Definition: conv.hpp:858

etl::impl::cudnn::conv2_valid
void conv2_valid([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D &#39;valid&#39; convolution C = I * K.
Definition: conv.hpp:726

etl::impl::cudnn
Definition: bias_add.hpp:24

etl::impl::cudnn::conv_possible_
constexpr bool conv_possible_
Traits indicating if Convolution with CUDNN is possible for the given configuration.
Definition: conv.hpp:45

etl::impl::cudnn::conv4_forward
void conv4_forward([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D &#39;valid&#39; convolution C = I * K
Definition: conv.hpp:764

etl::impl::cudnn::conv2_valid_multi
void conv2_valid_multi([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D &#39;valid&#39; convolution C = I * K.
Definition: conv.hpp:880

etl::impl::cudnn::conv4_backward_filter_flipped
void conv4_backward_filter_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 4D &#39;valid&#39; convolution C = I * K, where the output are considered to be ker...
Definition: conv.hpp:819

etl::impl::cudnn::conv4_backward_data_full_flipped
void conv4_backward_data_full_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv)
cudnn implementation of a 2D &#39;valid&#39; convolution C = I * K, with multiple kernels ...
Definition: conv.hpp:869

etl::cudnn_enabled
constexpr bool cudnn_enabled
Indicates if the NVIDIA CUDNN library is available for ETL.
Definition: config.hpp:114

etl::impl::cudnn::conv4_backward_data
void conv4_backward_data([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D &#39;valid&#39; backward convolution C = I * K
Definition: conv.hpp:914

etl::impl::cudnn::conv4_backward_data_flipped
void conv4_backward_data_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 2D &#39;valid&#39; backward convolution C = I * K
Definition: conv.hpp:931

etl::impl::cuda::cuda_memory
Wrapper for CUDA memory (when disabled CUDA support)
Definition: cuda_memory.hpp:233

etl::impl::cudnn::start_cudnn
cudnn_handle & start_cudnn()
Start cudnn and return a RTTI helper over a raw cudnn handle.
Definition: cudnn.hpp:75

etl::impl::cudnn::conv4_forward_flipped
void conv4_forward_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D &#39;valid&#39; convolution C = I * K, with flipped weights
Definition: conv.hpp:781

etl::impl::cudnn::create_tensor_wrapper
cudnn_wrapper< cudnnTensorDescriptor_t > create_tensor_wrapper(I &&input)
Create a CUDNN tensor for the given input matrix.
Definition: cudnn.hpp:246

etl::impl::cudnn::conv2_valid_flipped
void conv2_valid_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D &#39;valid&#39; convolution C = I * K.
Definition: conv.hpp:747

cudnn.hpp
Utility functions for cudnn.

etl::impl::cudnn::conv_possible
constexpr bool conv_possible
Traits indicating if Convolution with CUDNN is possible for the given configuration.
Definition: conv.hpp:35

etl::impl::cudnn::conv2_valid_multi_flipped
void conv2_valid_multi_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
Standard implementation of a 2D &#39;valid&#39; convolution C = I * K, with multiple flipped kernels...
Definition: conv.hpp:897

etl::value_t
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81

etl::impl::cudnn::conv4_backward_filter
void conv4_backward_filter([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 4D &#39;valid&#39; convolution C = I * K, where the output are considered to be ker...
Definition: conv.hpp:800

etl::impl::cudnn::create_filter_wrapper
cudnn_wrapper< cudnnFilterDescriptor_t > create_filter_wrapper(I &&kernel)
Create a CUDNN filter tensor for the given input matrix.
Definition: cudnn.hpp:452