15 #define ETL_EXPERIMENTAL_TENSOR_CORES 19 #include "etl/impl/cublas/cuda.hpp" 34 template <
typename I,
typename K,
typename C>
44 template <
typename I,
typename K>
59 template <
typename I,
typename K,
typename C>
// NOTE(review): this file is a lossy text extraction of the original header —
// the leading integers are original source line numbers fused into the text,
// and several statements (the `beta` array, the cudnn `handle`, the
// input/output tensor and filter descriptor wrappers, `#endif`, the closing
// brace) are missing from this view. Comments document only what is visible.
//
// Runs a 2D 'valid' convolution (or cross-correlation, per `mode`) of
// `input` by `kernel` into `conv` through cuDNN, with strides (s1, s2) and
// padding (p1, p2).
60 void conv2_valid_set(I&& input, K&& kernel, C&& conv,
size_t s1,
size_t s2,
size_t p1,
size_t p2, cudnnConvolutionMode_t mode) {
61 using type = std::remove_const_t<value_t<I>>;
// Scaling factor for cudnnConvolutionForward (y = alpha * op + beta * y).
63 type alpha[] = {1.0f};
// Only float/double are supported: map the ETL value type to cuDNN's enum.
66 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
// Describe the convolution: padding (p1,p2), stride (s1,s2), dilation 1x1.
76 cudnnConvolutionDescriptor_t convolution;
77 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
78 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
// Experimental: allow Tensor Core math with implicit type down-conversion.
79 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 80 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
// Ask cuDNN for the best forward algorithm for these descriptors.
84 cudnnConvolutionFwdAlgoPerf_t algo;
86 cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), *input_tensor, *filter, convolution, *output_tensor,
// Allocate the scratch workspace the selected algorithm requires.
92 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
// Ensure operands are resident on the GPU before launching.
97 input.ensure_gpu_up_to_date();
98 kernel.ensure_gpu_up_to_date();
99 conv.ensure_gpu_allocated();
103 cudnn_check(cudnnConvolutionForward(handle.get(), alpha, *input_tensor, input.gpu_memory(), *filter, kernel.gpu_memory(), convolution, algo.algo,
104 workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));
// Result exists only on the GPU now; mark the CPU copy stale.
107 conv.invalidate_cpu();
110 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
// Public 2D 'valid' entry points. Both dispatch to conv2_valid_set when
// cuDNN convolution is possible for (I, K, C), otherwise they are
// unreachable. conv2_valid uses CUDNN_CONVOLUTION (cuDNN flips the kernel);
// conv2_valid_flipped uses CUDNN_CROSS_CORRELATION (kernel used as-is).
// NOTE(review): the extraction dropped the signature lines (original lines
// 124 and 150), the `else` keywords, and the closing braces.
123 template <
typename I,
typename K,
typename C>
125 [[maybe_unused]] K&& kernel,
126 [[maybe_unused]] C&& conv,
127 [[maybe_unused]]
size_t s1,
128 [[maybe_unused]]
size_t s2,
129 [[maybe_unused]]
size_t p1,
130 [[maybe_unused]]
size_t p2) {
131 if constexpr (conv_possible<I, K, C>) {
132 conv2_valid_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
134 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped-kernel variant: cross-correlation, i.e. no extra flip by cuDNN.
149 template <
typename I,
typename K,
typename C>
151 [[maybe_unused]] K&& kernel,
152 [[maybe_unused]] C&& conv,
153 [[maybe_unused]]
size_t s1,
154 [[maybe_unused]]
size_t s2,
155 [[maybe_unused]]
size_t p1,
156 [[maybe_unused]]
size_t p2) {
157 if constexpr (conv_possible<I, K, C>) {
158 conv2_valid_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
160 cpp_unreachable(
"CUDNN not available/enabled");
// 4D forward convolution (NCHW batched) of `input` by `kernel` into `conv`
// via cudnnConvolutionForward, with strides (s1, s2), padding (p1, p2) and
// the given convolution/cross-correlation `mode`.
// NOTE(review): the `using type = ...` alias used below (original lines
// 172-173) and the descriptor/handle declarations are missing from this
// extraction, as are the `#endif` and closing brace.
170 template <
typename I,
typename K,
typename C>
171 void conv4_forward_set(I&& input, K&& kernel, C&& conv,
size_t s1,
size_t s2,
size_t p1,
size_t p2, cudnnConvolutionMode_t mode) {
// Blend factors: write the result, do not accumulate into `conv`.
174 type alpha[] = {1.0f};
175 type beta[] = {0.0f};
177 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
187 cudnnConvolutionDescriptor_t convolution;
188 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
189 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
// Experimental Tensor Core math, guarded by compile-time switch.
190 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 191 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
// Algorithm selection + workspace allocation, as in conv2_valid_set.
195 cudnnConvolutionFwdAlgoPerf_t algo;
197 cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), *input_tensor, *filter, convolution, *output_tensor,
203 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
208 input.ensure_gpu_up_to_date();
209 kernel.ensure_gpu_up_to_date();
210 conv.ensure_gpu_allocated();
214 cudnn_check(cudnnConvolutionForward(handle.get(), alpha, *input_tensor, input.gpu_memory(), *filter, kernel.gpu_memory(), convolution, algo.algo,
215 workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));
218 conv.invalidate_cpu();
221 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
// Public 4D forward entry points: dispatch to conv4_forward_set, with
// CUDNN_CONVOLUTION for the plain variant and CUDNN_CROSS_CORRELATION for
// the flipped variant; unreachable when cuDNN is not possible for (I, K, C).
// NOTE(review): signature lines (original 231 and 252) missing from view.
230 template <
typename I,
typename K,
typename C>
232 [[maybe_unused]] K&& kernel,
233 [[maybe_unused]] C&& conv,
234 [[maybe_unused]]
size_t s1,
235 [[maybe_unused]]
size_t s2,
236 [[maybe_unused]]
size_t p1,
237 [[maybe_unused]]
size_t p2) {
238 if constexpr (conv_possible<I, K, C>) {
239 conv4_forward_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
241 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped-weights variant.
251 template <
typename I,
typename K,
typename C>
253 [[maybe_unused]] K&& kernel,
254 [[maybe_unused]] C&& conv,
255 [[maybe_unused]]
size_t s1,
256 [[maybe_unused]]
size_t s2,
257 [[maybe_unused]]
size_t p1,
258 [[maybe_unused]]
size_t p2) {
259 if constexpr (conv_possible<I, K, C>) {
260 conv4_forward_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
262 cpp_unreachable(
"CUDNN not available/enabled");
// Backward-filter pass: given the forward input (`input`) and the output
// gradients (`kernel`, bound to the output tensor descriptor below), compute
// the filter gradients into `conv` via cudnnConvolutionBackwardFilter.
// NOTE(review): `using type`, handle and descriptor declarations are missing
// from this extraction.
274 template <
typename I,
typename K,
typename C>
275 void conv4_backward_filter_set(I&& input, K&& kernel, C&& conv,
size_t s1,
size_t s2,
size_t p1,
size_t p2, cudnnConvolutionMode_t mode) {
278 type alpha[] = {1.0f};
279 type beta[] = {0.0f};
281 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
291 cudnnConvolutionDescriptor_t convolution;
292 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
293 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
294 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 295 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
// Backward-filter has its own algorithm enumeration and selection call.
299 cudnnConvolutionBwdFilterAlgoPerf_t algo;
301 cudnn_check(cudnnGetConvolutionBackwardFilterAlgorithm_v7(handle.get(), *input_tensor, *output_tensor, convolution, *filter,
307 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
312 input.ensure_gpu_up_to_date();
313 kernel.ensure_gpu_up_to_date();
314 conv.ensure_gpu_allocated();
// Note the binding: `kernel` feeds the output-gradient tensor and the
// result lands in the filter-shaped `conv`.
318 cudnn_check(cudnnConvolutionBackwardFilter(handle.get(), alpha, *input_tensor, input.gpu_memory(), *output_tensor, kernel.gpu_memory(), convolution,
319 algo.algo, workspace.get(), algo.memory, beta, *filter, conv.gpu_memory()));
322 conv.invalidate_cpu();
325 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
// Public backward-filter entry points: CUDNN_CONVOLUTION for the plain
// variant, CUDNN_CROSS_CORRELATION for the flipped one; unreachable when
// cuDNN is not possible. Signature lines missing from this extraction.
336 template <
typename I,
typename K,
typename C>
338 [[maybe_unused]] K&& kernel,
339 [[maybe_unused]] C&& conv,
340 [[maybe_unused]]
size_t s1,
341 [[maybe_unused]]
size_t s2,
342 [[maybe_unused]]
size_t p1,
343 [[maybe_unused]]
size_t p2) {
344 if constexpr (conv_possible<I, K, C>) {
345 conv4_backward_filter_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
347 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped variant.
359 template <
typename I,
typename K,
typename C>
361 [[maybe_unused]] K&& kernel,
362 [[maybe_unused]] C&& conv,
363 [[maybe_unused]]
size_t s1,
364 [[maybe_unused]]
size_t s2,
365 [[maybe_unused]]
size_t p1,
366 [[maybe_unused]]
size_t p2) {
367 if constexpr (conv_possible<I, K, C>) {
368 conv4_backward_filter_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
370 cpp_unreachable(
"CUDNN not available/enabled");
// 2D 'full' convolution implemented with cudnnConvolutionBackwardData:
// the data-gradient pass with zero padding and unit stride is the standard
// cuDNN way to obtain a 'full' convolution output. No stride/padding
// parameters: the descriptor is fixed to pad (0,0), stride (1,1).
380 template <
typename I,
typename K,
typename C>
381 void conv2_full_set(I&& input, K&& kernel, C&& conv, cudnnConvolutionMode_t mode) {
382 using type = std::remove_const_t<value_t<I>>;
384 type alpha[] = {1.0f};
385 type beta[] = {0.0f};
387 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
397 cudnnConvolutionDescriptor_t convolution;
398 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
399 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, 0, 0, 1, 1, 1, 1, mode, data_type));
400 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 401 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
// Backward-data algorithm selection + workspace.
405 cudnnConvolutionBwdDataAlgoPerf_t algo;
407 cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle.get(), *filter, *input_tensor, convolution, *output_tensor,
413 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
418 input.ensure_gpu_up_to_date();
419 kernel.ensure_gpu_up_to_date();
420 conv.ensure_gpu_allocated();
424 cudnn_check(cudnnConvolutionBackwardData(handle.get(), alpha, *filter, kernel.gpu_memory(), *input_tensor, input.gpu_memory(), convolution, algo.algo,
425 workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));
428 conv.invalidate_cpu();
431 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
// Public 2D 'full' entry points. Note the mode inversion relative to the
// 'valid' wrappers: because the full convolution is realised through the
// backward-data pass, conv2_full passes CUDNN_CROSS_CORRELATION and
// conv2_full_flipped passes CUDNN_CONVOLUTION.
440 template <
typename I,
typename K,
typename C>
441 void conv2_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
442 if constexpr (conv_possible<I, K, C>) {
443 conv2_full_set(input, kernel, conv, CUDNN_CROSS_CORRELATION);
445 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped variant.
455 template <
typename I,
typename K,
typename C>
456 void conv2_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
457 if constexpr (conv_possible<I, K, C>) {
458 conv2_full_set(input, kernel, conv, CUDNN_CONVOLUTION);
460 cpp_unreachable(
"CUDNN not available/enabled");
// 2D 'valid' convolution of one 2D input by multiple kernels. Unlike the
// other *_set helpers this builds raw cuDNN descriptors rather than the
// create_*_wrapper helpers, and destroys them explicitly at the end:
//   input  : 2D matrix, exposed to cuDNN as a 1 x 1 x H x W tensor
//   conv   : 3D result, exposed as 1 x K x H' x W' (one plane per kernel)
//   kernel : 3D stack of K kernels, exposed as a K x 1 x HH x WW filter
// NOTE(review): `handle` and `workspace` declarations and the closing brace
// are missing from this extraction.
470 template <
typename I,
typename K,
typename C>
471 void conv2_valid_multi_set(I& input, K&& kernel, C&& conv,
size_t s1,
size_t s2,
size_t p1,
size_t p2, cudnnConvolutionMode_t mode) {
472 using type = std::remove_const_t<value_t<I>>;
474 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
476 type alpha[] = {1.0f};
477 type beta[] = {0.0f};
// Single-image, single-channel input tensor.
482 cudnnTensorDescriptor_t input_tensor;
483 cudnn_check(cudnnCreateTensorDescriptor(&input_tensor));
484 cudnn_check(cudnnSetTensor4dDescriptor(input_tensor, CUDNN_TENSOR_NCHW, data_type, 1, 1, etl::dim<0>(input), etl::dim<1>(input)));
// One output feature map per kernel.
487 cudnnTensorDescriptor_t output_tensor;
488 cudnn_check(cudnnCreateTensorDescriptor(&output_tensor));
489 cudnn_check(cudnnSetTensor4dDescriptor(output_tensor, CUDNN_TENSOR_NCHW, data_type, 1, etl::dim<0>(conv), etl::dim<1>(conv), etl::dim<2>(conv)));
// K kernels, each with a single input channel.
492 cudnnFilterDescriptor_t filter;
493 cudnn_check(cudnnCreateFilterDescriptor(&filter));
494 cudnn_check(cudnnSetFilter4dDescriptor(filter, data_type, CUDNN_TENSOR_NCHW, etl::dim<0>(kernel), 1, etl::dim<1>(kernel), etl::dim<2>(kernel)));
497 cudnnConvolutionDescriptor_t convolution;
498 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
499 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
500 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 501 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
505 cudnnConvolutionFwdAlgoPerf_t algo;
507 cudnn_check(cudnnGetConvolutionForwardAlgorithm_v7(handle.get(), input_tensor, filter, convolution, output_tensor,
513 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
518 input.ensure_gpu_up_to_date();
519 kernel.ensure_gpu_up_to_date();
520 conv.ensure_gpu_allocated();
524 cudnn_check(cudnnConvolutionForward(handle.get(), alpha, input_tensor, input.gpu_memory(), filter, kernel.gpu_memory(), convolution, algo.algo,
525 workspace.get(), algo.memory, beta, output_tensor, conv.gpu_memory()));
528 conv.invalidate_cpu();
// Raw descriptors: must be destroyed by hand (reverse creation order).
531 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
532 cudnn_check(cudnnDestroyFilterDescriptor(filter));
533 cudnn_check(cudnnDestroyTensorDescriptor(output_tensor));
534 cudnn_check(cudnnDestroyTensorDescriptor(input_tensor));
// Public multi-kernel 2D 'valid' entry points: CUDNN_CONVOLUTION plain,
// CUDNN_CROSS_CORRELATION flipped. Signature lines missing from this view.
543 template <
typename I,
typename K,
typename C>
545 [[maybe_unused]] K&& kernel,
546 [[maybe_unused]] C&& conv,
547 [[maybe_unused]]
size_t s1,
548 [[maybe_unused]]
size_t s2,
549 [[maybe_unused]]
size_t p1,
550 [[maybe_unused]]
size_t p2) {
551 if constexpr (conv_possible<I, K, C>) {
552 conv2_valid_multi_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CONVOLUTION);
554 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped variant.
564 template <
typename I,
typename K,
typename C>
566 [[maybe_unused]] K&& kernel,
567 [[maybe_unused]] C&& conv,
568 [[maybe_unused]]
size_t s1,
569 [[maybe_unused]]
size_t s2,
570 [[maybe_unused]]
size_t p1,
571 [[maybe_unused]]
size_t p2) {
572 if constexpr (conv_possible<I, K, C>) {
573 conv2_valid_multi_set(input, kernel, conv, s1, s2, p1, p2, CUDNN_CROSS_CORRELATION);
575 cpp_unreachable(
"CUDNN not available/enabled");
// 4D backward-data pass: propagate output gradients (`input`, bound to the
// input tensor descriptor) back through `kernel`, writing the data
// gradients into `conv` via cudnnConvolutionBackwardData.
// NOTE(review): this helper takes `mode` BEFORE the stride/padding
// arguments, unlike every other *_set helper in this file which takes it
// last — worth unifying in the real source.
585 template <
typename I,
typename K,
typename C>
586 void conv4_backward_data_set(I&& input, K&& kernel, C&& conv, cudnnConvolutionMode_t mode,
size_t s1,
size_t s2,
size_t p1,
size_t p2) {
589 type alpha[] = {1.0f};
590 type beta[] = {0.0f};
592 auto data_type = std::is_same_v<type, float> ? CUDNN_DATA_FLOAT : CUDNN_DATA_DOUBLE;
602 cudnnConvolutionDescriptor_t convolution;
603 cudnn_check(cudnnCreateConvolutionDescriptor(&convolution));
604 cudnn_check(cudnnSetConvolution2dDescriptor(convolution, p1, p2, s1, s2, 1, 1, mode, data_type));
605 #ifdef ETL_EXPERIMENTAL_TENSOR_CORES 606 cudnn_check(cudnnSetConvolutionMathType(convolution, CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION));
610 cudnnConvolutionBwdDataAlgoPerf_t algo;
612 cudnn_check(cudnnGetConvolutionBackwardDataAlgorithm_v7(handle.get(), *filter, *input_tensor, convolution, *output_tensor,
618 workspace = impl::cuda::cuda_allocate_only<type>(algo.memory);
623 input.ensure_gpu_up_to_date();
624 kernel.ensure_gpu_up_to_date();
625 conv.ensure_gpu_allocated();
629 cudnn_check(cudnnConvolutionBackwardData(handle.get(), alpha, *filter, kernel.gpu_memory(), *input_tensor, input.gpu_memory(), convolution, algo.algo,
630 workspace.get(), algo.memory, beta, *output_tensor, conv.gpu_memory()));
633 conv.invalidate_cpu();
636 cudnn_check(cudnnDestroyConvolutionDescriptor(convolution));
// Public backward-data entry points. The strided/padded pair dispatches
// with the user-provided geometry; the *_full pair fixes stride (1,1) and
// padding (0,0). Plain variants use CUDNN_CROSS_CORRELATION, flipped
// variants CUDNN_CONVOLUTION. Signature lines (original 646, 667, 688, 703)
// are missing from this extraction.
645 template <
typename I,
typename K,
typename C>
647 [[maybe_unused]] K&& kernel,
648 [[maybe_unused]] C&& conv,
649 [[maybe_unused]]
size_t s1,
650 [[maybe_unused]]
size_t s2,
651 [[maybe_unused]]
size_t p1,
652 [[maybe_unused]]
size_t p2) {
653 if constexpr (conv_possible<I, K, C>) {
654 conv4_backward_data_set(input, kernel, conv, CUDNN_CROSS_CORRELATION, s1, s2, p1, p2);
656 cpp_unreachable(
"CUDNN not available/enabled");
// Flipped, strided/padded.
666 template <
typename I,
typename K,
typename C>
668 [[maybe_unused]] K&& kernel,
669 [[maybe_unused]] C&& conv,
670 [[maybe_unused]]
size_t s1,
671 [[maybe_unused]]
size_t s2,
672 [[maybe_unused]]
size_t p1,
673 [[maybe_unused]]
size_t p2) {
674 if constexpr (conv_possible<I, K, C>) {
675 conv4_backward_data_set(input, kernel, conv, CUDNN_CONVOLUTION, s1, s2, p1, p2);
677 cpp_unreachable(
"CUDNN not available/enabled");
// 'full' variant: unit stride, zero padding.
687 template <
typename I,
typename K,
typename C>
689 if constexpr (conv_possible<I, K, C>) {
690 conv4_backward_data_set(input, kernel, conv, CUDNN_CROSS_CORRELATION, 1, 1, 0, 0);
692 cpp_unreachable(
"CUDNN not available/enabled");
// 'full' flipped variant.
702 template <
typename I,
typename K,
typename C>
704 if constexpr (conv_possible<I, K, C>) {
705 conv4_backward_data_set(input, kernel, conv, CUDNN_CONVOLUTION, 1, 1, 0, 0);
707 cpp_unreachable(
"CUDNN not available/enabled");
// ---------------------------------------------------------------------------
// Fallback stubs compiled when cuDNN support is disabled: every operation
// immediately hits cpp_unreachable. Signature lines are missing throughout
// this extraction.
// NOTE(review): two string-literal issues to fix in the real source (a
// doc_update may not touch runtime strings):
//   - original line 870: "conv4_ful_flippedl" is a typo for
//     "conv4_full_flipped";
//   - original lines 771/788/807: messages say "conv4_valid" /
//     "conv4_valid_flipped" / "conv4_valid_filter" while the surrounding
//     declarations (per the index below this file) are conv4_forward /
//     conv4_forward_flipped / conv4_backward_filter — confirm and align.
// ---------------------------------------------------------------------------
// conv2_valid stub.
725 template <
typename I,
typename K,
typename C>
727 [[maybe_unused]] K&& kernel,
728 [[maybe_unused]] C&& conv,
729 [[maybe_unused]]
size_t s1,
730 [[maybe_unused]]
size_t s2,
731 [[maybe_unused]]
size_t p1,
732 [[maybe_unused]]
size_t p2) {
733 cpp_unreachable(
"CUDNN not available/enabled");
// conv2_valid_flipped stub.
746 template <
typename I,
typename K,
typename C>
748 [[maybe_unused]] K&& kernel,
749 [[maybe_unused]] C&& conv,
750 [[maybe_unused]]
size_t s1,
751 [[maybe_unused]]
size_t s2,
752 [[maybe_unused]]
size_t p1,
753 [[maybe_unused]]
size_t p2) {
754 cpp_unreachable(
"CUDNN not available/enabled");
// conv4_forward stub (message says conv4_valid — see NOTE above).
763 template <
typename I,
typename K,
typename C>
765 [[maybe_unused]] K&& kernel,
766 [[maybe_unused]] C&& conv,
767 [[maybe_unused]]
size_t s1,
768 [[maybe_unused]]
size_t s2,
769 [[maybe_unused]]
size_t p1,
770 [[maybe_unused]]
size_t p2) {
771 cpp_unreachable(
"Unsupported feature called: cudnn conv4_valid");
// conv4_forward_flipped stub.
780 template <
typename I,
typename K,
typename C>
782 [[maybe_unused]] K&& kernel,
783 [[maybe_unused]] C&& conv,
784 [[maybe_unused]]
size_t s1,
785 [[maybe_unused]]
size_t s2,
786 [[maybe_unused]]
size_t p1,
787 [[maybe_unused]]
size_t p2) {
788 cpp_unreachable(
"Unsupported feature called: cudnn conv4_valid_flipped");
// conv4_backward_filter stub.
799 template <
typename I,
typename K,
typename C>
801 [[maybe_unused]] K&& kernel,
802 [[maybe_unused]] C&& conv,
803 [[maybe_unused]]
size_t s1,
804 [[maybe_unused]]
size_t s2,
805 [[maybe_unused]]
size_t p1,
806 [[maybe_unused]]
size_t p2) {
807 cpp_unreachable(
"Unsupported feature called: cudnn conv4_valid_filter");
// conv4_backward_filter_flipped stub.
818 template <
typename I,
typename K,
typename C>
820 [[maybe_unused]] K&& kernel,
821 [[maybe_unused]] C&& conv,
822 [[maybe_unused]]
size_t s1,
823 [[maybe_unused]]
size_t s2,
824 [[maybe_unused]]
size_t p1,
825 [[maybe_unused]]
size_t p2) {
826 cpp_unreachable(
"Unsupported feature called: cudnn conv4_backward_filter_flipped");
// conv2_full stub.
835 template <
typename I,
typename K,
typename C>
836 void conv2_full([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
837 cpp_unreachable(
"Unsupported feature called: cudnn conv2_full");
// conv2_full_flipped stub.
846 template <
typename I,
typename K,
typename C>
847 void conv2_full_flipped([[maybe_unused]] I&& input, [[maybe_unused]] K&& kernel, [[maybe_unused]] C&& conv) {
848 cpp_unreachable(
"Unsupported feature called: cudnn conv2_full_flipped");
// conv4_backward_data_full stub.
857 template <
typename I,
typename K,
typename C>
859 cpp_unreachable(
"Unsupported feature called: cudnn conv4_full");
// conv4_backward_data_full_flipped stub (typo in message — see NOTE above).
868 template <
typename I,
typename K,
typename C>
870 cpp_unreachable(
"Unsupported feature called: cudnn conv4_ful_flippedl");
// conv2_valid_multi stub.
879 template <
typename I,
typename K,
typename C>
881 [[maybe_unused]] K&& kernel,
882 [[maybe_unused]] C&& conv,
883 [[maybe_unused]]
size_t s1,
884 [[maybe_unused]]
size_t s2,
885 [[maybe_unused]]
size_t p1,
886 [[maybe_unused]]
size_t p2) {
887 cpp_unreachable(
"Unsupported feature called: cudnn conv2_valid_multi");
// conv2_valid_multi_flipped stub.
896 template <
typename I,
typename K,
typename C>
898 [[maybe_unused]] K&& kernel,
899 [[maybe_unused]] C&& conv,
900 [[maybe_unused]]
size_t s1,
901 [[maybe_unused]]
size_t s2,
902 [[maybe_unused]]
size_t p1,
903 [[maybe_unused]]
size_t p2) {
904 cpp_unreachable(
"Unsupported feature called: cudnn conv2_valid_multi_flipped");
// conv4_backward_data stub.
913 template <
typename I,
typename K,
typename C>
915 [[maybe_unused]] K&& kernel,
916 [[maybe_unused]] C&& conv,
917 [[maybe_unused]]
size_t s1,
918 [[maybe_unused]]
size_t s2,
919 [[maybe_unused]]
size_t p1,
920 [[maybe_unused]]
size_t p2) {
921 cpp_unreachable(
"Unsupported feature called: cudnn conv4_backward_data");
// conv4_backward_data_flipped stub.
930 template <
typename I,
typename K,
typename C>
932 [[maybe_unused]] K&& kernel,
933 [[maybe_unused]] C&& conv,
934 [[maybe_unused]]
size_t s1,
935 [[maybe_unused]]
size_t s2,
936 [[maybe_unused]]
size_t p1,
937 [[maybe_unused]]
size_t p2) {
938 cpp_unreachable(
"Unsupported feature called: cudnn conv4_backward_data_flipped");
void conv4_backward_data_full([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv)
cudnn implementation of a 4D 'full' convolution C = I * K
Definition: conv.hpp:858
void conv2_valid([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D 'valid' convolution C = I * K.
Definition: conv.hpp:726
Definition: bias_add.hpp:24
constexpr bool conv_possible_
Traits indicating if Convolution with CUDNN is possible for the given configuration.
Definition: conv.hpp:45
void conv4_forward([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D 'valid' convolution C = I * K
Definition: conv.hpp:764
void conv2_valid_multi([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D 'valid' convolution C = I * K.
Definition: conv.hpp:880
void conv4_backward_filter_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 4D 'valid' convolution C = I * K, where the output are considered to be ker...
Definition: conv.hpp:819
void conv4_backward_data_full_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv)
cudnn implementation of a 4D 'full' convolution C = I * K, with flipped kernels
Definition: conv.hpp:869
constexpr bool cudnn_enabled
Indicates if the NVIDIA CUDNN library is available for ETL.
Definition: config.hpp:114
void conv4_backward_data([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D 'valid' backward convolution C = I * K
Definition: conv.hpp:914
void conv4_backward_data_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D 'valid' backward convolution C = I * K, with flipped kernels
Definition: conv.hpp:931
Wrapper for CUDA memory (when disabled CUDA support)
Definition: cuda_memory.hpp:233
cudnn_handle & start_cudnn()
Start cudnn and return a RTTI helper over a raw cudnn handle.
Definition: cudnn.hpp:75
void conv4_forward_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
cudnn implementation of a 4D 'valid' convolution C = I * K, with flipped weights
Definition: conv.hpp:781
cudnn_wrapper< cudnnTensorDescriptor_t > create_tensor_wrapper(I &&input)
Create a CUDNN tensor for the given input matrix.
Definition: cudnn.hpp:246
void conv2_valid_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 2D 'valid' convolution C = I * K.
Definition: conv.hpp:747
Utility functions for cudnn.
constexpr bool conv_possible
Traits indicating if Convolution with CUDNN is possible for the given configuration.
Definition: conv.hpp:35
void conv2_valid_multi_flipped([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
Standard implementation of a 2D 'valid' convolution C = I * K, with multiple flipped kernels...
Definition: conv.hpp:897
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
void conv4_backward_filter([[maybe_unused]] I &&input, [[maybe_unused]] K &&kernel, [[maybe_unused]] C &&conv, [[maybe_unused]] size_t s1, [[maybe_unused]] size_t s2, [[maybe_unused]] size_t p1, [[maybe_unused]] size_t p2)
CUDNN implementation of a 4D 'valid' convolution C = I * K, where the output are considered to be ker...
Definition: conv.hpp:800
cudnn_wrapper< cudnnFilterDescriptor_t > create_filter_wrapper(I &&kernel)
Create a CUDNN filter tensor for the given input matrix.
Definition: cudnn.hpp:452