wichtounet/etl/cublas_2outer_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 #pragma once

 #ifdef ETL_CUBLAS_MODE

 #include "etl/impl/cublas/cuda.hpp"
 #include "etl/impl/cublas/cublas.hpp"

 #endif

 namespace etl::impl::cublas {

 #ifdef ETL_CUBLAS_MODE

 /*!
  * \brief Compute the batch outer product of a and b with CUBLAS (single precision) and store the result in c
  * \param a The left-hand side expression
  * \param b The right-hand side expression
  * \param c The output expression, overwritten with the result
  */
 template <etl_single_precision A, etl_single_precision B, etl_single_precision C>
 void batch_outer(const A& a, const B& b, C&& c) {
     decltype(auto) cublas_handle = start_cublas();

     // GEMM scaling factors: the product is scaled by one and the
     // previous content of c is scaled by zero
     const float one  = 1.0f;
     const float zero = 0.0f;

     // CUBLAS works on column-major matrices. Passing the two operands
     // in swapped order (b first, then a) compensates for that.
     // Since one of the operands of the product must additionally be
     // transposed, the second GEMM operand (a) is passed with CUBLAS_OP_T.

     // The inputs must be up to date on the GPU, the output only needs to be allocated
     a.ensure_gpu_up_to_date();
     b.ensure_gpu_up_to_date();
     c.ensure_gpu_allocated();

     // GEMM dimensions (m, n, k)
     const auto gemm_m = etl::columns(c);
     const auto gemm_n = etl::rows(c);
     const auto gemm_k = etl::rows(b);

     // Leading dimensions of the operands; the output uses the same one as b
     const auto ld_b = etl::columns(b);
     const auto ld_a = etl::columns(a);

     cublas_check(cublasSgemm(cublas_handle.get(), CUBLAS_OP_N, CUBLAS_OP_T, gemm_m, gemm_n, gemm_k, &one, b.gpu_memory(), ld_b, a.gpu_memory(), ld_a,
                              &zero, c.gpu_memory(), ld_b));

     // The result now lives on the GPU only
     c.validate_gpu();
     c.invalidate_cpu();
 }

 /*!
  * \brief Compute the batch outer product of a and b with CUBLAS (double precision) and store the result in c
  * \param a The left-hand side expression
  * \param b The right-hand side expression
  * \param c The output expression, overwritten with the result
  */
 template <etl_double_precision A, etl_double_precision B, etl_double_precision C>
 void batch_outer(const A& a, const B& b, C&& c) {
     decltype(auto) cublas_handle = start_cublas();

     // GEMM scaling factors: the product is scaled by one and the
     // previous content of c is scaled by zero
     const double one  = 1.0;
     const double zero = 0.0;

     // The operands are passed to GEMM in swapped order (b first, then a)
     // and the second GEMM operand (a) is passed with CUBLAS_OP_T.

     // The inputs must be up to date on the GPU, the output only needs to be allocated
     a.ensure_gpu_up_to_date();
     b.ensure_gpu_up_to_date();
     c.ensure_gpu_allocated();

     // GEMM dimensions (m, n, k)
     const auto gemm_m = etl::columns(c);
     const auto gemm_n = etl::rows(c);
     const auto gemm_k = etl::rows(b);

     // Leading dimensions of the operands; the output uses the same one as b
     const auto ld_b = etl::columns(b);
     const auto ld_a = etl::columns(a);

     cublas_check(cublasDgemm(cublas_handle.get(), CUBLAS_OP_N, CUBLAS_OP_T, gemm_m, gemm_n, gemm_k, &one, b.gpu_memory(), ld_b, a.gpu_memory(), ld_a,
                              &zero, c.gpu_memory(), ld_b));

     // The result now lives on the GPU only
     c.validate_gpu();
     c.invalidate_cpu();
 }

 #else

 /*!
  * \brief Placeholder for the batch outer product when ETL_CUBLAS_MODE is not defined.
  *
  * The arguments are ignored; the body only invokes cpp_unreachable.
  *
  * \param a The left-hand side expression (unused)
  * \param b The right-hand side expression (unused)
  * \param c The output expression (unused)
  */
 template <typename A, typename B, typename C>
 void batch_outer([[maybe_unused]] const A& a, [[maybe_unused]] const B& b, [[maybe_unused]] C&& c) {
     cpp_unreachable("CUBLAS not enabled/available");
 }

 #endif

 } //end of namespace etl::impl::cublas
etl::ensure_gpu_allocated
void ensure_gpu_allocated() const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: sub_view.hpp:717

etl::impl::cublas
Definition: axpy.hpp:22

etl::batch_outer
batch_outer_product_expr< detail::build_type< A >, detail::build_type< B > > batch_outer(A &&a, B &&b)
Batch Outer product multiplication of two matrices.
Definition: batch_outer_product_expr.hpp:333

etl
Root namespace for the ETL library.
Definition: adapter.hpp:15

cublas.hpp
Utility functions for cublas.

etl::columns
size_t columns(const E &expr)
Returns the number of columns of the given ETL expression.
Definition: helpers.hpp:78

etl::invalidate_cpu
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: sub_view.hpp:688

etl::ensure_gpu_up_to_date
void ensure_gpu_up_to_date() const
Ensures that the GPU memory is up to date, copying the expression memory to the GPU if necessary.
Definition: dyn_matrix_view.hpp:280

etl::validate_gpu
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: sub_view.hpp:709

etl::rows
size_t rows(const E &expr)
Returns the number of rows of the given ETL expression.
Definition: helpers.hpp:58

etl::gpu_memory
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674