wichtounet/etl/cublas_2sum_8hpp_source.html

 //=======================================================================
 // Copyright (c) 2014-2023 Baptiste Wicht
 // Distributed under the terms of the MIT License.
 // (See accompanying file LICENSE or copy at
 //  http://opensource.org/licenses/MIT)
 //=======================================================================

 #pragma once

 #ifdef ETL_CUBLAS_MODE

 #include "etl/impl/cublas/cuda.hpp"
 #include "etl/impl/cublas/cublas.hpp"

 #ifdef ETL_EGBLAS_MODE
 #include "egblas.hpp"
 #endif

 #endif

 namespace etl::impl::cublas {

 #ifdef ETL_CUBLAS_MODE

 /*!
  * \brief Compute the sum of all the values of a single-precision expression on the GPU
  *
  * When EGBLAS provides a single-precision sum (EGBLAS_HAS_SSUM), it is used
  * directly. Otherwise, the sum is obtained with CUBLAS as the dot product of
  * the expression with a vector of ones.
  *
  * \param a The expression to sum
  * \return The sum of all the values of a
  */
 template <gpu_computable_single_precision A>
 float sum(const A& a) {
     decltype(auto) t1 = smart_forward_gpu(a);

     // The GPU pointer is handed to the GPU routines below
     t1.ensure_gpu_up_to_date();

 #ifdef EGBLAS_HAS_SSUM
     return egblas_ssum(t1.gpu_memory(), etl::size(a), 1);
 #else
     decltype(auto) handle = start_cublas();

     auto ones = etl::impl::cuda::cuda_allocate_only<float>(etl::size(a));

     // cuMemsetD32 fills the buffer with a raw 32-bit pattern, not with a
     // float value. 0x3F800000 is the IEEE-754 binary32 encoding of 1.0f.
     // Using the constant avoids type-punning a host float through
     // reinterpret_cast, which violates strict aliasing.
     // NOTE(review): the CUresult returned by cuMemsetD32 is not checked.
     constexpr unsigned int one_bits = 0x3F800000u;
     cuMemsetD32(CUdeviceptr(ones.get()), one_bits, etl::size(a));

     // dot(a, ones) == sum(a)
     float prod = 0.0;
     cublas_check(cublasSdot(handle.get(), etl::size(a), t1.gpu_memory(), 1, ones.get(), 1, &prod));
     return prod;
 #endif
 }

 /*!
  * \brief Compute the sum of all the values of a double-precision expression on the GPU
  *
  * When EGBLAS provides a double-precision sum (EGBLAS_HAS_DSUM), it is used
  * directly. Otherwise, the sum is obtained with CUBLAS as the dot product of
  * the expression with a vector named ones.
  *
  * \param a The expression to sum
  * \return The sum of all the values of a
  */
 template <gpu_computable_double_precision A>
 double sum(const A& a) {
     decltype(auto) t1 = smart_forward_gpu(a);

     // The GPU pointer is handed to the GPU routines below
     t1.ensure_gpu_up_to_date();

 #ifdef EGBLAS_HAS_DSUM
     return egblas_dsum(t1.gpu_memory(), etl::size(a), 1);
 #else
     decltype(auto) handle = start_cublas();

     // Vector with as many elements as a, constructed with the value 1.0
     // (presumably every element is set to 1.0 -- confirm against dyn_vector)
     etl::dyn_vector<value_t<A>> ones(etl::size(a), 1.0);

     // Its GPU pointer is passed to cublasDdot below
     ones.ensure_gpu_up_to_date();

     // dot(a, ones) == sum(a)
     double prod = 0.0;
     cublas_check(cublasDdot(handle.get(), etl::size(a), t1.gpu_memory(), 1, ones.gpu_memory(), 1, &prod));
     return prod;
 #endif
 }

 /*!
  * \brief Compute the sum of the absolute values of a single-precision expression with CUBLAS
  * \param a The expression to reduce
  * \return The value computed by cublasSasum over the etl::size(a) elements of a
  */
 template <gpu_computable_single_precision A>
 float asum(const A& a) {
     decltype(auto) handle = start_cublas();

     decltype(auto) t1 = smart_forward_gpu(a);

     // The GPU pointer is handed to cublasSasum below
     t1.ensure_gpu_up_to_date();

     // Contiguous elements (stride 1), result written to the host variable prod
     float prod = 0.0;
     cublas_check(cublasSasum(handle.get(), etl::size(a), t1.gpu_memory(), 1, &prod));
     return prod;
 }

 /*!
  * \brief Compute the sum of the absolute values of a double-precision expression with CUBLAS
  * \param a The expression to reduce
  * \return The value computed by cublasDasum over the etl::size(a) elements of a
  */
 template <gpu_computable_double_precision A>
 double asum(const A& a) {
     decltype(auto) handle = start_cublas();

     decltype(auto) t1 = smart_forward_gpu(a);

     // The GPU pointer is handed to cublasDasum below
     t1.ensure_gpu_up_to_date();

     // Contiguous elements (stride 1), result written to the host variable prod
     double prod = 0.0;
     cublas_check(cublasDasum(handle.get(), etl::size(a), t1.gpu_memory(), 1, &prod));
     return prod;
 }

 #else

 /*!
  * \brief Placeholder for sum when ETL_CUBLAS_MODE is not defined
  *
  * This overload must never be called: it only flags the call as unreachable.
  *
  * \param a The expression (unused)
  * \return 0.0, only present to give the function a return statement
  */
 template <typename A>
 value_t<A> sum([[maybe_unused]] const A& a) {
     cpp_unreachable("CUBLAS not enabled/available");
     return 0.0;
 }

 /*!
  * \brief Placeholder for asum when ETL_CUBLAS_MODE is not defined
  *
  * This overload must never be called: it only flags the call as unreachable.
  *
  * \param a The expression (unused)
  * \return 0.0, only present to give the function a return statement
  */
 template <typename A>
 value_t<A> asum([[maybe_unused]] const A& a) {
     cpp_unreachable("CUBLAS not enabled/available");
     return 0.0;
 }

 #endif

 } //end of namespace etl::impl::cublas
etl::dyn_vector
dyn_matrix_impl< T, order::RowMajor, 1 > dyn_vector
A dynamic vector, in row-major order.
Definition: value_fwd.hpp:196

etl::impl::cublas
Definition: axpy.hpp:22

etl
Root namespace for the ETL library.
Definition: adapter.hpp:15

cublas.hpp
Utility functions for cublas.

etl::asum
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637

etl::sum
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624

etl::smart_forward_gpu
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343

etl::size
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108

etl::ensure_gpu_up_to_date
void ensure_gpu_up_to_date() const
Ensure that the GPU memory of the expression is up to date, copying the values to the GPU if necessary.
Definition: dyn_matrix_view.hpp:280

etl::value_t
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81

etl::gpu_memory
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674