// NOTE(review): this listing is an extraction of the original header: the
// leading numbers are the original line numbers, several preprocessor lines
// are collapsed onto one line, and some original lines are not shown.
20 #ifdef ETL_CUBLAS_MODE 22 #include "etl/impl/cublas/cuda.hpp" 25 #ifdef ETL_EGBLAS_MODE 33 #ifdef ETL_CUBLAS_MODE 39 template <gpu_computable_single_precision A>
/*!
 * \brief Single-precision GPU sum of the expression a.
 *
 * Visible paths: with EGBLAS_HAS_SSUM the result comes from egblas_ssum;
 * otherwise a cublasSdot between the input and a constant-filled buffer is
 * issued.
 *
 * \param a The expression to reduce.
 * \return a float; the return statement of the cuBLAS path is on a line not
 *         shown here (presumably prod -- TODO confirm).
 */
40 float sum(
const A& a) {
// t1 is declared on a line not shown here (presumably the GPU-forwarded
// form of a -- TODO confirm). The trailing 1 is presumably the stride.
45 #ifdef EGBLAS_HAS_SSUM 46 return egblas_ssum(t1.gpu_memory(),
etl::size(a), 1);
// cuBLAS path: obtain the handle used by the cublasSdot call below.
48 decltype(
auto) handle = start_cublas();
// Buffer of etl::size(a) floats; its contents are set by cuMemsetD32 below.
50 auto ones =
etl::impl::cuda::cuda_allocate_only<
float>(
etl::size(a));
// The bits of alpha (declared on a line not shown; presumably 1.0f -- TODO
// confirm) are read as an int32_t so cuMemsetD32 can write that pattern into
// each of the etl::size(a) 32-bit slots of ones.
// NOTE(review): the reinterpret_cast type-pun is why -Wstrict-aliasing is
// silenced here; copying the bits with std::memcpy would avoid the pun.
52 #pragma GCC diagnostic push 53 #pragma GCC diagnostic ignored "-Wstrict-aliasing" 57 int32_t alpha_bits = *
reinterpret_cast<int32_t*
>(&alpha);
58 cuMemsetD32(CUdeviceptr(ones.get()), alpha_bits, etl::size(a));
// Dot product of the input with ones (both with increment 1), written to
// prod (declared on a line not shown); cublas_check wraps the cuBLAS call.
60 #pragma GCC diagnostic pop 63 cublas_check(cublasSdot(handle.get(),
etl::size(a), t1.gpu_memory(), 1, ones.get(), 1, &prod));
/*!
 * \brief Double-precision GPU sum of the expression a.
 *
 * Visible paths: with EGBLAS_HAS_DSUM the result comes from egblas_dsum;
 * otherwise a cublasDdot between the input and the ones expression is issued.
 *
 * \param a The expression to reduce.
 * \return a double; the return statement of the cuBLAS path is on a line not
 *         shown here (presumably prod -- TODO confirm).
 */
71 template <gpu_computable_
double_precision A>
72 double sum(
const A& a) {
// t1 is declared on a line not shown here (presumably the GPU-forwarded
// form of a -- TODO confirm); its GPU memory is read by both paths below.
75 t1.ensure_gpu_up_to_date();
// The trailing 1 is presumably the stride -- TODO confirm.
77 #ifdef EGBLAS_HAS_DSUM 78 return egblas_dsum(t1.gpu_memory(),
etl::size(a), 1);
// cuBLAS path: obtain the handle used by the cublasDdot call below.
80 decltype(
auto) handle = start_cublas();
// ones is declared on a line not shown here (presumably a vector of 1.0 of
// the same size as a -- TODO confirm); its GPU memory is used just below.
84 ones.ensure_gpu_up_to_date();
// Dot product of the input with ones (both with increment 1), written to
// prod (declared on a line not shown); cublas_check wraps the cuBLAS call.
87 cublas_check(cublasDdot(handle.get(),
etl::size(a), t1.
gpu_memory(), 1, ones.gpu_memory(), 1, &prod));
/*!
 * \brief Single-precision GPU sum of absolute values of the expression a,
 *        computed with cublasSasum.
 *
 * \param a The expression to reduce.
 * \return a float; the return statement is on a line not shown here
 *         (presumably prod -- TODO confirm).
 */
96 template <gpu_computable_single_precision A>
97 float asum(
const A& a) {
// Handle used by the cublasSasum call below.
98 decltype(
auto) handle = start_cublas();
// t1 is declared on a line not shown here (presumably the GPU-forwarded
// form of a -- TODO confirm); its GPU memory is read by the call below.
102 t1.ensure_gpu_up_to_date();
// etl::size(a) elements with increment 1; the result is written to prod
// (declared on a line not shown). cublas_check wraps the cuBLAS call.
105 cublas_check(cublasSasum(handle.get(),
etl::size(a), t1.
gpu_memory(), 1, &prod));
/*!
 * \brief Double-precision GPU sum of absolute values of the expression a,
 *        computed with cublasDasum.
 *
 * \param a The expression to reduce.
 * \return a double; the return statement is on a line not shown here
 *         (presumably prod -- TODO confirm).
 */
112 template <gpu_computable_double_precision A>
113 double asum(const A& a) {
// Handle used by the cublasDasum call below.
114 decltype(
auto) handle = start_cublas();
// t1 is declared on a line not shown here (presumably the GPU-forwarded
// form of a -- TODO confirm); its GPU memory is read by the call below.
118 t1.ensure_gpu_up_to_date();
// etl::size(a) elements with increment 1; the result is written to prod
// (declared on a line not shown). cublas_check wraps the cuBLAS call.
121 cublas_check(cublasDasum(handle.get(),
etl::size(a), t1.gpu_memory(), 1, &prod));
// Stub whose body only reports "CUBLAS not enabled/available" through
// cpp_unreachable. Its signature (original line 131) is not shown here;
// presumably the sum fallback used when ETL_CUBLAS_MODE is not defined --
// TODO confirm.
130 template <
typename A>
132 cpp_unreachable(
"CUBLAS not enabled/available");
// Stub whose body only reports "CUBLAS not enabled/available" through
// cpp_unreachable. Its signature (original line 140) is not shown here;
// presumably the asum fallback used when ETL_CUBLAS_MODE is not defined --
// TODO confirm.
139 template <
typename A>
141 cpp_unreachable(
"CUBLAS not enabled/available");
dyn_matrix_impl< T, order::RowMajor, 1 > dyn_vector
A dynamic vector, in row-major order.
Definition: value_fwd.hpp:196
Root namespace for the ETL library.
Definition: adapter.hpp:15
Utility functions for cublas.
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
void ensure_gpu_up_to_date() const
Copy from the expression memory to the GPU if necessary.
Definition: dyn_matrix_view.hpp:280
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674