Expression Templates Library (ETL)
sum.hpp
Go to the documentation of this file.
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
18 #pragma once
19 
20 #ifdef ETL_CUBLAS_MODE
21 
22 #include "etl/impl/cublas/cuda.hpp"
24 
25 #ifdef ETL_EGBLAS_MODE
26 #include "egblas.hpp"
27 #endif
28 
29 #endif
30 
31 namespace etl::impl::cublas {
32 
33 #ifdef ETL_CUBLAS_MODE
34 
39 template <gpu_computable_single_precision A>
40 float sum(const A& a) {
41  decltype(auto) t1 = smart_forward_gpu(a);
42 
44 
45 #ifdef EGBLAS_HAS_SSUM
46  return egblas_ssum(t1.gpu_memory(), etl::size(a), 1);
47 #else
48  decltype(auto) handle = start_cublas();
49 
50  auto ones = etl::impl::cuda::cuda_allocate_only<float>(etl::size(a));
51 
52 #pragma GCC diagnostic push
53 #pragma GCC diagnostic ignored "-Wstrict-aliasing"
54 
55  // Set a vector of float one... :(
56  float alpha = 1.0;
57  int32_t alpha_bits = *reinterpret_cast<int32_t*>(&alpha);
58  cuMemsetD32(CUdeviceptr(ones.get()), alpha_bits, etl::size(a));
59 
60 #pragma GCC diagnostic pop
61 
62  float prod = 0.0;
63  cublas_check(cublasSdot(handle.get(), etl::size(a), t1.gpu_memory(), 1, ones.get(), 1, &prod));
64  return prod;
65 #endif
66 }
67 
71 template <gpu_computable_double_precision A>
72 double sum(const A& a) {
73  decltype(auto) t1 = smart_forward_gpu(a);
74 
75  t1.ensure_gpu_up_to_date();
76 
77 #ifdef EGBLAS_HAS_DSUM
78  return egblas_dsum(t1.gpu_memory(), etl::size(a), 1);
79 #else
80  decltype(auto) handle = start_cublas();
81 
82  etl::dyn_vector<value_t<A>> ones(etl::size(a), 1.0);
83 
84  ones.ensure_gpu_up_to_date();
85 
86  double prod = 0.0;
87  cublas_check(cublasDdot(handle.get(), etl::size(a), t1.gpu_memory(), 1, ones.gpu_memory(), 1, &prod));
88  return prod;
89 #endif
90 }
91 
96 template <gpu_computable_single_precision A>
97 float asum(const A& a) {
98  decltype(auto) handle = start_cublas();
99 
100  decltype(auto) t1 = smart_forward_gpu(a);
101 
102  t1.ensure_gpu_up_to_date();
103 
104  float prod = 0.0;
105  cublas_check(cublasSasum(handle.get(), etl::size(a), t1.gpu_memory(), 1, &prod));
106  return prod;
107 }
108 
112 template <gpu_computable_double_precision A>
113 double asum(const A& a) {
114  decltype(auto) handle = start_cublas();
115 
116  decltype(auto) t1 = smart_forward_gpu(a);
117 
118  t1.ensure_gpu_up_to_date();
119 
120  double prod = 0.0;
121  cublas_check(cublasDasum(handle.get(), etl::size(a), t1.gpu_memory(), 1, &prod));
122  return prod;
123 }
124 
125 #else
126 
130 template <typename A>
131 value_t<A> sum(const A& /*a*/) {
132  cpp_unreachable("CUBLAS not enabled/available");
133  return 0.0;
134 }
135 
139 template <typename A>
140 value_t<A> asum(const A& /*a*/) {
141  cpp_unreachable("CUBLAS not enabled/available");
142  return 0.0;
143 }
144 
145 #endif
146 
147 } //end of namespace etl::impl::cublas
dyn_matrix_impl< T, order::RowMajor, 1 > dyn_vector
A dynamic vector, in row-major order.
Definition: value_fwd.hpp:196
Definition: axpy.hpp:22
Root namespace for the ETL library.
Definition: adapter.hpp:15
Utility functions for cublas.
value_t< E > asum(E &&values)
Returns the sum of all the absolute values contained in the given expression.
Definition: expression_builder.hpp:637
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
void ensure_gpu_up_to_date() const
Allocate memory on the GPU for the expression and copy the values from the expression memory to the GPU if necessary.
Definition: dyn_matrix_view.hpp:280
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674