10 #include "etl/expr/base_temporary_expr.hpp" 21 template <etl_4d A,
bool Mean>
35 || (impl::egblas::has_sbias_batch_sum4 && all_row_major<A> && all_single_precision<A>)
36 || (impl::egblas::has_dbias_batch_sum4 && all_row_major<A> && all_double_precision<A>);
52 static void check([[maybe_unused]]
const A&
a, [[maybe_unused]]
const C& c) {
53 if constexpr (all_fast<A, C>) {
54 static_assert(etl::dim<1, A>() == etl::dim<0, C>(),
"Invalid dimensions for bias_batch_mean_4d");
56 cpp_assert(etl::dim<1>(a) == etl::dim<0>(c),
"Invalid dimensions for bias_batch_mean_4d");
76 if constexpr (Mean && impl::egblas::has_sbias_batch_mean4 && all_row_major<A> && all_floating<A, L>) {
77 const auto N = etl::dim<0>(
a);
78 const auto K = etl::dim<1>(
a);
79 const auto W = etl::dim<2>(
a);
80 const auto H = etl::dim<3>(
a);
83 t1.ensure_gpu_up_to_date();
85 lhs.ensure_gpu_allocated();
87 impl::egblas::bias_batch_mean4(N, K, W, H, t1.gpu_memory(), lhs.gpu_memory());
91 }
else if constexpr (!Mean && impl::egblas::has_sbias_batch_sum4 && all_row_major<A> && all_floating<A, L>) {
92 const auto N = etl::dim<0>(
a);
93 const auto K = etl::dim<1>(
a);
94 const auto W = etl::dim<2>(
a);
95 const auto H = etl::dim<3>(
a);
98 t1.ensure_gpu_up_to_date();
100 lhs.ensure_gpu_allocated();
102 impl::egblas::bias_batch_sum4(N, K, W, H, t1.gpu_memory(), lhs.gpu_memory());
105 lhs.invalidate_cpu();
106 }
else if constexpr (!Mean &&
cudnn_enabled && all_floating<A, L>) {
112 standard_evaluator::pre_assign_rhs(
a);
114 a.ensure_cpu_up_to_date();
116 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
118 for (
size_t k = first; k < last; ++k) {
121 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
122 mean +=
sum(
a(b)(k));
125 if constexpr (Mean) {
126 lhs(k) = mean /
static_cast<T
>(N);
137 lhs.invalidate_gpu();
147 if constexpr (all_floating<A, L> && ((!Mean &&
cudnn_enabled) || (all_row_major<A, L> && impl::egblas::has_sbias_batch_sum4))) {
152 standard_evaluator::pre_assign_rhs(
a);
154 a.ensure_cpu_up_to_date();
155 a.ensure_gpu_up_to_date();
164 lhs.ensure_cpu_up_to_date();
166 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
168 for (
size_t k = first; k < last; ++k) {
171 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
172 mean +=
sum(
a(b)(k));
175 if constexpr (Mean) {
176 lhs(k) += mean /
static_cast<T
>(N);
187 lhs.invalidate_gpu();
197 if constexpr (all_floating<A, L> && ((!Mean &&
cudnn_enabled) || (all_row_major<A, L> && impl::egblas::has_sbias_batch_sum4))) {
202 standard_evaluator::pre_assign_rhs(
a);
204 a.ensure_cpu_up_to_date();
205 a.ensure_gpu_up_to_date();
214 lhs.ensure_cpu_up_to_date();
216 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
218 for (
size_t k = first; k < last; ++k) {
221 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
222 mean +=
sum(
a(b)(k));
225 if constexpr (Mean) {
226 lhs(k) -= mean /
static_cast<T
>(N);
237 lhs.invalidate_gpu();
247 if constexpr (all_floating<A, L> && ((!Mean &&
cudnn_enabled) || (all_row_major<A, L> && impl::egblas::has_sbias_batch_sum4))) {
252 standard_evaluator::pre_assign_rhs(
a);
254 a.ensure_cpu_up_to_date();
255 a.ensure_gpu_up_to_date();
264 lhs.ensure_cpu_up_to_date();
266 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
268 for (
size_t k = first; k < last; ++k) {
271 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
272 mean +=
sum(
a(b)(k));
275 if constexpr (Mean) {
276 lhs(k) *= mean /
static_cast<T
>(N);
287 lhs.invalidate_gpu();
297 if constexpr (all_floating<A, L> && ((!Mean &&
cudnn_enabled) || (all_row_major<A, L> && impl::egblas::has_sbias_batch_sum4))) {
302 standard_evaluator::pre_assign_rhs(
a);
304 a.ensure_cpu_up_to_date();
305 a.ensure_gpu_up_to_date();
314 lhs.ensure_cpu_up_to_date();
316 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
318 for (
size_t k = first; k < last; ++k) {
321 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
322 mean +=
sum(
a(b)(k));
325 if constexpr (Mean) {
326 lhs(k) /= mean /
static_cast<T
>(N);
337 lhs.invalidate_gpu();
347 if constexpr (all_floating<A, L> && ((!Mean &&
cudnn_enabled) || (all_row_major<A, L> && impl::egblas::has_sbias_batch_sum4))) {
352 standard_evaluator::pre_assign_rhs(
a);
354 a.ensure_cpu_up_to_date();
355 a.ensure_gpu_up_to_date();
364 lhs.ensure_cpu_up_to_date();
366 auto batch_fun_k = [&](
const size_t first,
const size_t last) {
368 for (
size_t k = first; k < last; ++k) {
371 for (
size_t b = 0; b < etl::dim<0>(
a); ++b) {
372 mean +=
sum(
a(b)(k));
375 if constexpr (Mean) {
376 lhs(k) %= mean /
static_cast<T
>(N);
387 lhs.invalidate_gpu();
399 return os <<
"bias_batch_mean_4d(" << expr.
_a <<
")";
401 return os <<
"bias_batch_sum_4d(" << expr.
_a <<
")";
410 template <
typename A,
bool Mean>
417 static constexpr
bool is_etl =
true;
422 static constexpr
bool is_linear =
false;
424 static constexpr
bool is_value =
false;
425 static constexpr
bool is_direct =
true;
426 static constexpr
bool is_generator =
false;
427 static constexpr
bool is_padded =
false;
428 static constexpr
bool is_aligned =
true;
429 static constexpr
bool is_temporary =
true;
430 static constexpr
bool gpu_computable = is_gpu_t<value_type> &&
cuda_enabled;
431 static constexpr
order storage_order = sub_traits::storage_order;
438 template <vector_mode_t V>
439 static constexpr
bool vectorizable =
true;
446 static constexpr
size_t dim() {
447 static_assert(DD == 0,
"Invalid dimensions access");
457 static size_t dim(
const expr_t& e, [[maybe_unused]]
size_t d) {
458 cpp_assert(d == 0,
"Invalid dimensions access");
460 return etl::dim<1>(e.
_a);
469 return etl::dim<1>(e.
_a);
476 static constexpr
size_t size() {
value_t< E > mean(E &&values)
Returns the mean of all the values contained in the given expression.
Definition: expression_builder.hpp:650
void assign_mod_to(L &&lhs) const
Modulo the given left-hand-side expression.
Definition: bias_batch_mean_4d_expr.hpp:346
bias_batch_mean_4d_expr< detail::build_type< E >, false > bias_batch_sum_4d(const E &value)
Returns the bias batch sum of the given 4D expression.
Definition: bias_batch_mean_4d_expr.hpp:513
value_t< A > value_type
The value type of the expression.
Definition: bias_batch_mean_4d_expr.hpp:415
void assign_to(L &&lhs) const
Assign to a matrix of the same storage order.
Definition: bias_batch_mean_4d_expr.hpp:67
std::add_lvalue_reference_t< A > a()
Returns the sub expression.
Definition: base_temporary_expr.hpp:489
static constexpr auto storage_order
The sub storage order.
Definition: bias_batch_mean_4d_expr.hpp:28
value_t< A > value_type
The type of value of the expression.
Definition: bias_batch_mean_4d_expr.hpp:23
void engine_dispatch_1d_serial(Functor &&functor, size_t first, size_t last, size_t threshold, [[maybe_unused]] size_t n_threads=etl::threads)
Dispatch the elements of a range to a functor in a parallel manner, using the global thread engine...
Definition: parallel_support.hpp:734
static constexpr size_t size()
Returns the size of the expression.
Definition: bias_batch_mean_4d_expr.hpp:476
constexpr bool is_magic_view
Traits indicating if the given ETL type is a magic view expression.
Definition: traits.hpp:311
static constexpr size_t dimensions()
Returns the number of dimensions of the expression.
Definition: bias_batch_mean_4d_expr.hpp:484
order
Storage order of a matrix.
Definition: order.hpp:15
static void check([[maybe_unused]] const A &a, [[maybe_unused]] const C &c)
Validate the dimensions of the bias_batch_mean_4d operation.
Definition: bias_batch_mean_4d_expr.hpp:52
constexpr bool cuda_enabled
Indicates if CUDA is available.
Definition: config.hpp:94
EGBLAS wrappers for the bias_batch_sum operation.
std::decay_t< A > sub_expr_t
The sub expression type.
Definition: bias_batch_mean_4d_expr.hpp:413
static constexpr bool gpu_computable
Indicates if the temporary expression can be directly evaluated using only GPU.
Definition: bias_batch_mean_4d_expr.hpp:34
A 4D bias batch mean (or sum) expression.
Definition: bias_batch_mean_4d_expr.hpp:22
A _a
The sub expression reference.
Definition: base_temporary_expr.hpp:447
static size_t size(const expr_t &e)
Returns the size of the expression.
Definition: bias_batch_mean_4d_expr.hpp:468
constexpr bool is_fast
Traits to test if the given ETL expression type is fast (sizes known at compile-time) ...
Definition: traits.hpp:588
void assign_add_to(L &&lhs) const
Add to the given left-hand-side expression.
Definition: bias_batch_mean_4d_expr.hpp:146
static size_t dim(const expr_t &e, [[maybe_unused]] size_t d)
Returns the dth dimension of the expression.
Definition: bias_batch_mean_4d_expr.hpp:457
Traits to get information about ETL types.
Definition: tmp.hpp:68
Root namespace for the ETL library.
Definition: adapter.hpp:15
friend std::ostream & operator<<(std::ostream &os, const bias_batch_mean_4d_expr &expr)
Print a representation of the expression on the given stream.
Definition: bias_batch_mean_4d_expr.hpp:397
bias_batch_mean implementations with NVidia cuDNN library
constexpr bool cudnn_enabled
Indicates if the NVIDIA CUDNN library is available for ETL.
Definition: config.hpp:114
void assign_mul_to(L &&lhs) const
Multiply the given left-hand-side expression.
Definition: bias_batch_mean_4d_expr.hpp:246
static constexpr size_t dim()
Returns the DDth dimension of the expression.
Definition: bias_batch_mean_4d_expr.hpp:446
void std_mod_evaluate(Expr &&expr, Result &&result)
Compound modulo evaluation of the expr into result.
Definition: evaluator.hpp:1271
void std_mul_evaluate(Expr &&expr, Result &&result)
Compound multiply evaluation of the expr into result.
Definition: evaluator.hpp:1233
bias_batch_mean_4d_expr(A a)
Construct a new expression.
Definition: bias_batch_mean_4d_expr.hpp:42
constexpr bool is_transformer
Traits indicating if the given ETL type is a transformer expression.
Definition: traits.hpp:297
value_t< E > sum(E &&values)
Returns the sum of all the values contained in the given expression.
Definition: expression_builder.hpp:624
decltype(auto) smart_forward_gpu(E &expr)
Smart forwarding for a temporary expression that will be computed in GPU.
Definition: helpers.hpp:343
constexpr size_t size(const E &expr) noexcept
Returns the size of the given ETL expression.
Definition: helpers.hpp:108
void assign_div_to(L &&lhs) const
Divide the given left-hand-side expression.
Definition: bias_batch_mean_4d_expr.hpp:296
constexpr bool is_view
Traits indicating if the given ETL type is a view expression.
Definition: traits.hpp:304
static constexpr bool is_fast
Indicates if T is a fast structure.
Definition: traits_base.hpp:25
void std_sub_evaluate(Expr &&expr, Result &&result)
Compound subtract evaluation of the expr into result.
Definition: evaluator.hpp:1214
Abstract base class for temporary unary expression.
Definition: base_temporary_expr.hpp:443
void assign_sub_to(L &&lhs) const
Sub from the given left-hand-side expression.
Definition: bias_batch_mean_4d_expr.hpp:196
constexpr bool is_thread_safe
Traits to test if the given ETL expression type is thread safe.
Definition: traits.hpp:687
typename decay_traits< E >::value_type value_t
Traits to extract the value type out of an ETL type.
Definition: tmp.hpp:81
void std_div_evaluate(Expr &&expr, Result &&result)
Compound divide evaluation of the expr into result.
Definition: evaluator.hpp:1252
bias_batch_mean_4d_expr< detail::build_type< E >, true > bias_batch_mean_4d(const E &value)
Returns the bias batch mean of the given 4D expression.
Definition: bias_batch_mean_4d_expr.hpp:503
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
void std_add_evaluate(Expr &&expr, Result &&result)
Compound add evaluation of the expr into result.
Definition: evaluator.hpp:1195
static constexpr int complexity() noexcept
Estimate the complexity of computation.
Definition: bias_batch_mean_4d_expr.hpp:492