10 #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 11 #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H 30 template <
typename ArgType>
31 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int MulCost() {
35 template <
typename ArgType>
36 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int AddCost() {
39 template <
typename ArgType>
40 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int DivCost() {
44 template <
typename ArgType>
45 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int ModCost() {
48 template <
typename SrcType,
typename TargetType>
49 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int CastCost() {
// Default constructor: initializes bytes loaded, bytes stored and compute
// cycles to zero.
55 TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {}
// Constructs a cost from explicit component values. Each argument is copied
// into the matching member as-is; this overload performs no scaling and no
// validation.
57 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles)
58 : bytes_loaded_(bytes_loaded),
59 bytes_stored_(bytes_stored),
60 compute_cycles_(compute_cycles) {}
// Constructs a cost that accounts for vectorization: when `vectorized` is
// true the stored compute cycles are compute_cycles / packet_size.
// NOTE(review): the non-vectorized branch of the conditional and the opening
// brace of the body are not visible in this excerpt — confirm against the
// full header.
63 TensorOpCost(
double bytes_loaded,
double bytes_stored,
double compute_cycles,
64 bool vectorized,
double packet_size)
65 : bytes_loaded_(bytes_loaded),
66 bytes_stored_(bytes_stored),
67 compute_cycles_(vectorized ? compute_cycles / packet_size
// Sanity checks on the raw arguments: each must be non-negative and finite.
69 eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded));
70 eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored));
71 eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles));
74 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_loaded()
const {
77 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double bytes_stored()
const {
// Accessor: returns the stored compute-cycle component of this cost.
80 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double compute_cycles()
const {
81 return compute_cycles_;
// Collapses the three cost components into a single scalar.
//   load_cost    - weight applied to bytes_loaded_
//   store_cost   - weight applied to bytes_stored_
//   compute_cost - weight applied to compute_cycles_
// Returns the weighted sum of the three components.
83 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double total_cost(
84 double load_cost,
double store_cost,
double compute_cost)
const {
85 return load_cost * bytes_loaded_ + store_cost * bytes_stored_ +
86 compute_cost * compute_cycles_;
91 EIGEN_DEVICE_FUNC
void dropMemoryCost() {
// Component-wise minimum: returns a new TensorOpCost in which each of the
// three components is the smaller of this cost's value and rhs's value.
// Neither operand is modified.
97 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
98 const TensorOpCost& rhs)
const {
99 double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
100 double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
101 double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
// Built with the three-argument constructor, so no vectorization scaling
// is applied to the result.
102 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
// Component-wise maximum: returns a new TensorOpCost in which each of the
// three components is the larger of this cost's value and rhs's value.
// Neither operand is modified.
106 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
107 const TensorOpCost& rhs)
const {
108 double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
109 double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
110 double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
// Built with the three-argument constructor, so no vectorization scaling
// is applied to the result.
111 return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
// In-place accumulation: adds each component of rhs to the matching
// component of this cost.
// NOTE(review): the return statement is not visible in this excerpt; the
// declared return type is TensorOpCost&.
114 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
115 const TensorOpCost& rhs) {
116 bytes_loaded_ += rhs.bytes_loaded();
117 bytes_stored_ += rhs.bytes_stored();
118 compute_cycles_ += rhs.compute_cycles();
// In-place scaling: multiplies all three components by the scalar rhs.
// NOTE(review): the return statement is not visible in this excerpt; the
// declared return type is TensorOpCost&.
122 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator*=(
double rhs) {
123 bytes_loaded_ *= rhs;
124 bytes_stored_ *= rhs;
125 compute_cycles_ *= rhs;
129 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator+(
130 TensorOpCost lhs,
const TensorOpCost& rhs) {
134 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
135 TensorOpCost lhs,
double rhs) {
139 EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
friend TensorOpCost operator*(
140 double lhs, TensorOpCost rhs) {
// Stream insertion for TensorOpCost. Writes the three components in the form
//   [bytes_loaded = <v>, bytes_stored = <v>, compute_cycles = <v>]
// using the public accessors, and returns the stream to allow chaining.
145 friend std::ostream& operator<<(std::ostream& os,
const TensorOpCost& tc) {
146 return os <<
"[bytes_loaded = " << tc.bytes_loaded()
147 <<
", bytes_stored = " << tc.bytes_stored()
148 <<
", compute_cycles = " << tc.compute_cycles() <<
"]";
// The three cost components tracked by TensorOpCost. All are doubles and are
// combined into one scalar by total_cost().
// Bytes-loaded component.
152 double bytes_loaded_;
// Bytes-stored component.
153 double bytes_stored_;
// Compute-cycle component.
154 double compute_cycles_;
160 template <
typename Device>
// Weight passed as the compute_cost argument of TensorOpCost::total_cost()
// inside totalCost().
164 static const int kDeviceCyclesPerComputeCycle = 1;
// Tuning constants for numThreads(), which computes
//   (cost - kStartupCycles) / kPerThreadCycles + 0.9
// kStartupCycles is subtracted from the total cost before it is divided by
// kPerThreadCycles.
167 static const int kStartupCycles = 100000;
168 static const int kPerThreadCycles = 100000;
// Divisor used by taskSize(): total cost / kTaskSize.
169 static const int kTaskSize = 40000;
174 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
int numThreads(
175 double output_size,
const TensorOpCost& cost_per_coeff,
int max_threads) {
176 double cost = totalCost(output_size, cost_per_coeff);
177 int threads = (cost - kStartupCycles) / kPerThreadCycles + 0.9;
178 return numext::mini(max_threads, numext::maxi(1, threads));
// Returns totalCost(output_size, cost_per_coeff) divided by kTaskSize, i.e.
// the total cost expressed in units of kTaskSize. Both arguments are
// forwarded unchanged to totalCost().
184 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double taskSize(
185 double output_size,
const TensorOpCost& cost_per_coeff) {
186 return totalCost(output_size, cost_per_coeff) / kTaskSize;
// Computes a scalar cost from a per-coefficient TensorOpCost.
// NOTE(review): the statement that combines output_size with the weighted
// per-coefficient cost is not visible in this excerpt — confirm how
// output_size enters the result against the full header.
190 static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
double totalCost(
191 double output_size,
const TensorOpCost& cost_per_coeff) {
// Per-byte weights for loads and stores: 11/64 (= 0.171875) each.
201 const double kLoadCycles = 1.0 / 64 * 11;
202 const double kStoreCycles = 1.0 / 64 * 11;
// Weighted per-coefficient cost; compute cycles are weighted by
// kDeviceCyclesPerComputeCycle.
205 cost_per_coeff.total_cost(kLoadCycles, kStoreCycles,
206 kDeviceCyclesPerComputeCycle);
212 #endif // EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H Definition: TensorCostModel.h:25
Namespace containing all symbols from the Eigen library.
Definition: bench_norm.cpp:85
Definition: UnaryFunctors.h:152
Definition: BinaryFunctors.h:76
Definition: BinaryFunctors.h:358
Definition: XprHelper.h:146
Definition: TensorCostModel.h:161