#ifndef EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H
#define EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H

namespace Eigen {
namespace internal {

// Default strategy: evaluate the expression on a single CPU thread, one coefficient at a time.
template <typename Expression, typename Device, bool Vectorizable>
class TensorExecutor {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const Device& device = Device()) {
    TensorEvaluator<Expression, Device> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      for (Index i = 0; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
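// Illustrative sketch (not part of this header; the tensor and device names are
// assumptions for illustration): a device-bound assignment is what ultimately
// reaches TensorExecutor::run, e.g.
//
//   Eigen::DefaultDevice dev;
//   Eigen::Tensor<float, 1> a(100), b(100), t(100);
//   a.setRandom(); b.setRandom();
//   t.device(dev) = a + b;  // ends up in TensorExecutor<..., DefaultDevice, ...>::run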
// Vectorized evaluation on the default device: process four packets per iteration
// to encourage (but not force) loop unrolling, then mop up the packet and scalar remainders.
template <typename Expression>
class TensorExecutor<Expression, DefaultDevice, true> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const DefaultDevice& device = DefaultDevice()) {
    TensorEvaluator<Expression, DefaultDevice> evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
      const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size;
      const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize;
      for (Index i = 0; i < UnrolledSize; i += 4 * PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      const Index VectorizedSize = (size / PacketSize) * PacketSize;
      for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) {
        evaluator.evalPacket(i);
      }
      for (Index i = VectorizedSize; i < size; ++i) {
        evaluator.evalScalar(i);
      }
    }
    evaluator.cleanup();
  }
};
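// Worked example (assuming size == 1000 and PacketSize == 4): UnrolledSize ==
// (1000 / 16) * 16 == 992, so [0, 992) is evaluated four packets at a time;
// VectorizedSize == (1000 / 4) * 4 == 1000, so [992, 1000) goes through the
// single-packet loop and no scalar remainder is left.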
// Multicore strategy: the index space is partitioned and each partition is
// evaluated on a single core.
#ifdef EIGEN_USE_THREADS
template <typename Evaluator, typename Index, bool Vectorizable>
struct EvalRange {
  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    for (Index i = first; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    return size;
  }
};
template <typename Evaluator, typename Index>
struct EvalRange<Evaluator, Index, true> {
  static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;

  static void run(Evaluator* evaluator_in, const Index first, const Index last) {
    Evaluator evaluator = *evaluator_in;
    eigen_assert(last >= first);
    Index i = first;
    if (last - first >= PacketSize) {
      eigen_assert(first % PacketSize == 0);
      Index last_chunk_offset = last - 4 * PacketSize;
      // Hint to the compiler that it may unroll this loop, without insisting on it:
      // if the per-coefficient work is expensive, inlining matters more than unrolling.
      for (; i <= last_chunk_offset; i += 4 * PacketSize) {
        for (Index j = 0; j < 4; j++) {
          evaluator.evalPacket(i + j * PacketSize);
        }
      }
      last_chunk_offset = last - PacketSize;
      for (; i <= last_chunk_offset; i += PacketSize) {
        evaluator.evalPacket(i);
      }
    }
    for (; i < last; ++i) {
      evaluator.evalScalar(i);
    }
  }

  static Index alignBlockSize(Index size) {
    // Align block sizes to the packet size and account for the 4x unrolling in run().
    if (size >= 16 * PacketSize) {
      return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1);
    }
    // Aligning to 4 * PacketSize would increase the block size by more than 25%.
    return (size + PacketSize - 1) & ~(PacketSize - 1);
  }
};
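// Worked example for alignBlockSize (assuming PacketSize == 8): a block of 1000
// coefficients is >= 16 * PacketSize, so it is rounded up to a multiple of
// 4 * PacketSize: (1000 + 31) & ~31 == 1024. A small block of 20 is only
// rounded up to a multiple of PacketSize: (20 + 7) & ~7 == 24.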
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static inline void run(const Expression& expr, const ThreadPoolDevice& device) {
    typedef TensorEvaluator<Expression, ThreadPoolDevice> Evaluator;
    Evaluator evaluator(expr, device);
    const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
    if (needs_assign) {
      const Index size = array_prod(evaluator.dimensions());
#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
      device.parallelFor(size, evaluator.costPerCoeff(Vectorizable),
                         EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize,
                         [&evaluator](Index first, Index last) {
                           EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last);
                         });
#else
      size_t num_threads = device.numThreads();
      if (num_threads > 1) {
        num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
            size, evaluator.costPerCoeff(Vectorizable), num_threads);
      }
      if (num_threads == 1) {
        EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size);
      } else {
        const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1;
        Index blocksz = std::ceil<Index>(static_cast<float>(size) / num_threads) + PacketSize - 1;
        const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize)));
        const Index numblocks = size / blocksize;

        Barrier barrier(numblocks);
        for (int i = 0; i < numblocks; ++i) {
          device.enqueue_with_barrier(
              &barrier, &EvalRange<Evaluator, Index, Vectorizable>::run,
              &evaluator, i * blocksize, (i + 1) * blocksize);
        }
        if (numblocks * blocksize < size) {
          EvalRange<Evaluator, Index, Vectorizable>::run(
              &evaluator, numblocks * blocksize, size);
        }
        barrier.Wait();
      }
#endif  // !defined(EIGEN_USE_SIMPLE_THREAD_POOL)
    }
    evaluator.cleanup();
  }
};
#endif  // EIGEN_USE_THREADS
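// Worked example for the EIGEN_USE_SIMPLE_THREAD_POOL branch above (assuming
// size == 1000, num_threads == 4, PacketSize == 8): blocksz == ceil(1000 / 4.f) + 7
// == 257, blocksize == 257 - (257 % 8) == 256 and numblocks == 3, so three blocks
// of 256 coefficients are enqueued on the pool and the tail range [768, 1000) is
// evaluated by the calling thread before it waits on the barrier.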
// GPU strategy: the evaluation of the expression is offloaded to a GPU.
#if defined(EIGEN_USE_GPU)

template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, GpuDevice, Vectorizable> {
 public:
  typedef typename Expression::Index Index;
  static void run(const Expression& expr, const GpuDevice& device);
};
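// Illustrative sketch (an assumption, mirroring how Eigen's CUDA tensor tests
// typically drive this path): a GpuDevice is built on top of a CUDA stream and
// then bound to the destination tensor, e.g.
//
//   Eigen::CudaStreamDevice stream;
//   Eigen::GpuDevice gpu_device(&stream);
//   t.device(gpu_device) = a + b;  // reaches TensorExecutor<..., GpuDevice, ...>::run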
#if defined(__CUDACC__)
// Per-thread evaluation over a strided range of coefficients (scalar path).
template <typename Evaluator, typename Index, bool Vectorizable>
struct EigenMetaKernelEval {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    for (Index i = first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
// Vectorized variant: each thread evaluates whole packets over its strided
// range, then the scalar tail past the last full packet.
template <typename Evaluator, typename Index>
struct EigenMetaKernelEval<Evaluator, Index, true> {
  static __device__ EIGEN_ALWAYS_INLINE
  void run(Evaluator& eval, Index first, Index last, Index step_size) {
    const Index PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size;
    const Index vectorized_size = (last / PacketSize) * PacketSize;
    const Index vectorized_step_size = step_size * PacketSize;

    // Use the vectorized path as long as full packets are available.
    for (Index i = first * PacketSize; i < vectorized_size;
         i += vectorized_step_size) {
      eval.evalPacket(i);
    }
    for (Index i = vectorized_size + first; i < last; i += step_size) {
      eval.evalScalar(i);
    }
  }
};
template <typename Evaluator, typename Index>
__global__ void
__launch_bounds__(1024)
EigenMetaKernel(Evaluator eval, Index size) {
  const Index first_index = blockIdx.x * blockDim.x + threadIdx.x;
  const Index step_size = blockDim.x * gridDim.x;

  const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned;
  EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size);
}
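// Illustrative note: EigenMetaKernel is a grid-stride kernel. With, say, 32
// blocks of 1024 threads (the actual launch parameters are chosen in run()
// below), thread t of block b starts at index b * 1024 + t and advances by
// 32 * 1024 per iteration, so a single fixed-size launch covers any tensor size.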
/*static*/
template <typename Expression, bool Vectorizable>
inline void TensorExecutor<Expression, GpuDevice, Vectorizable>::run(
    const Expression& expr, const GpuDevice& device) {
  TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
  const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
  if (needs_assign) {
    const int block_size = device.maxCudaThreadsPerBlock();
    const int max_blocks = device.getNumCudaMultiProcessors() *
                           device.maxCudaThreadsPerMultiProcessor() / block_size;
    const Index size = array_prod(evaluator.dimensions());
    // Launch at least one block so that tensors of size 0 do not crash the kernel launch.
    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, divup<int>(size, block_size)), 1);

    LAUNCH_CUDA_KERNEL(
        (EigenMetaKernel<TensorEvaluator<Expression, GpuDevice>, Index>),
        num_blocks, block_size, 0, device, evaluator, size);
  }
  evaluator.cleanup();
}
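// Worked example for the launch configuration (assumed device limits): with
// block_size == 1024, 20 multiprocessors and 2048 resident threads per
// multiprocessor, max_blocks == 20 * 2048 / 1024 == 40; for size == 100000,
// divup(100000, 1024) == 98, so num_blocks == min(40, 98) == 40, clamped to at
// least 1 for empty tensors.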
#endif  // __CUDACC__
#endif  // EIGEN_USE_GPU

#ifdef EIGEN_USE_SYCL
// SYCL strategy: forward the expression to the TensorSycl module.
template <typename Expression, bool Vectorizable>
class TensorExecutor<Expression, SyclDevice, Vectorizable> {
 public:
  static inline void run(const Expression& expr, const SyclDevice& device) {
    TensorSycl::run(expr, device);
  }
};
#endif  // EIGEN_USE_SYCL

}  // end namespace internal
}  // end namespace Eigen

#endif  // EIGEN_CXX11_TENSOR_TENSOR_EXECUTOR_H