//! Expression Templates Library (ETL)
//! gpu_handler.hpp
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
8 #pragma once
9 
10 #ifdef ETL_CUDA
11 #include "etl/impl/cublas/cuda.hpp"
12 #endif
13 
14 namespace etl {
15 
16 #ifdef ETL_CUDA
17 
23 struct gpu_memory_allocator {
24 private:
30  template <typename T>
31  static T* base_allocate(size_t size) {
32  T* memory = nullptr;
33 
34  auto cuda_status = cudaMalloc(&memory, size * sizeof(T));
35 
36  if (cuda_status != cudaSuccess) {
37  std::cout << "cuda: Failed to allocate GPU memory: " << cudaGetErrorString(cuda_status) << std::endl;
38  std::cout << " Tried to allocate " << size * sizeof(T) << "B" << std::endl;
39  exit(EXIT_FAILURE);
40  }
41 
42  inc_counter("gpu:allocate");
43 
44  return memory;
45  }
46 
52  static void base_release(const void* gpu_memory) {
53  //Note: the const_cast is only here to allow compilation
54  cuda_check(cudaFree((const_cast<void*>(gpu_memory))));
55 
56  inc_counter("gpu:release");
57  }
58 
59 #ifndef ETL_GPU_POOL
60 public:
66  template <typename T>
67  static T* allocate(size_t size) {
68  return base_allocate<T>(size);
69  }
70 
76  static void release(const void* gpu_memory, [[maybe_unused]] size_t size) {
77  base_release(gpu_memory);
78  }
79 
86  static void clear() {
87  // This allocator does not store memory
88  }
89 
90 #else // ETL_GPU_POOL
91 
92 #ifdef ETL_GPU_POOL_SIZE
93  static constexpr size_t entries = ETL_GPU_POOL_SIZE;
94 #else
95 #ifdef ETL_GPU_POOL_LIMIT
96  static constexpr size_t entries = 256;
97 #else
98  static constexpr size_t entries = 64;
99 #endif
100 #endif
101 
102 #ifdef ETL_GPU_POOL_LIMIT
103  static constexpr size_t limit = ETL_GPU_POOL_LIMIT;
104 #else
105  static constexpr size_t limit = 1024 * 1024 * 1024;
106 #endif
107 
111  struct mini_pool_entry {
112  size_t size = 0;
113  void* memory = nullptr;
114  };
115 
125  struct mini_pool {
126  std::array<mini_pool_entry, entries> cache;
127  };
128 
133  static mini_pool& get_pool() {
134  static mini_pool pool;
135  return pool;
136  }
137 
142  static size_t& get_pool_size() {
143  static size_t pool_size = 0;
144  return pool_size;
145  }
146 
151  static std::mutex& get_lock() {
152  static std::mutex lock;
153  return lock;
154  }
155 
156 public:
162  template <typename T>
163  static T* allocate(size_t size) {
164  const auto real_size = size * sizeof(T);
165 
166  // Try to get memory from the pool
167 
168  {
169  std::lock_guard<std::mutex> l(get_lock());
170 
171  if (get_pool_size()) {
172  for (auto& slot : get_pool().cache) {
173  if (slot.memory && slot.size == real_size) {
174  auto memory = slot.memory;
175  slot.memory = nullptr;
176 
177  get_pool_size() -= size;
178 
179  return static_cast<T*>(memory);
180  }
181  }
182  }
183  }
184 
185  // If a memory block is not found, allocate new memory
186 
187  return base_allocate<T>(size);
188  }
189 
195  template <typename T>
196  static void release(const T* gpu_memory, size_t size) {
197  // Try to get an empty slot
198 
199  {
200  std::lock_guard<std::mutex> l(get_lock());
201 
202  if (get_pool_size() + size < limit) {
203  for (auto& slot : get_pool().cache) {
204  if (!slot.memory) {
205  slot.memory = const_cast<void*>(static_cast<const void*>(gpu_memory));
206  slot.size = size * sizeof(T);
207 
208  get_pool_size() += size;
209 
210  return;
211  }
212  }
213  }
214  }
215 
216  // If the cache is full, release the memory
217 
218  base_release(gpu_memory);
219  }
220 
227  static void clear() {
228  std::lock_guard<std::mutex> l(get_lock());
229 
230  // Release each used slots
231  // and clear them
232 
233  for (auto& slot : get_pool().cache) {
234  if (slot.memory) {
235  base_release(slot.memory);
236 
237  slot.memory = nullptr;
238  slot.size = 0;
239  }
240  }
241 
242  get_pool_size() = 0;
243  }
244 #endif
245 };
246 
/*!
 * \brief Handler for the GPU-side mirror of an expression's memory.
 *
 * Keeps a device buffer together with two flags recording which side
 * (CPU and/or GPU) currently holds up-to-date values. All members are
 * mutable and the whole API is const because GPU caching is logically
 * transparent to the owning expression.
 *
 * NOTE(review): no synchronization is visible in this class — presumably
 * a handler is only used from one thread at a time; confirm with callers.
 *
 * \tparam T The type of elements stored
 */
template <typename T>
struct gpu_memory_handler {
private:
    mutable T* gpu_memory_ = nullptr;    //!< The device pointer (nullptr <=> not allocated)
    mutable size_t gpu_memory_size = 0;  //!< The size of the device buffer, in elements

    mutable bool cpu_up_to_date = true;  //!< true when the CPU side holds current values
    mutable bool gpu_up_to_date = false; //!< true when the GPU side holds current values

public:
    //! Default construction: no GPU buffer, CPU considered up to date
    gpu_memory_handler() = default;

    /*!
     * \brief Copy-construct from rhs, duplicating its device buffer.
     *
     * If rhs has valid GPU data, a new buffer of the same size is
     * allocated and filled with a device-to-device copy; the CPU/GPU
     * status flags of rhs are preserved (checked by the assertions).
     */
    gpu_memory_handler(const gpu_memory_handler& rhs)
            : gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
        if (rhs.gpu_up_to_date) {
            gpu_allocate_impl(gpu_memory_size);

            gpu_copy_from(rhs.gpu_memory_, gpu_memory_size);

            // The CPU status can be erased by gpu_copy_from
            if (rhs.cpu_up_to_date) {
                validate_cpu();
            }
        } else {
            gpu_memory_ = nullptr;
        }

        cpp_assert(rhs.is_cpu_up_to_date() == this->is_cpu_up_to_date(), "gpu_memory_handler(&) must preserve CPU status");
        cpp_assert(rhs.is_gpu_up_to_date() == this->is_gpu_up_to_date(), "gpu_memory_handler(&) must preserve GPU status");
    }

    /*!
     * \brief Move-construct from rhs, stealing its device buffer.
     *
     * rhs is left without GPU memory so its destructor releases nothing.
     */
    gpu_memory_handler(gpu_memory_handler&& rhs) noexcept
            : gpu_memory_(rhs.gpu_memory_), gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
        rhs.gpu_memory_ = nullptr;
        rhs.gpu_memory_size = 0;
    }

    /*!
     * \brief Copy-assign from rhs: release our buffer, then duplicate
     * rhs's device buffer (if rhs has valid GPU data).
     * \return a reference to *this
     */
    gpu_memory_handler& operator=(const gpu_memory_handler& rhs) {
        if (this != &rhs) {
            // Release the previous memory, if any
            if (gpu_memory_) {
                gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
                gpu_memory_ = nullptr;
            }

            // Copy the size from rhs
            gpu_memory_size = rhs.gpu_memory_size;

            // Copy the contents of rhs
            if (rhs.gpu_up_to_date) {
                gpu_allocate_impl(gpu_memory_size);

                gpu_copy_from(rhs.gpu_memory_, gpu_memory_size);
            } else {
                gpu_memory_ = nullptr;
            }

            // Copy the status (at the end, otherwise gpu_copy_from will screw them)
            cpu_up_to_date = rhs.cpu_up_to_date;
            gpu_up_to_date = rhs.gpu_up_to_date;
        }

        return *this;
    }

    /*!
     * \brief Move-assign from rhs: release our buffer, then steal
     * rhs's buffer and status flags.
     * \return a reference to *this
     */
    gpu_memory_handler& operator=(gpu_memory_handler&& rhs) noexcept {
        if (this != &rhs) {
            // Release the previous memory, if any
            if (gpu_memory_) {
                gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
                gpu_memory_ = nullptr;
            }

            // Steal the values and contents from rhs
            gpu_memory_ = rhs.gpu_memory_;
            gpu_memory_size = rhs.gpu_memory_size;
            cpu_up_to_date = rhs.cpu_up_to_date;
            gpu_up_to_date = rhs.gpu_up_to_date;

            // Make sure rhs does not have point to the memory
            rhs.gpu_memory_ = nullptr;
            rhs.gpu_memory_size = 0;
        }

        return *this;
    }

    /*!
     * \brief Destructor: releases the device buffer, if any.
     */
    ~gpu_memory_handler() {
        if (gpu_memory_) {
            gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
        }
    }

    /*!
     * \brief Indicates if the CPU memory is up to date.
     */
    bool is_cpu_up_to_date() const noexcept {
        return cpu_up_to_date;
    }

    /*!
     * \brief Indicates if the GPU memory is up to date.
     */
    bool is_gpu_up_to_date() const noexcept {
        return gpu_up_to_date;
    }

    /*!
     * \brief Returns the device pointer, or nullptr if not allocated.
     */
    T* gpu_memory() const noexcept {
        return gpu_memory_;
    }

    /*!
     * \brief Evict the expression from the GPU: release the device
     * buffer and mark the GPU side as invalid.
     */
    void gpu_evict() const noexcept {
        if (gpu_memory_) {
            gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);

            gpu_memory_ = nullptr;
            gpu_memory_size = 0;
        }

        invalidate_gpu();
    }

    /*!
     * \brief Invalidate the CPU side. The GPU side must be up to date,
     * otherwise the values would be lost entirely.
     */
    void invalidate_cpu() const noexcept {
        cpu_up_to_date = false;

        cpp_assert(gpu_up_to_date, "Cannot invalidate the CPU if the GPU is not up to date");
    }

    /*!
     * \brief Invalidate the GPU side. The CPU side must be up to date,
     * otherwise the values would be lost entirely.
     */
    void invalidate_gpu() const noexcept {
        gpu_up_to_date = false;

        cpp_assert(cpu_up_to_date, "Cannot invalidate the GPU if the CPU is not up to date");
    }

    /*!
     * \brief Mark the CPU side as up to date.
     */
    void validate_cpu() const noexcept {
        cpu_up_to_date = true;
    }

    /*!
     * \brief Mark the GPU side as up to date.
     */
    void validate_gpu() const noexcept {
        gpu_up_to_date = true;
    }

    /*!
     * \brief Ensure a device buffer of the given size is allocated.
     * The buffer contents are left undefined.
     * \param etl_size The size, in elements, to allocate
     */
    void ensure_gpu_allocated(size_t etl_size) const {
        if (!is_gpu_allocated()) {
            gpu_allocate_impl(etl_size);
        }
    }

    /*!
     * \brief Ensure the GPU holds current values, allocating and
     * copying from cpu_memory if necessary.
     * \param cpu_memory The host source values
     * \param etl_size The size, in elements, of the expression
     */
    void ensure_gpu_up_to_date(const T* cpu_memory, size_t etl_size) const {
        // Make sure there is some memory allocate
        if (!is_gpu_allocated()) {
            gpu_allocate_impl(etl_size);
        }

        if (!gpu_up_to_date) {
            cpu_to_gpu(cpu_memory, etl_size);
        }
    }

    /*!
     * \brief Ensure the CPU holds current values, copying back from
     * the GPU if necessary.
     * \param cpu_memory The host destination (written to despite the
     * const qualifier, via const_cast in gpu_to_cpu)
     * \param etl_size The size, in elements, of the expression
     */
    void ensure_cpu_up_to_date(const T* cpu_memory, size_t etl_size) const {
        if (!cpu_up_to_date) {
            gpu_to_cpu(cpu_memory, etl_size);
        }
    }

    /*!
     * \brief Device-to-device copy from the given GPU buffer into ours.
     * After the copy, only the GPU side is considered up to date.
     * \param gpu_memory The device source buffer
     * \param etl_size The number of elements to copy (must be non-zero)
     */
    void gpu_copy_from(const T* gpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "GPU must be allocated before copy");
        cpp_assert(gpu_memory, "Cannot copy from invalid memory");
        cpp_assert(etl_size, "Cannot copy with a size of zero");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(gpu_memory_), const_cast<std::remove_const_t<T>*>(gpu_memory), etl_size * sizeof(T),
                              cudaMemcpyDeviceToDevice));

        gpu_up_to_date = true;
        cpu_up_to_date = false;
    }

private:
    /*!
     * \brief Allocate the device buffer. Must not already be allocated.
     * \param etl_size The size, in elements, to allocate
     */
    void gpu_allocate_impl(size_t etl_size) const {
        cpp_assert(!is_gpu_allocated(), "Trying to allocate already allocated GPU gpu_memory_");

        gpu_memory_ = gpu_memory_allocator::allocate<T>(etl_size);
        gpu_memory_size = etl_size;
    }

    /*!
     * \brief Host-to-device copy; marks the GPU side up to date.
     * Only called when the GPU is stale and the CPU is valid.
     */
    void cpu_to_gpu(const T* cpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "Cannot copy to unallocated GPU memory");
        cpp_assert(!gpu_up_to_date, "Copy must only be done if necessary");
        cpp_assert(cpu_up_to_date, "Copy from invalid memory!");
        cpp_assert(cpu_memory, "cpu_memory is nullptr in entry to cpu_to_gpu");
        cpp_assert(gpu_memory_, "gpu_memory_ is nullptr in entry to cpu_to_gpu");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(gpu_memory_), const_cast<std::remove_const_t<T>*>(cpu_memory), etl_size * sizeof(T),
                              cudaMemcpyHostToDevice));

        gpu_up_to_date = true;

        inc_counter("gpu:cpu_to_gpu");
    }

    /*!
     * \brief Device-to-host copy; marks the CPU side up to date.
     * Only called when the CPU is stale and the GPU is valid.
     */
    void gpu_to_cpu(const T* cpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "Cannot copy from unallocated GPU memory()");
        cpp_assert(gpu_up_to_date, "Cannot copy from invalid memory");
        cpp_assert(!cpu_up_to_date, "Copy done without reason");
        cpp_assert(cpu_memory, "cpu_memory is nullptr in entry to gpu_to_cpu");
        cpp_assert(gpu_memory_, "gpu_memory_ is nullptr in entry to gpu_to_cpu");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(cpu_memory), const_cast<std::remove_const_t<T>*>(gpu_memory_), etl_size * sizeof(T),
                              cudaMemcpyDeviceToHost));

        cpu_up_to_date = true;

        inc_counter("gpu:gpu_to_cpu");
    }

    /*!
     * \brief Indicates if a device buffer is currently allocated.
     */
    bool is_gpu_allocated() const noexcept {
        return gpu_memory_;
    }
};
546 
547 #else
/*!
 * \brief No-op implementation of the GPU memory handler, used when ETL
 * is compiled without CUDA support.
 *
 * Fix: the struct declaration line was missing — the member functions
 * appeared directly after the template header, which does not parse.
 *
 * The CPU is always reported up to date and the GPU never is, so
 * generic code compiles and runs unchanged without a GPU.
 *
 * \tparam T The type of elements stored
 */
template <typename T>
struct gpu_memory_handler {
    /*!
     * \brief Return GPU memory of this expression, if any.
     * \return always nullptr (no GPU support)
     */
    T* gpu_memory() const noexcept {
        return nullptr;
    }

    /*!
     * \brief Indicates if the CPU memory is up to date.
     * \return always true (the CPU is the only memory)
     */
    bool is_cpu_up_to_date() const noexcept {
        return true;
    }

    /*!
     * \brief Indicates if the GPU memory is up to date.
     * \return always false (no GPU support)
     */
    bool is_gpu_up_to_date() const noexcept {
        return false;
    }

    //! \brief Evict the expression from GPU (no-op).
    void gpu_evict() const noexcept {}

    //! \brief Invalidates the CPU memory (no-op).
    void invalidate_cpu() const noexcept {}

    //! \brief Invalidates the GPU memory (no-op).
    void invalidate_gpu() const noexcept {}

    //! \brief Validates the CPU memory (no-op).
    void validate_cpu() const noexcept {}

    //! \brief Validates the GPU memory (no-op).
    void validate_gpu() const noexcept {}

    //! \brief Ensures that the GPU memory is allocated (no-op).
    void ensure_gpu_allocated([[maybe_unused]] size_t etl_size) const {}

    //! \brief Copies the values into the GPU if necessary (no-op).
    void ensure_gpu_up_to_date([[maybe_unused]] const T* cpu_memory, [[maybe_unused]] size_t etl_size) const {}

    //! \brief Copies back from the GPU if necessary (no-op).
    void ensure_cpu_up_to_date([[maybe_unused]] const T* cpu_memory, [[maybe_unused]] size_t etl_size) const {}

    //! \brief Copy from GPU to GPU (no-op).
    void gpu_copy_from([[maybe_unused]] const T* gpu_memory, [[maybe_unused]] size_t etl_size) const {}
};
628 #endif
629 
630 } //end of namespace etl
631 
632 #ifdef ETL_CUDA
633 #include "etl/impl/cublas/cuda_memory.hpp"
634 #endif
void ensure_gpu_allocated() const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: sub_view.hpp:717
void ensure_gpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Allocate memory on the GPU for the expression and copy the values into the GPU.
Definition: gpu_handler.hpp:611
void exit()
Exit from ETL, releasing any possible resource.
Definition: exit.hpp:22
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: gpu_handler.hpp:577
T * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: gpu_handler.hpp:554
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: gpu_handler.hpp:570
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: sub_view.hpp:777
void gpu_copy_from([[maybe_unused]] const value_type *new_gpu_memory) const
Copy from GPU to GPU.
Definition: sub_view.hpp:761
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: gpu_handler.hpp:597
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: sub_view.hpp:681
Root namespace for the ETL library.
Definition: adapter.hpp:15
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: gpu_handler.hpp:587
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: sub_view.hpp:695
void ensure_cpu_up_to_date() const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: dyn_matrix_view.hpp:271
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: sub_view.hpp:688
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: gpu_handler.hpp:562
void ensure_gpu_allocated([[maybe_unused]] size_t etl_size) const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: gpu_handler.hpp:604
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: sub_view.hpp:785
void ensure_gpu_up_to_date() const
Copy back from the GPU to the expression memory if necessary.
Definition: dyn_matrix_view.hpp:280
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: sub_view.hpp:709
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: gpu_handler.hpp:592
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: gpu_handler.hpp:582
auto allocate(size_t size, mangling_faker< S >=mangling_faker< S >())
Allocate an array of the given size for the given type.
Definition: allocator.hpp:80
void ensure_cpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Copy back from the GPU to the expression memory if necessary.
Definition: gpu_handler.hpp:619
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: sub_view.hpp:702
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674
Definition: gpu_handler.hpp:549
void gpu_copy_from([[maybe_unused]] const T *gpu_memory, [[maybe_unused]] size_t etl_size) const
Copy from GPU to GPU.
Definition: gpu_handler.hpp:626