11 #include "etl/impl/cublas/cuda.hpp" 23 struct gpu_memory_allocator {
31 static T* base_allocate(
size_t size) {
34 auto cuda_status = cudaMalloc(&memory, size *
sizeof(T));
36 if (cuda_status != cudaSuccess) {
37 std::cout <<
"cuda: Failed to allocate GPU memory: " << cudaGetErrorString(cuda_status) << std::endl;
38 std::cout <<
" Tried to allocate " << size *
sizeof(T) <<
"B" << std::endl;
52 static void base_release(
const void*
gpu_memory) {
54 cuda_check(cudaFree((const_cast<void*>(gpu_memory))));
68 return base_allocate<T>(size);
76 static void release(
const void* gpu_memory, [[maybe_unused]]
size_t size) {
77 base_release(gpu_memory);
// Pool configuration.
// NOTE(review): the #else/#endif lines were lost in extraction; the nesting
// below follows the visible branch values — confirm against upstream.
#ifdef ETL_GPU_POOL_SIZE
    static constexpr size_t entries = ETL_GPU_POOL_SIZE; ///< Number of slots in the GPU memory pool
#else
#ifdef ETL_GPU_POOL_LIMIT
    static constexpr size_t entries = 256; ///< Number of slots in the GPU memory pool
#else
    static constexpr size_t entries = 64; ///< Number of slots in the GPU memory pool
#endif
#endif

#ifdef ETL_GPU_POOL_LIMIT
    static constexpr size_t limit = ETL_GPU_POOL_LIMIT; ///< Maximum amount kept in the pool
#else
    static constexpr size_t limit = 1024 * 1024 * 1024; ///< Maximum amount kept in the pool (default 1 GiB)
#endif
111 struct mini_pool_entry {
113 void* memory =
nullptr;
126 std::array<mini_pool_entry, entries> cache;
133 static mini_pool& get_pool() {
134 static mini_pool pool;
142 static size_t& get_pool_size() {
143 static size_t pool_size = 0;
151 static std::mutex& get_lock() {
152 static std::mutex lock;
162 template <
typename T>
164 const auto real_size = size *
sizeof(T);
169 std::lock_guard<std::mutex> l(get_lock());
171 if (get_pool_size()) {
172 for (
auto& slot : get_pool().cache) {
173 if (slot.memory && slot.size == real_size) {
174 auto memory = slot.memory;
175 slot.memory =
nullptr;
177 get_pool_size() -= size;
179 return static_cast<T*
>(memory);
187 return base_allocate<T>(size);
195 template <
typename T>
196 static void release(
const T* gpu_memory,
size_t size) {
200 std::lock_guard<std::mutex> l(get_lock());
202 if (get_pool_size() + size < limit) {
203 for (
auto& slot : get_pool().cache) {
205 slot.memory =
const_cast<void*
>(
static_cast<const void*
>(
gpu_memory));
206 slot.size = size *
sizeof(T);
208 get_pool_size() += size;
218 base_release(gpu_memory);
227 static void clear() {
228 std::lock_guard<std::mutex> l(get_lock());
233 for (
auto& slot : get_pool().cache) {
235 base_release(slot.memory);
237 slot.memory =
nullptr;
253 template <
typename T>
254 struct gpu_memory_handler {
256 mutable T* gpu_memory_ =
nullptr;
257 mutable size_t gpu_memory_size = 0;
259 mutable bool cpu_up_to_date =
true;
260 mutable bool gpu_up_to_date =
false;
263 gpu_memory_handler() =
default;
// Copy constructor: copies the size and both status flags, and when the
// source's GPU copy is current, allocates fresh device memory for this
// handler. NOTE(review): the extraction lost original lines 273-276 and
// 278-280, so the device-to-device copy after the allocation and the
// handling of the CPU-up-to-date case are not visible here — consult the
// upstream file before editing.
269 gpu_memory_handler(
const gpu_memory_handler& rhs)
270 : gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
271 if (rhs.gpu_up_to_date) {
272 gpu_allocate_impl(gpu_memory_size);
277 if (rhs.cpu_up_to_date) {
281 gpu_memory_ =
nullptr;
// Post-conditions: copying must not change either up-to-date status.
284 cpp_assert(rhs.is_cpu_up_to_date() == this->
is_cpu_up_to_date(),
"gpu_memory_handler(&) must preserve CPU status");
285 cpp_assert(rhs.is_gpu_up_to_date() == this->
is_gpu_up_to_date(),
"gpu_memory_handler(&) must preserve GPU status");
291 gpu_memory_handler(gpu_memory_handler&& rhs) noexcept
292 : gpu_memory_(rhs.gpu_memory_), gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
293 rhs.gpu_memory_ =
nullptr;
294 rhs.gpu_memory_size = 0;
// Copy assignment: releases this handler's current device memory, then
// copies the size and status flags from rhs, allocating fresh device memory
// when rhs' GPU copy is current. NOTE(review): the extraction lost original
// lines 303-305, 308-310, 316-318 and 325+ — the self-assignment guard, the
// device-to-device copy after the allocation, and the `return *this;` are
// not visible here; consult the upstream file before editing.
302 gpu_memory_handler& operator=(
const gpu_memory_handler& rhs) {
306 gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
307 gpu_memory_ =
nullptr;
311 gpu_memory_size = rhs.gpu_memory_size;
314 if (rhs.gpu_up_to_date) {
315 gpu_allocate_impl(gpu_memory_size);
319 gpu_memory_ =
nullptr;
323 cpu_up_to_date = rhs.cpu_up_to_date;
324 gpu_up_to_date = rhs.gpu_up_to_date;
333 gpu_memory_handler& operator=(gpu_memory_handler&& rhs) noexcept {
337 gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
338 gpu_memory_ =
nullptr;
342 gpu_memory_ = rhs.gpu_memory_;
343 gpu_memory_size = rhs.gpu_memory_size;
344 cpu_up_to_date = rhs.cpu_up_to_date;
345 gpu_up_to_date = rhs.gpu_up_to_date;
348 rhs.gpu_memory_ =
nullptr;
349 rhs.gpu_memory_size = 0;
359 ~gpu_memory_handler() {
361 gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
370 return cpu_up_to_date;
378 return gpu_up_to_date;
394 gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
396 gpu_memory_ =
nullptr;
407 cpu_up_to_date =
false;
409 cpp_assert(gpu_up_to_date,
"Cannot invalidate the CPU if the GPU is not up to date");
416 gpu_up_to_date =
false;
418 cpp_assert(cpu_up_to_date,
"Cannot invalidate the GPU if the CPU is not up to date");
425 cpu_up_to_date =
true;
432 gpu_up_to_date =
true;
441 if (!is_gpu_allocated()) {
442 gpu_allocate_impl(etl_size);
453 if (!is_gpu_allocated()) {
454 gpu_allocate_impl(etl_size);
457 if (!gpu_up_to_date) {
458 cpu_to_gpu(cpu_memory, etl_size);
469 if (!cpu_up_to_date) {
470 gpu_to_cpu(cpu_memory, etl_size);
479 void gpu_copy_from(
const T* gpu_memory,
size_t etl_size)
const {
480 cpp_assert(is_gpu_allocated(),
"GPU must be allocated before copy");
481 cpp_assert(gpu_memory,
"Cannot copy from invalid memory");
482 cpp_assert(etl_size,
"Cannot copy with a size of zero");
484 cuda_check(cudaMemcpy(
const_cast<std::remove_const_t<T>*
>(gpu_memory_),
const_cast<std::remove_const_t<T>*
>(gpu_memory), etl_size *
sizeof(T),
485 cudaMemcpyDeviceToDevice));
487 gpu_up_to_date =
true;
488 cpu_up_to_date =
false;
495 void gpu_allocate_impl(
size_t etl_size)
const {
496 cpp_assert(!is_gpu_allocated(),
"Trying to allocate already allocated GPU gpu_memory_");
498 gpu_memory_ = gpu_memory_allocator::allocate<T>(etl_size);
499 gpu_memory_size = etl_size;
505 void cpu_to_gpu(
const T* cpu_memory,
size_t etl_size)
const {
506 cpp_assert(is_gpu_allocated(),
"Cannot copy to unallocated GPU memory");
507 cpp_assert(!gpu_up_to_date,
"Copy must only be done if necessary");
508 cpp_assert(cpu_up_to_date,
"Copy from invalid memory!");
509 cpp_assert(cpu_memory,
"cpu_memory is nullptr in entry to cpu_to_gpu");
510 cpp_assert(gpu_memory_,
"gpu_memory_ is nullptr in entry to cpu_to_gpu");
512 cuda_check(cudaMemcpy(
const_cast<std::remove_const_t<T>*
>(gpu_memory_),
const_cast<std::remove_const_t<T>*
>(cpu_memory), etl_size *
sizeof(T),
513 cudaMemcpyHostToDevice));
515 gpu_up_to_date =
true;
523 void gpu_to_cpu(
const T* cpu_memory,
size_t etl_size)
const {
524 cpp_assert(is_gpu_allocated(),
"Cannot copy from unallocated GPU memory()");
525 cpp_assert(gpu_up_to_date,
"Cannot copy from invalid memory");
526 cpp_assert(!cpu_up_to_date,
"Copy done without reason");
527 cpp_assert(cpu_memory,
"cpu_memory is nullptr in entry to gpu_to_cpu");
528 cpp_assert(gpu_memory_,
"gpu_memory_ is nullptr in entry to gpu_to_cpu");
530 cuda_check(cudaMemcpy(
const_cast<std::remove_const_t<T>*
>(cpu_memory),
const_cast<std::remove_const_t<T>*
>(gpu_memory_), etl_size *
sizeof(T),
531 cudaMemcpyDeviceToHost));
533 cpu_up_to_date =
true;
542 bool is_gpu_allocated() const noexcept {
// CPU-only fallback gpu_memory_handler (original lines 549-625 were lost in
// extraction; only this member is visible). When CUDA is disabled, the GPU
// operations are no-ops — this gpu_copy_from discards both arguments.
548 template <
typename T>
626 void gpu_copy_from([[maybe_unused]]
const T* gpu_memory, [[maybe_unused]]
size_t etl_size)
const {}
633 #include "etl/impl/cublas/cuda_memory.hpp"
void ensure_gpu_allocated() const
Ensures that the GPU memory is allocated; its contents are left undefined.
Definition: sub_view.hpp:717
void ensure_gpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Allocate memory on the GPU for the expression and copy the values into the GPU.
Definition: gpu_handler.hpp:611
void exit()
Exit from ETL, releasing any possible resource.
Definition: exit.hpp:22
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: gpu_handler.hpp:577
T * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: gpu_handler.hpp:554
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: gpu_handler.hpp:570
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: sub_view.hpp:777
void gpu_copy_from([[maybe_unused]] const value_type *new_gpu_memory) const
Copy from GPU to GPU.
Definition: sub_view.hpp:761
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: gpu_handler.hpp:597
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: sub_view.hpp:681
Root namespace for the ETL library.
Definition: adapter.hpp:15
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: gpu_handler.hpp:587
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: sub_view.hpp:695
void ensure_cpu_up_to_date() const
Ensures that the CPU memory is up to date, copying back from the GPU if necessary.
Definition: dyn_matrix_view.hpp:271
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: sub_view.hpp:688
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: gpu_handler.hpp:562
void ensure_gpu_allocated([[maybe_unused]] size_t etl_size) const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: gpu_handler.hpp:604
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: sub_view.hpp:785
void ensure_gpu_up_to_date() const
Ensures that the GPU memory is up to date, copying from the CPU memory to the GPU if necessary.
Definition: dyn_matrix_view.hpp:280
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: sub_view.hpp:709
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: gpu_handler.hpp:592
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: gpu_handler.hpp:582
auto allocate(size_t size, mangling_faker< S >=mangling_faker< S >())
Allocate an array of the given size for the given type.
Definition: allocator.hpp:80
void ensure_cpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Copy back from the GPU to the expression memory if necessary.
Definition: gpu_handler.hpp:619
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: sub_view.hpp:702
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674
Definition: gpu_handler.hpp:549
void gpu_copy_from([[maybe_unused]] const T *gpu_memory, [[maybe_unused]] size_t etl_size) const
Copy from GPU to GPU.
Definition: gpu_handler.hpp:626