//! Expression Templates Library (ETL)
//! gpu_handler.hpp
1 //=======================================================================
2 // Copyright (c) 2014-2023 Baptiste Wicht
3 // Distributed under the terms of the MIT License.
4 // (See accompanying file LICENSE or copy at
5 // http://opensource.org/licenses/MIT)
6 //=======================================================================
7 
8 #pragma once
9 
10 #ifdef ETL_CUDA
11 #include "etl/impl/cublas/cuda.hpp"
12 #endif
13 
14 namespace etl {
15 
16 #ifdef ETL_CUDA
17 
23 struct gpu_memory_allocator {
24 private:
30  template <typename T>
31  static T* base_allocate(size_t size) {
32  T* memory = nullptr;
33 
34  auto cuda_status = cudaMalloc(&memory, size * sizeof(T));
35 
36  if (cuda_status != cudaSuccess) {
37  std::cout << "cuda: Failed to allocate GPU memory: " << cudaGetErrorString(cuda_status) << std::endl;
38  std::cout << " Tried to allocate " << size * sizeof(T) << "B" << std::endl;
39  exit(EXIT_FAILURE);
40  }
41 
42  inc_counter("gpu:allocate");
43 
44  return memory;
45  }
46 
52  static void base_release(const void* gpu_memory) {
53  //Note: the const_cast is only here to allow compilation
54  cuda_check(cudaFree((const_cast<void*>(gpu_memory))));
55 
56  inc_counter("gpu:release");
57  }
58 
59 #ifndef ETL_GPU_POOL
60 public:
66  template <typename T>
67  static T* allocate(size_t size) {
68  return base_allocate<T>(size);
69  }
70 
76  static void release(const void* gpu_memory, [[maybe_unused]] size_t size) {
77  base_release(gpu_memory);
78  }
79 
86  static void clear() {
87  // This allocator does not store memory
88  }
89 
90 #else // ETL_GPU_POOL
91 
92 #ifdef ETL_GPU_POOL_SIZE
93  static constexpr size_t entries = ETL_GPU_POOL_SIZE;
94 #else
95 #ifdef ETL_GPU_POOL_LIMIT
96  static constexpr size_t entries = 256;
97 #else
98  static constexpr size_t entries = 64;
99 #endif
100 #endif
101 
102 #ifdef ETL_GPU_POOL_LIMIT
103  static constexpr size_t limit = ETL_GPU_POOL_LIMIT;
104 #else
105  static constexpr size_t limit = 1024 * 1024 * 1024;
106 #endif
107 
111  struct mini_pool_entry {
112  size_t size = 0;
113  void* memory = nullptr;
114  };
115 
125  struct mini_pool {
126  std::array<mini_pool_entry, entries> cache;
127  };
128 
133  static mini_pool& get_pool() {
134  static mini_pool pool;
135  return pool;
136  }
137 
142  static size_t& get_pool_size() {
143  static size_t pool_size = 0;
144  return pool_size;
145  }
146 
151  static std::mutex& get_lock() {
152  static std::mutex lock;
153  return lock;
154  }
155 
156 public:
162  template <typename T>
163  static T* allocate(size_t size) {
164  const auto real_size = size * sizeof(T);
165 
166  // Try to get memory from the pool
167 
168  {
169  std::lock_guard<std::mutex> l(get_lock());
170 
171  if (get_pool_size()) {
172  for (auto& slot : get_pool().cache) {
173  if (slot.memory && slot.size == real_size) {
174  auto memory = slot.memory;
175  slot.memory = nullptr;
176 
177  get_pool_size() -= size;
178 
179  return static_cast<T*>(memory);
180  }
181  }
182  }
183  }
184 
185  // If a memory block is not found, allocate new memory
186 
187  return base_allocate<T>(size);
188  }
189 
195  template <typename T>
196  static void release(const T* gpu_memory, size_t size) {
197  // Try to get an empty slot
198 
199  {
200  std::lock_guard<std::mutex> l(get_lock());
201 
202  if (get_pool_size() + size < limit) {
203  for (auto& slot : get_pool().cache) {
204  if (!slot.memory) {
205  slot.memory = const_cast<void*>(static_cast<const void*>(gpu_memory));
206  slot.size = size * sizeof(T);
207 
208  get_pool_size() += size;
209 
210  return;
211  }
212  }
213  }
214  }
215 
216  // If the cache is full, release the memory
217 
218  base_release(gpu_memory);
219  }
220 
227  static void clear() {
228  std::lock_guard<std::mutex> l(get_lock());
229 
230  // Release each used slots
231  // and clear them
232 
233  for (auto& slot : get_pool().cache) {
234  if (slot.memory) {
235  base_release(slot.memory);
236 
237  slot.memory = nullptr;
238  slot.size = 0;
239  }
240  }
241 
242  get_pool_size() = 0;
243  }
244 #endif
245 };
246 
/*!
 * \brief Handler for the GPU-side mirror of an expression's memory.
 *
 * Keeps a device buffer together with two flags recording which side
 * (CPU and/or GPU) currently holds up-to-date values. All members are
 * mutable and the whole API is const because GPU caching is logically
 * transparent to the owning expression.
 *
 * NOTE(review): no synchronization is visible in this class — presumably
 * a handler is only used from one thread at a time; confirm with callers.
 *
 * \tparam T The type of elements stored
 */
template <typename T>
struct gpu_memory_handler {
private:
    mutable T* gpu_memory_ = nullptr;    //!< The device pointer (nullptr <=> not allocated)
    mutable size_t gpu_memory_size = 0;  //!< The size of the device buffer, in elements

    mutable bool cpu_up_to_date = true;  //!< true when the CPU side holds current values
    mutable bool gpu_up_to_date = false; //!< true when the GPU side holds current values

public:
    //! Default construction: no GPU buffer, CPU considered up to date
    gpu_memory_handler() = default;

    /*!
     * \brief Copy-construct from rhs, duplicating its device buffer.
     *
     * If rhs has valid GPU data, a new buffer of the same size is
     * allocated and filled with a device-to-device copy; the CPU/GPU
     * status flags of rhs are preserved (checked by the assertions).
     */
    gpu_memory_handler(const gpu_memory_handler& rhs)
            : gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
        if (rhs.gpu_up_to_date) {
            gpu_allocate_impl(gpu_memory_size);

            gpu_copy_from(rhs.gpu_memory_, gpu_memory_size);

            // The CPU status can be erased by gpu_copy_from
            if (rhs.cpu_up_to_date) {
                validate_cpu();
            }
        } else {
            gpu_memory_ = nullptr;
        }

        cpp_assert(rhs.is_cpu_up_to_date() == this->is_cpu_up_to_date(), "gpu_memory_handler(&) must preserve CPU status");
        cpp_assert(rhs.is_gpu_up_to_date() == this->is_gpu_up_to_date(), "gpu_memory_handler(&) must preserve GPU status");
    }

    /*!
     * \brief Move-construct from rhs, stealing its device buffer.
     *
     * rhs is left without GPU memory so its destructor releases nothing.
     */
    gpu_memory_handler(gpu_memory_handler&& rhs) noexcept
            : gpu_memory_(rhs.gpu_memory_), gpu_memory_size(rhs.gpu_memory_size), cpu_up_to_date(rhs.cpu_up_to_date), gpu_up_to_date(rhs.gpu_up_to_date) {
        rhs.gpu_memory_ = nullptr;
        rhs.gpu_memory_size = 0;
    }

    /*!
     * \brief Copy-assign from rhs: release our buffer, then duplicate
     * rhs's device buffer (if rhs has valid GPU data).
     * \return a reference to *this
     */
    gpu_memory_handler& operator=(const gpu_memory_handler& rhs) {
        if (this != &rhs) {
            // Release the previous memory, if any
            if (gpu_memory_) {
                gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
                gpu_memory_ = nullptr;
            }

            // Copy the size from rhs
            gpu_memory_size = rhs.gpu_memory_size;

            // Copy the contents of rhs
            if (rhs.gpu_up_to_date) {
                gpu_allocate_impl(gpu_memory_size);

                gpu_copy_from(rhs.gpu_memory_, gpu_memory_size);
            } else {
                gpu_memory_ = nullptr;
            }

            // Copy the status (at the end, otherwise gpu_copy_from will screw them)
            cpu_up_to_date = rhs.cpu_up_to_date;
            gpu_up_to_date = rhs.gpu_up_to_date;
        }

        return *this;
    }

    /*!
     * \brief Move-assign from rhs: release our buffer, then steal
     * rhs's buffer and status flags.
     * \return a reference to *this
     */
    gpu_memory_handler& operator=(gpu_memory_handler&& rhs) noexcept {
        if (this != &rhs) {
            // Release the previous memory, if any
            if (gpu_memory_) {
                gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
                gpu_memory_ = nullptr;
            }

            // Steal the values and contents from rhs
            gpu_memory_ = rhs.gpu_memory_;
            gpu_memory_size = rhs.gpu_memory_size;
            cpu_up_to_date = rhs.cpu_up_to_date;
            gpu_up_to_date = rhs.gpu_up_to_date;

            // Make sure rhs does not have point to the memory
            rhs.gpu_memory_ = nullptr;
            rhs.gpu_memory_size = 0;
        }

        return *this;
    }

    /*!
     * \brief Destructor: releases the device buffer, if any.
     */
    ~gpu_memory_handler() {
        if (gpu_memory_) {
            gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);
        }
    }

    /*!
     * \brief Indicates if the CPU memory is up to date.
     */
    bool is_cpu_up_to_date() const noexcept {
        return cpu_up_to_date;
    }

    /*!
     * \brief Indicates if the GPU memory is up to date.
     */
    bool is_gpu_up_to_date() const noexcept {
        return gpu_up_to_date;
    }

    /*!
     * \brief Returns the device pointer, or nullptr if not allocated.
     */
    T* gpu_memory() const noexcept {
        return gpu_memory_;
    }

    /*!
     * \brief Evict the expression from the GPU: release the device
     * buffer and mark the GPU side as invalid.
     */
    void gpu_evict() const noexcept {
        if (gpu_memory_) {
            gpu_memory_allocator::release(gpu_memory_, gpu_memory_size);

            gpu_memory_ = nullptr;
            gpu_memory_size = 0;
        }

        invalidate_gpu();
    }

    /*!
     * \brief Invalidate the CPU side. The GPU side must be up to date,
     * otherwise the values would be lost entirely.
     */
    void invalidate_cpu() const noexcept {
        cpu_up_to_date = false;

        cpp_assert(gpu_up_to_date, "Cannot invalidate the CPU if the GPU is not up to date");
    }

    /*!
     * \brief Invalidate the GPU side. The CPU side must be up to date,
     * otherwise the values would be lost entirely.
     */
    void invalidate_gpu() const noexcept {
        gpu_up_to_date = false;

        cpp_assert(cpu_up_to_date, "Cannot invalidate the GPU if the CPU is not up to date");
    }

    /*!
     * \brief Mark the CPU side as up to date.
     */
    void validate_cpu() const noexcept {
        cpu_up_to_date = true;
    }

    /*!
     * \brief Mark the GPU side as up to date.
     */
    void validate_gpu() const noexcept {
        gpu_up_to_date = true;
    }

    /*!
     * \brief Ensure a device buffer of the given size is allocated.
     * The buffer contents are left undefined.
     * \param etl_size The size, in elements, to allocate
     */
    void ensure_gpu_allocated(size_t etl_size) const {
        if (!is_gpu_allocated()) {
            gpu_allocate_impl(etl_size);
        }
    }

    /*!
     * \brief Ensure the GPU holds current values, allocating and
     * copying from cpu_memory if necessary.
     * \param cpu_memory The host source values
     * \param etl_size The size, in elements, of the expression
     */
    void ensure_gpu_up_to_date(const T* cpu_memory, size_t etl_size) const {
        // Make sure there is some memory allocate
        if (!is_gpu_allocated()) {
            gpu_allocate_impl(etl_size);
        }

        if (!gpu_up_to_date) {
            cpu_to_gpu(cpu_memory, etl_size);
        }
    }

    /*!
     * \brief Ensure the CPU holds current values, copying back from
     * the GPU if necessary.
     * \param cpu_memory The host destination (written to despite the
     * const qualifier, via const_cast in gpu_to_cpu)
     * \param etl_size The size, in elements, of the expression
     */
    void ensure_cpu_up_to_date(const T* cpu_memory, size_t etl_size) const {
        if (!cpu_up_to_date) {
            gpu_to_cpu(cpu_memory, etl_size);
        }
    }

    /*!
     * \brief Device-to-device copy from the given GPU buffer into ours.
     * After the copy, only the GPU side is considered up to date.
     * \param gpu_memory The device source buffer
     * \param etl_size The number of elements to copy (must be non-zero)
     */
    void gpu_copy_from(const T* gpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "GPU must be allocated before copy");
        cpp_assert(gpu_memory, "Cannot copy from invalid memory");
        cpp_assert(etl_size, "Cannot copy with a size of zero");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(gpu_memory_), const_cast<std::remove_const_t<T>*>(gpu_memory), etl_size * sizeof(T),
                              cudaMemcpyDeviceToDevice));

        gpu_up_to_date = true;
        cpu_up_to_date = false;
    }

private:
    /*!
     * \brief Allocate the device buffer. Must not already be allocated.
     * \param etl_size The size, in elements, to allocate
     */
    void gpu_allocate_impl(size_t etl_size) const {
        cpp_assert(!is_gpu_allocated(), "Trying to allocate already allocated GPU gpu_memory_");

        gpu_memory_ = gpu_memory_allocator::allocate<T>(etl_size);
        gpu_memory_size = etl_size;
    }

    /*!
     * \brief Host-to-device copy; marks the GPU side up to date.
     * Only called when the GPU is stale and the CPU is valid.
     */
    void cpu_to_gpu(const T* cpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "Cannot copy to unallocated GPU memory");
        cpp_assert(!gpu_up_to_date, "Copy must only be done if necessary");
        cpp_assert(cpu_up_to_date, "Copy from invalid memory!");
        cpp_assert(cpu_memory, "cpu_memory is nullptr in entry to cpu_to_gpu");
        cpp_assert(gpu_memory_, "gpu_memory_ is nullptr in entry to cpu_to_gpu");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(gpu_memory_), const_cast<std::remove_const_t<T>*>(cpu_memory), etl_size * sizeof(T),
                              cudaMemcpyHostToDevice));

        gpu_up_to_date = true;

        inc_counter("gpu:cpu_to_gpu");
    }

    /*!
     * \brief Device-to-host copy; marks the CPU side up to date.
     * Only called when the CPU is stale and the GPU is valid.
     */
    void gpu_to_cpu(const T* cpu_memory, size_t etl_size) const {
        cpp_assert(is_gpu_allocated(), "Cannot copy from unallocated GPU memory()");
        cpp_assert(gpu_up_to_date, "Cannot copy from invalid memory");
        cpp_assert(!cpu_up_to_date, "Copy done without reason");
        cpp_assert(cpu_memory, "cpu_memory is nullptr in entry to gpu_to_cpu");
        cpp_assert(gpu_memory_, "gpu_memory_ is nullptr in entry to gpu_to_cpu");

        cuda_check(cudaMemcpy(const_cast<std::remove_const_t<T>*>(cpu_memory), const_cast<std::remove_const_t<T>*>(gpu_memory_), etl_size * sizeof(T),
                              cudaMemcpyDeviceToHost));

        cpu_up_to_date = true;

        inc_counter("gpu:gpu_to_cpu");
    }

    /*!
     * \brief Indicates if a device buffer is currently allocated.
     */
    bool is_gpu_allocated() const noexcept {
        return gpu_memory_;
    }
};
546 
547 #else
/*!
 * \brief No-op implementation of the GPU memory handler, used when ETL
 * is compiled without CUDA support.
 *
 * Fix: the struct declaration line was missing — the member functions
 * appeared directly after the template header, which does not parse.
 *
 * The CPU is always reported up to date and the GPU never is, so
 * generic code compiles and runs unchanged without a GPU.
 *
 * \tparam T The type of elements stored
 */
template <typename T>
struct gpu_memory_handler {
    /*!
     * \brief Return GPU memory of this expression, if any.
     * \return always nullptr (no GPU support)
     */
    T* gpu_memory() const noexcept {
        return nullptr;
    }

    /*!
     * \brief Indicates if the CPU memory is up to date.
     * \return always true (the CPU is the only memory)
     */
    bool is_cpu_up_to_date() const noexcept {
        return true;
    }

    /*!
     * \brief Indicates if the GPU memory is up to date.
     * \return always false (no GPU support)
     */
    bool is_gpu_up_to_date() const noexcept {
        return false;
    }

    //! \brief Evict the expression from GPU (no-op).
    void gpu_evict() const noexcept {}

    //! \brief Invalidates the CPU memory (no-op).
    void invalidate_cpu() const noexcept {}

    //! \brief Invalidates the GPU memory (no-op).
    void invalidate_gpu() const noexcept {}

    //! \brief Validates the CPU memory (no-op).
    void validate_cpu() const noexcept {}

    //! \brief Validates the GPU memory (no-op).
    void validate_gpu() const noexcept {}

    //! \brief Ensures that the GPU memory is allocated (no-op).
    void ensure_gpu_allocated([[maybe_unused]] size_t etl_size) const {}

    //! \brief Copies the values into the GPU if necessary (no-op).
    void ensure_gpu_up_to_date([[maybe_unused]] const T* cpu_memory, [[maybe_unused]] size_t etl_size) const {}

    //! \brief Copies back from the GPU if necessary (no-op).
    void ensure_cpu_up_to_date([[maybe_unused]] const T* cpu_memory, [[maybe_unused]] size_t etl_size) const {}

    //! \brief Copy from GPU to GPU (no-op).
    void gpu_copy_from([[maybe_unused]] const T* gpu_memory, [[maybe_unused]] size_t etl_size) const {}
};
628 #endif
629 
630 } //end of namespace etl
631 
632 #ifdef ETL_CUDA
633 #include "etl/impl/cublas/cuda_memory.hpp"
634 #endif
void ensure_gpu_allocated() const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: sub_view.hpp:717
void ensure_gpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Allocate memory on the GPU for the expression and copy the values into the GPU.
Definition: gpu_handler.hpp:611
void exit()
Exit from ETL, releasing any possible resource.
Definition: exit.hpp:22
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: gpu_handler.hpp:577
T * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: gpu_handler.hpp:554
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: gpu_handler.hpp:570
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: sub_view.hpp:777
void gpu_copy_from([[maybe_unused]] const value_type *new_gpu_memory) const
Copy from GPU to GPU.
Definition: sub_view.hpp:761
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: gpu_handler.hpp:597
void gpu_evict() const noexcept
Evict the expression from GPU.
Definition: sub_view.hpp:681
Root namespace for the ETL library.
Definition: adapter.hpp:15
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: gpu_handler.hpp:587
void invalidate_gpu() const noexcept
Invalidates the GPU memory.
Definition: sub_view.hpp:695
void ensure_cpu_up_to_date() const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: dyn_matrix_view.hpp:271
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: sub_view.hpp:688
bool is_cpu_up_to_date() const noexcept
Indicates if the CPU memory is up to date.
Definition: gpu_handler.hpp:562
void ensure_gpu_allocated([[maybe_unused]] size_t etl_size) const
Ensures that the GPU memory is allocated and that the GPU memory is up to date (to undefined value)...
Definition: gpu_handler.hpp:604
bool is_gpu_up_to_date() const noexcept
Indicates if the GPU memory is up to date.
Definition: sub_view.hpp:785
void ensure_gpu_up_to_date() const
Copy back from the GPU to the expression memory if necessary.
Definition: dyn_matrix_view.hpp:280
void validate_gpu() const noexcept
Validates the GPU memory.
Definition: sub_view.hpp:709
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: gpu_handler.hpp:592
void invalidate_cpu() const noexcept
Invalidates the CPU memory.
Definition: gpu_handler.hpp:582
auto allocate(size_t size, mangling_faker< S >=mangling_faker< S >())
Allocate an array of the given size for the given type.
Definition: allocator.hpp:80
void ensure_cpu_up_to_date([[maybe_unused]] const T *cpu_memory, [[maybe_unused]] size_t etl_size) const
Copy back from the GPU to the expression memory if necessary.
Definition: gpu_handler.hpp:619
void inc_counter([[maybe_unused]] const char *name)
Increase the given counter.
Definition: counters.hpp:25
void validate_cpu() const noexcept
Validates the CPU memory.
Definition: sub_view.hpp:702
value_type * gpu_memory() const noexcept
Return GPU memory of this expression, if any.
Definition: sub_view.hpp:674
Definition: gpu_handler.hpp:549
void gpu_copy_from([[maybe_unused]] const T *gpu_memory, [[maybe_unused]] size_t etl_size) const
Copy from GPU to GPU.
Definition: gpu_handler.hpp:626