FZGPUModules/mempool_8h_source.html

#pragma once


#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>


namespace fz {


// Bookkeeping record describing a single allocation tracked by the pool.
struct AllocationInfo {
    void*       ptr;             // Device pointer.
    size_t      size;            // Size in bytes.
    std::string tag;             // Debug label (e.g. "lorenzo_output").
    bool        in_use = true;   // True while allocated; every new record starts live.

    // Builds a record for `p` spanning `s` bytes and labelled `t`.
    // With no arguments the record is empty (null pointer, zero size, empty
    // tag) but is still flagged as in use.
    AllocationInfo(void* p = nullptr, size_t s = 0, const std::string& t = "")
        : ptr(p),
          size(s),
          tag(t) {}
};


// Record for one persistent allocation (device or pinned host memory).
// Plain aggregate: no constructor, so members are uninitialized unless the
// creator sets them.
struct PersistentAllocInfo {

    // Address of the allocation.
    // NOTE(review): presumably a device pointer when is_pinned is false and a
    // pinned host pointer when it is true — confirm against the implementation.
    void*       ptr;

    // Presumably the allocation size in bytes — confirm against the implementation.
    size_t      bytes;

    // Free-form label; presumably a debug tag like AllocationInfo::tag — confirm.
    std::string tag;

    // true = cudaMallocHost; false = cudaMalloc
    bool        is_pinned;

};


// Construction-time settings for a memory pool.
struct MemoryPoolConfig {
    size_t input_data_size;       // Input byte count used to size the pool.
    float  pool_size_multiplier;  // Pool capacity = input_data_size x multiplier.
    int    device_id;             // CUDA device index.
    bool   enable_reuse;          // Enable opportunistic buffer reuse.

    bool force_fallback;          // Skip pool creation and use cudaMalloc; for vGPU or testing.

    // All arguments are optional; the defaults describe a pool on device 0
    // with reuse enabled, a 3x multiplier, and no forced fallback.
    MemoryPoolConfig(
        size_t input_size    = 0,
        float  multiplier    = 3.0f,
        int    device        = 0,
        bool   reuse         = true,
        bool   force_fallback = false)
        : input_data_size(input_size),
          pool_size_multiplier(multiplier),
          device_id(device),
          enable_reuse(reuse),
          force_fallback(force_fallback) {}

    // Returns the pool capacity in bytes.
    //
    // - input_data_size == 0: a fixed 1 GiB default.
    // - otherwise: input_data_size * pool_size_multiplier, truncated toward
    //   zero. A non-positive or NaN product yields 0, and a product too large
    //   for size_t saturates at the largest size_t value.
    size_t getPoolSize() const {
        if (input_data_size == 0) return 1024ULL * 1024 * 1024;

        // Multiply in double: a float has only 24 mantissa bits, so any byte
        // count above 16 MiB would be rounded before it is even scaled.
        const double scaled = static_cast<double>(input_data_size) *
                              static_cast<double>(pool_size_multiplier);

        // Converting a negative, NaN, or out-of-range floating value to an
        // unsigned integer is undefined behaviour, so clamp explicitly.
        // The negated comparison also catches NaN.
        if (!(scaled > 0.0)) return 0;

        const size_t max_size = static_cast<size_t>(-1);
        if (scaled >= static_cast<double>(max_size)) return max_size;

        return static_cast<size_t>(scaled);
    }
};


class MemoryPool {

public:

    explicit MemoryPool(const MemoryPoolConfig& config = MemoryPoolConfig());

    ~MemoryPool();


    MemoryPool(const MemoryPool&)            = delete;

    MemoryPool& operator=(const MemoryPool&) = delete;


    // ── Stream-ordered allocation (I/O buffers) ───────────────────────────────


    void* allocate(size_t size, cudaStream_t stream,

                   const std::string& tag = "", bool persistent = false);


    void free(void* ptr, cudaStream_t stream);


    // ── Persistent allocation (stage-internal scratch) ────────────────────────


    void* allocatePersistentDevice(size_t bytes, const std::string& tag = "");


    void* allocatePersistentPinned(size_t bytes, const std::string& tag = "");


    void freePersistentDevice(void* ptr);


    void freePersistentPinned(void* ptr);


    // ── Lifecycle ─────────────────────────────────────────────────────────────


    void reset(cudaStream_t stream);


    void trim();


    void setReleaseThreshold(size_t bytes);


    void synchronize(cudaStream_t stream);


    // ── Stats & debug ─────────────────────────────────────────────────────────


    size_t getPersistentDeviceBytes() const { return persistent_device_bytes_; }


    size_t getPersistentPinnedBytes() const { return persistent_pinned_bytes_; }


    size_t getCurrentUsage() const {

        if (!mem_pool_) return current_allocated_bytes_;

        uint64_t used = 0;

        cudaMemPoolGetAttribute(mem_pool_, cudaMemPoolAttrUsedMemCurrent, &used);

        return static_cast<size_t>(used);

    }


    size_t getPeakUsage() const {

        if (!mem_pool_) return current_allocated_bytes_;

        uint64_t high = 0;

        cudaMemPoolGetAttribute(mem_pool_, cudaMemPoolAttrUsedMemHigh, &high);

        return static_cast<size_t>(high);

    }


    size_t getAllocationCount() const { return allocations_.size() + graph_allocations_.size(); }


    void printStats() const;


    size_t getConfiguredSize() const { return config_.getPoolSize(); }


    cudaMemPool_t getMemPool() const { return mem_pool_; }


    bool isFallbackMode() const { return mem_pool_ == nullptr; }


    int getDeviceId() const { return config_.device_id; }


private:

    MemoryPoolConfig config_;

    cudaMemPool_t    mem_pool_;


    std::unordered_map<void*, AllocationInfo> allocations_;

    std::unordered_map<void*, AllocationInfo> graph_allocations_;


    // Persistent allocations (device + pinned host) — freed individually or in destructor.

    std::vector<PersistentAllocInfo> persistent_allocs_;

    size_t persistent_device_bytes_ = 0;

    size_t persistent_pinned_bytes_ = 0;


    size_t total_allocations_;

    size_t total_frees_;

    // Host-side running total of live bytes — used for overflow detection without

    // querying a CUDA attribute on every hot-path allocation.

    size_t current_allocated_bytes_;

    // Set the first time current_allocated_bytes_ exceeds configured pool size

    // so the overflow warning fires only once per reset() cycle.

    bool   overflow_warned_;

    bool   initialized_;


    void initializeMemPool();

};


} // namespace fz

fz::MemoryPool
Definition mempool.h:82

fz::MemoryPool::reset
void reset(cudaStream_t stream)

fz::MemoryPool::getPersistentPinnedBytes
size_t getPersistentPinnedBytes() const
Definition mempool.h:168

fz::MemoryPool::trim
void trim()

fz::MemoryPool::getMemPool
cudaMemPool_t getMemPool() const
Definition mempool.h:198

fz::MemoryPool::freePersistentPinned
void freePersistentPinned(void *ptr)

fz::MemoryPool::isFallbackMode
bool isFallbackMode() const
Definition mempool.h:201

fz::MemoryPool::free
void free(void *ptr, cudaStream_t stream)

fz::MemoryPool::synchronize
void synchronize(cudaStream_t stream)

fz::MemoryPool::getPeakUsage
size_t getPeakUsage() const
Definition mempool.h:179

fz::MemoryPool::allocatePersistentPinned
void * allocatePersistentPinned(size_t bytes, const std::string &tag="")

fz::MemoryPool::getCurrentUsage
size_t getCurrentUsage() const
Definition mempool.h:171

fz::MemoryPool::getAllocationCount
size_t getAllocationCount() const
Definition mempool.h:187

fz::MemoryPool::setReleaseThreshold
void setReleaseThreshold(size_t bytes)

fz::MemoryPool::allocatePersistentDevice
void * allocatePersistentDevice(size_t bytes, const std::string &tag="")

fz::MemoryPool::getPersistentDeviceBytes
size_t getPersistentDeviceBytes() const
Definition mempool.h:165

fz::MemoryPool::allocate
void * allocate(size_t size, cudaStream_t stream, const std::string &tag="", bool persistent=false)

fz::MemoryPool::getConfiguredSize
size_t getConfiguredSize() const
Definition mempool.h:195

fz::MemoryPool::freePersistentDevice
void freePersistentDevice(void *ptr)

fz
Definition fzm_format.h:25

fz::AllocationInfo
Definition mempool.h:19

fz::AllocationInfo::size
size_t size
Size in bytes.
Definition mempool.h:21

fz::AllocationInfo::tag
std::string tag
Debug label (e.g. "lorenzo_output").
Definition mempool.h:22

fz::AllocationInfo::ptr
void * ptr
Device pointer.
Definition mempool.h:20

fz::AllocationInfo::in_use
bool in_use
True while allocated.
Definition mempool.h:23

fz::MemoryPoolConfig
Definition mempool.h:46

fz::MemoryPoolConfig::device_id
int device_id
CUDA device index.
Definition mempool.h:49

fz::MemoryPoolConfig::enable_reuse
bool enable_reuse
Enable opportunistic buffer reuse.
Definition mempool.h:50

fz::MemoryPoolConfig::getPoolSize
size_t getPoolSize() const
Definition mempool.h:67

fz::MemoryPoolConfig::input_data_size
size_t input_data_size
Input byte count used to size the pool.
Definition mempool.h:47

fz::MemoryPoolConfig::pool_size_multiplier
float pool_size_multiplier
Pool capacity = input_data_size × multiplier.
Definition mempool.h:48

fz::MemoryPoolConfig::force_fallback
bool force_fallback
Skip pool creation and use cudaMalloc; for vGPU or testing.
Definition mempool.h:52

fz::PersistentAllocInfo
Definition mempool.h:38

fz::PersistentAllocInfo::is_pinned
bool is_pinned
true = cudaMallocHost; false = cudaMalloc
Definition mempool.h:42