FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
fz::MemoryPool Class Reference

#include <mempool.h>

Public Member Functions

void * allocate (size_t size, cudaStream_t stream, const std::string &tag="", bool persistent=false)
 
void free (void *ptr, cudaStream_t stream)
 
void * allocatePersistentDevice (size_t bytes, const std::string &tag="")
 
void * allocatePersistentPinned (size_t bytes, const std::string &tag="")
 
void freePersistentDevice (void *ptr)
 
void freePersistentPinned (void *ptr)
 
void reset (cudaStream_t stream)
 
void trim ()
 
void setReleaseThreshold (size_t bytes)
 
void synchronize (cudaStream_t stream)
 
size_t getPersistentDeviceBytes () const
 
size_t getPersistentPinnedBytes () const
 
size_t getCurrentUsage () const
 
size_t getPeakUsage () const
 
size_t getAllocationCount () const
 
size_t getConfiguredSize () const
 
cudaMemPool_t getMemPool () const
 
bool isFallbackMode () const
 

Detailed Description

Stream-ordered CUDA memory pool.

Uses cudaMallocAsync/cudaFreeAsync over a cudaMemPool_t for efficient reuse and CUDA Graph compatibility. All allocations are tracked for overflow warnings and debug printing.

Note
Non-copyable. Not thread-safe.

Member Function Documentation

◆ allocate()

void * fz::MemoryPool::allocate ( size_t  size,
cudaStream_t  stream,
const std::string &  tag = "",
bool  persistent = false 
)

Allocate size bytes from the pool on stream.

Parameters
size: Bytes to allocate.
stream: CUDA stream ordering the allocation.
tag: Debug label stored in AllocationInfo.
persistent: If true, allocation survives reset() (graph replay); if false, reset() will free it.
Returns
Device pointer, or nullptr on failure.

◆ free()

void fz::MemoryPool::free ( void *  ptr,
cudaStream_t  stream 
)

Free ptr back to the pool, ordered on stream.

◆ allocatePersistentDevice()

void * fz::MemoryPool::allocatePersistentDevice ( size_t  bytes,
const std::string &  tag = "" 
)

Allocate `bytes` bytes of persistent device memory via cudaMalloc.

Use for stage-internal buffers that live for the stage's lifetime: codebooks, histograms, partition metadata. Not stream-ordered; not subject to MINIMAL/PREALLOCATE policy; safe across CUDA Graph captures (stable device address).

Freed explicitly via freePersistentDevice() or in bulk by the pool destructor. Tracked for getPersistentDeviceBytes() reporting.

◆ allocatePersistentPinned()

void * fz::MemoryPool::allocatePersistentPinned ( size_t  bytes,
const std::string &  tag = "" 
)

Allocate `bytes` bytes of persistent pinned host memory via cudaMallocHost.

Use for host-side stage buffers that participate in async D2H/H2D transfers (codebook tables, partition metadata arrays). Pinned memory enables DMA without staging and is required for reliable async transfers.

Freed explicitly via freePersistentPinned() or in bulk by the pool destructor. Tracked for getPersistentPinnedBytes() reporting.

◆ freePersistentDevice()

void fz::MemoryPool::freePersistentDevice ( void *  ptr)

Return a persistent-device allocation previously obtained from allocatePersistentDevice() to the pool.

◆ freePersistentPinned()

void fz::MemoryPool::freePersistentPinned ( void *  ptr)

Return a persistent-pinned allocation previously obtained from allocatePersistentPinned() to the pool.

◆ reset()

void fz::MemoryPool::reset ( cudaStream_t  stream)

Free all non-persistent allocations. Call between compression runs.

◆ trim()

void fz::MemoryPool::trim ( )

Release pool memory back to the OS if usage exceeds the release threshold.

◆ setReleaseThreshold()

void fz::MemoryPool::setReleaseThreshold ( size_t  bytes)

Update the CUDA pool's release threshold and keep config in sync.

Called by Pipeline::finalize() after topology-aware sizing to replace the blunt input_size × multiplier estimate with a tighter bound.

Parameters
bytes: New threshold in bytes.

◆ synchronize()

void fz::MemoryPool::synchronize ( cudaStream_t  stream)

Block until all stream-ordered operations on stream complete.

◆ getPersistentDeviceBytes()

size_t fz::MemoryPool::getPersistentDeviceBytes ( ) const
inline

Bytes currently held in persistent device allocations.

◆ getPersistentPinnedBytes()

size_t fz::MemoryPool::getPersistentPinnedBytes ( ) const
inline

Bytes currently held in persistent pinned-host allocations.

◆ getCurrentUsage()

size_t fz::MemoryPool::getCurrentUsage ( ) const
inline

Current live bytes (queries cudaMemPoolAttrUsedMemCurrent).

◆ getPeakUsage()

size_t fz::MemoryPool::getPeakUsage ( ) const
inline

Peak live bytes since last reset (queries cudaMemPoolAttrUsedMemHigh).

◆ getAllocationCount()

size_t fz::MemoryPool::getAllocationCount ( ) const
inline

Total number of currently live allocations (stream + graph).

◆ getConfiguredSize()

size_t fz::MemoryPool::getConfiguredSize ( ) const
inline

Soft-capacity hint passed at construction (used only for overflow warnings; the CUDA pool itself is not hard-capped).

◆ getMemPool()

cudaMemPool_t fz::MemoryPool::getMemPool ( ) const
inline

Raw cudaMemPool_t handle for advanced usage.

◆ isFallbackMode()

bool fz::MemoryPool::isFallbackMode ( ) const
inline

Returns true if operating in cudaMalloc fallback mode (pool creation failed or was forced).