FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
mempool.h
Go to the documentation of this file.
1#pragma once
2
8#include <cuda_runtime.h>
9
10#include <cstddef>
11#include <memory>
12#include <string>
13#include <unordered_map>
14#include <vector>
15
16namespace fz {
17
20 void* ptr;
21 size_t size;
22 std::string tag;
23 bool in_use;
24
25 AllocationInfo(void* p = nullptr, size_t s = 0, const std::string& t = "")
26 : ptr(p), size(s), tag(t), in_use(true) {}
27};
28
39 void* ptr;
40 size_t bytes;
41 std::string tag;
42 bool is_pinned;
43};
44
51
53
55 size_t input_size = 0,
56 float multiplier = 3.0f,
57 int device = 0,
58 bool reuse = true,
59 bool force_fallback = false)
60 : input_data_size(input_size),
61 pool_size_multiplier(multiplier),
62 device_id(device),
63 enable_reuse(reuse),
65
67 size_t getPoolSize() const {
68 if (input_data_size == 0) return 1024ULL * 1024 * 1024;
69 return static_cast<size_t>(input_data_size * pool_size_multiplier);
70 }
71};
72
83public:
84 explicit MemoryPool(const MemoryPoolConfig& config = MemoryPoolConfig());
86
87 MemoryPool(const MemoryPool&) = delete;
88 MemoryPool& operator=(const MemoryPool&) = delete;
89
90 // ── Stream-ordered allocation (I/O buffers) ───────────────────────────────
91
102 void* allocate(size_t size, cudaStream_t stream,
103 const std::string& tag = "", bool persistent = false);
104
106 void free(void* ptr, cudaStream_t stream);
107
108 // ── Persistent allocation (stage-internal scratch) ────────────────────────
109
121 void* allocatePersistentDevice(size_t bytes, const std::string& tag = "");
122
133 void* allocatePersistentPinned(size_t bytes, const std::string& tag = "");
134
136 void freePersistentDevice(void* ptr);
137
139 void freePersistentPinned(void* ptr);
140
141 // ── Lifecycle ─────────────────────────────────────────────────────────────
142
144 void reset(cudaStream_t stream);
145
147 void trim();
148
157 void setReleaseThreshold(size_t bytes);
158
160 void synchronize(cudaStream_t stream);
161
162 // ── Stats & debug ─────────────────────────────────────────────────────────
163
165 size_t getPersistentDeviceBytes() const { return persistent_device_bytes_; }
166
168 size_t getPersistentPinnedBytes() const { return persistent_pinned_bytes_; }
169
171 size_t getCurrentUsage() const {
172 if (!mem_pool_) return current_allocated_bytes_;
173 uint64_t used = 0;
174 cudaMemPoolGetAttribute(mem_pool_, cudaMemPoolAttrUsedMemCurrent, &used);
175 return static_cast<size_t>(used);
176 }
177
179 size_t getPeakUsage() const {
180 if (!mem_pool_) return current_allocated_bytes_;
181 uint64_t high = 0;
182 cudaMemPoolGetAttribute(mem_pool_, cudaMemPoolAttrUsedMemHigh, &high);
183 return static_cast<size_t>(high);
184 }
185
187 size_t getAllocationCount() const { return allocations_.size() + graph_allocations_.size(); }
188
189 void printStats() const;
190
195 size_t getConfiguredSize() const { return config_.getPoolSize(); }
196
198 cudaMemPool_t getMemPool() const { return mem_pool_; }
199
201 bool isFallbackMode() const { return mem_pool_ == nullptr; }
202
203 int getDeviceId() const { return config_.device_id; }
204
205private:
206 MemoryPoolConfig config_;
207 cudaMemPool_t mem_pool_;
208
209 std::unordered_map<void*, AllocationInfo> allocations_;
210 std::unordered_map<void*, AllocationInfo> graph_allocations_;
211
212 // Persistent allocations (device + pinned host) — freed individually or in destructor.
213 std::vector<PersistentAllocInfo> persistent_allocs_;
214 size_t persistent_device_bytes_ = 0;
215 size_t persistent_pinned_bytes_ = 0;
216
217 size_t total_allocations_;
218 size_t total_frees_;
219 // Host-side running total of live bytes — used for overflow detection without
220 // querying a CUDA attribute on every hot-path allocation.
221 size_t current_allocated_bytes_;
222 // Set the first time current_allocated_bytes_ exceeds configured pool size
223 // so the overflow warning fires only once per reset() cycle.
224 bool overflow_warned_;
225 bool initialized_;
226
227 void initializeMemPool();
228};
229
230} // namespace fz
Definition mempool.h:82
void reset(cudaStream_t stream)
size_t getPersistentPinnedBytes() const
Definition mempool.h:168
cudaMemPool_t getMemPool() const
Definition mempool.h:198
void freePersistentPinned(void *ptr)
bool isFallbackMode() const
Definition mempool.h:201
void free(void *ptr, cudaStream_t stream)
void synchronize(cudaStream_t stream)
size_t getPeakUsage() const
Definition mempool.h:179
void * allocatePersistentPinned(size_t bytes, const std::string &tag="")
size_t getCurrentUsage() const
Definition mempool.h:171
size_t getAllocationCount() const
Definition mempool.h:187
void setReleaseThreshold(size_t bytes)
void * allocatePersistentDevice(size_t bytes, const std::string &tag="")
size_t getPersistentDeviceBytes() const
Definition mempool.h:165
void * allocate(size_t size, cudaStream_t stream, const std::string &tag="", bool persistent=false)
size_t getConfiguredSize() const
Definition mempool.h:195
void freePersistentDevice(void *ptr)
Definition fzm_format.h:25
Definition mempool.h:19
size_t size
Size in bytes.
Definition mempool.h:21
std::string tag
Debug label (e.g. "lorenzo_output").
Definition mempool.h:22
void * ptr
Device pointer.
Definition mempool.h:20
bool in_use
True while allocated.
Definition mempool.h:23
Definition mempool.h:46
int device_id
CUDA device index.
Definition mempool.h:49
bool enable_reuse
Enable opportunistic buffer reuse.
Definition mempool.h:50
size_t getPoolSize() const
Definition mempool.h:67
size_t input_data_size
Input byte count used to size the pool.
Definition mempool.h:47
float pool_size_multiplier
Pool capacity = input_data_size × multiplier.
Definition mempool.h:48
bool force_fallback
Skip pool creation and use cudaMalloc; for vGPU or testing.
Definition mempool.h:52
Definition mempool.h:38
bool is_pinned
true = cudaMallocHost; false = cudaMalloc
Definition mempool.h:42