5#include <cuda_runtime.h>
8#include <unordered_map>
9#include <unordered_set>
// Buffer bookkeeping fields; the enclosing struct header is not visible in this excerpt.
// Starts at 0 (see the constructor initializer list that follows).
// NOTE(review): presumably the capacity actually reserved, as opposed to `size` — confirm.
28 size_t allocated_size;
// Starts at 0. Presumably counts consumers that have not yet finished with the buffer — confirm.
32 int remaining_consumers;
// Integer ids, by name the stages that consume this buffer — TODO confirm against the implementation.
33 std::vector<int> consumer_stage_ids;
// Starts at -1; presumably -1 means "no producer recorded" — confirm.
34 int producer_stage_id;
// Starts at 0; presumably the producer's output slot that feeds this buffer — confirm.
35 int producer_output_index;
42 : size(0), initial_size(0), allocated_size(0), d_ptr(nullptr), tag(
""),
43 remaining_consumers(0), producer_stage_id(-1), producer_output_index(0),
// Per-node wiring and execution state; the enclosing struct header is not visible in this excerpt.
// Integer buffer ids this node reads from / writes to (by name; how they are filled is not shown here).
53 std::vector<int> input_buffer_ids;
54 std::vector<int> output_buffer_ids;
// Lookup from an output index to a buffer id.
55 std::unordered_map<int, int> output_index_to_buffer_id;
// Raw pointers to neighbouring nodes; ownership is not established by anything visible here.
// NOTE(review): presumably `dependencies` are upstream nodes and `dependents` downstream — confirm.
57 std::vector<DAGNode*> dependencies;
58 std::vector<DAGNode*> dependents;
// CUDA event handle; the constructor initializer list sets it to nullptr.
65 cudaEvent_t completion_event;
// Raw pointer and size lists. Presumably arguments assembled for the stage's execution call — TODO confirm.
70 std::vector<void*> exec_inputs;
71 std::vector<void*> exec_outputs;
72 std::vector<size_t> exec_sizes;
75 : id(-1), stage(s), level(-1), execution_order(-1),
76 stream(nullptr), is_executed(false), completion_event(nullptr),
start_event(nullptr) {}
107 size_t buffer_size = 0,
int output_index = 0);
// Declaration only; the body is defined elsewhere.
// Takes a node, a size and an optional tag that defaults to "input".
// NOTE(review): `size` is presumably a byte count and `tag` a debug label — confirm.
109 void setInputBuffer(
DAGNode* node,
size_t size,
const std::string& tag =
"input");
// Output-side counterpart with the same parameter list; the tag defaults to "output".
110 void setOutputBuffer(
DAGNode* node,
size_t size,
const std::string& tag =
"output");
// Declaration only. Takes a buffer id and a new tag string; presumably replaces the stored tag — confirm.
124 void updateBufferTag(
int buffer_id,
const std::string& tag);
// Declaration only. Takes a buffer id and a flag.
// NOTE(review): presumably sets the buffer's `is_persistent` field — confirm.
125 void setBufferPersistent(
int buffer_id,
bool persistent);
// Declaration only. The caller must supply a stream; there is no default.
135 void execute(cudaStream_t stream);
// Declaration only. `stream` defaults to 0, i.e. the default CUDA stream.
// What state is cleared is not visible from this excerpt.
144 void reset(cudaStream_t stream = 0);
// Declaration only. Const lookup by buffer id that yields an untyped pointer.
// NOTE(review): presumably a device pointer; behaviour for unknown ids is not visible here — confirm.
148 void* getBuffer(
int buffer_id)
const;
// Declaration only. Takes a buffer id and a new size; whether this reallocates is not visible here.
156 void updateBufferSize(
int buffer_id,
size_t new_size);
// Returns the current value of peak_memory_usage_ (units presumably bytes — confirm).
173 size_t getPeakMemoryUsage()
const {
return peak_memory_usage_; }
// Returns the current value of current_memory_usage_ (units presumably bytes — confirm).
174 size_t getCurrentMemoryUsage()
const {
return current_memory_usage_; }
// Declaration only. Const query that maps a buffer id to a size value.
175 size_t getBufferSize(
int buffer_id)
const;
// Declaration only. Returns a const reference to the BufferInfo record for `buffer_id`.
// NOTE(review): behaviour for an unknown id (throw vs. undefined) is not visible here — confirm.
176 const BufferInfo& getBufferInfo(
int buffer_id)
const;
// Returns a const reference to levels_ (a vector of DAGNode* vectors); nothing is copied.
// NOTE(review): presumably one inner vector per DAG level — confirm.
177 const std::vector<std::vector<DAGNode*>>& getLevels()
const {
return levels_; }
// Returns a const reference to nodes_, the list of DAGNode pointers; nothing is copied.
178 const std::vector<DAGNode*>& getNodes()
const {
return nodes_; }
// Declaration only. Const and void-returning; output destination and format are not visible here.
186 void printDAG()
const;
// Declaration only. Const and void-returning; output destination and format are not visible here.
187 void printBufferLifetimes()
const;
// Returns the bounds_check_enabled_ flag.
195 bool isBoundsCheckEnabled()
const {
return bounds_check_enabled_; }
// Returns the coloring_applied_ flag.
// NOTE(review): the name says "enabled" but the value is the "applied" flag, and a separate
// coloring_disabled_ member exists — confirm callers expect "applied" semantics.
203 bool isColoringEnabled()
const {
return coloring_applied_; }
// Returns the number of entries in color_region_sizes_.
204 size_t getColorRegionCount()
const {
return color_region_sizes_.size(); }
// Returns the capture_mode_ flag (the member's declaration is not visible in this excerpt).
214 bool isCaptureMode()
const {
return capture_mode_; }
// Returns the profiling_enabled_ flag.
223 bool isProfilingEnabled()
const {
return profiling_enabled_; }
// Node pointers; exposed read-only through getNodes(). Ownership is not established by anything visible here.
232 std::vector<DAGNode*> nodes_;
// BufferInfo records keyed by an int (presumably the buffer id used throughout the API — confirm).
233 std::unordered_map<int, BufferInfo> buffers_;
// CUDA stream handles; who creates and destroys them is not visible in this excerpt.
238 std::vector<cudaStream_t> streams_;
// Nodes grouped into inner vectors; exposed read-only through getLevels().
241 std::vector<std::vector<DAGNode*>> levels_;
// Memory counters reported by getCurrentMemoryUsage() / getPeakMemoryUsage().
244 size_t current_memory_usage_;
245 size_t peak_memory_usage_;
// Feature flags reported by isProfilingEnabled() / isBoundsCheckEnabled().
247 bool profiling_enabled_;
248 bool bounds_check_enabled_;
// Two separate coloring flags; isColoringEnabled() reports coloring_applied_ only.
253 bool coloring_disabled_;
254 bool coloring_applied_;
// int -> int map, by name buffer id -> color index — TODO confirm.
255 std::unordered_map<int, int> buffer_color_;
// Per-region sizes; getColorRegionCount() returns this vector's length.
256 std::vector<size_t> color_region_sizes_;
// Per-region raw pointers, presumably parallel to color_region_sizes_ — confirm.
257 std::vector<void*> color_region_ptrs_;
// Internal helper declarations; the bodies are defined elsewhere.
// Presumably distributes nodes over streams_ — confirm.
260 void assignStreams();
// Allocate / free a single buffer identified by id, using the given stream.
// NOTE(review): whether these use stream-ordered allocation is not visible here — confirm.
261 void allocateBuffer(
int buffer_id, cudaStream_t stream);
262 void freeBuffer(
int buffer_id, cudaStream_t stream);
// Presumably computes the layout used by the preallocation path — confirm.
263 void planPreallocation();
void preallocateBuffers(cudaStream_t stream=0)
void setCaptureMode(bool capture)
int addDependency(DAGNode *dependent, DAGNode *dependency, size_t buffer_size=0, int output_index=0)
void setColoringEnabled(bool enable)
Definition dag.h:202
size_t computeTopoPoolSize() const
size_t getTotalBufferSize() const
int addUnconnectedOutput(DAGNode *node, size_t size, int output_index, const std::string &tag)
void setExternalPointer(int buffer_id, void *external_ptr)
size_t getStreamCount() const
Definition dag.h:184
void enableBoundsCheck(bool enable)
Definition dag.h:194
void enableProfiling(bool enable)
std::vector< StageTimingResult > collectTimings()
void configureStreams(int num_streams)
int getMaxParallelism() const
DAGNode * addStage(Stage *stage, const std::string &name="")
bool connectExistingOutput(DAGNode *producer, DAGNode *consumer, int output_index)
void reset(cudaStream_t stream=0)
Definition fzm_format.h:25
MemoryStrategy
Definition dag.h:19
@ MINIMAL
Allocate each buffer on demand and free it at its last consumer. Lowest peak memory.
@ PREALLOCATE
Allocate everything upfront at finalize(). Required for graph mode.
Pipeline and per-stage profiling result types.
bool is_external
If true, the pointer is caller-owned — the DAG never allocates or frees it.
Definition dag.h:39
bool is_persistent
If true, the buffer survives reset() and is kept until DAG destruction.
Definition dag.h:38
cudaEvent_t start_event
Non-null only when profiling is enabled.
Definition dag.h:66