FZGPUModules/dag_8h_source.html

#pragma once


#include "pipeline/perf.h"


#include <cuda_runtime.h>

#include <memory>

#include <string>

#include <unordered_map>

#include <unordered_set>

#include <vector>


namespace fz {


// Forward declarations

class Stage;

class MemoryPool;


// Selects how the DAG manages the lifetime of its intermediate buffers.
enum class MemoryStrategy {
    // Allocate on-demand, free at last consumer. Lowest peak memory.
    MINIMAL = 0,
    // Allocate everything upfront at finalize(). Required for graph mode.
    PREALLOCATE = 1
};


// Per-buffer bookkeeping record. A default-constructed record describes an
// unallocated, zero-sized buffer with no producer (-1) and no consumers.
struct BufferInfo {
    // Size bookkeeping. NOTE(review): the exact distinction between the
    // three size fields is defined by the implementation file — confirm there.
    size_t size           = 0;
    size_t initial_size   = 0;
    size_t allocated_size = 0;

    void*       d_ptr = nullptr;  // buffer address; null until one is assigned
    std::string tag;              // label for the buffer; empty by default

    // Producer / consumer tracking.
    int              remaining_consumers = 0;
    std::vector<int> consumer_stage_ids;
    int              producer_stage_id     = -1;  // -1: no producer recorded
    int              producer_output_index = 0;

    bool is_allocated  = false;
    bool is_persistent = false;  // if true, survives reset() until DAG destruction
    bool is_external   = false;  // if true, pointer is caller-owned — DAG never allocs or frees

    // Kept user-provided (not "= default") so the type's aggregate status is
    // the same as with the hand-written initializer list.
    BufferInfo() {}
};


// One vertex of the stage graph: a Stage pointer plus the buffer ids it reads
// and writes, its graph edges, scheduling slots and CUDA handles.
// A freshly constructed node has id/level/execution_order of -1 and null
// stream/event handles.
struct DAGNode {
    int         id = -1;
    Stage*      stage;   // set from the constructor argument (may be null)
    std::string name;

    // Buffer wiring.
    std::vector<int>             input_buffer_ids;
    std::vector<int>             output_buffer_ids;
    std::unordered_map<int, int> output_index_to_buffer_id;

    // Graph edges in both directions.
    std::vector<DAGNode*> dependencies;
    std::vector<DAGNode*> dependents;

    // Scheduling slots; -1 until assigned.
    int          level           = -1;
    int          execution_order = -1;
    cudaStream_t stream          = nullptr;

    bool         is_executed      = false;
    cudaEvent_t  completion_event = nullptr;
    cudaEvent_t  start_event      = nullptr;  // non-null only when profiling is enabled

    // Pre-sized vectors for execute() — allocated at finalize(), reused every call
    // to avoid per-call heap allocations of the input/output/sizes arrays.
    std::vector<void*>   exec_inputs;
    std::vector<void*>   exec_outputs;
    std::vector<size_t>  exec_sizes;

    // Intentionally left non-explicit so existing implicit uses keep compiling.
    DAGNode(Stage* s = nullptr) : stage(s) {}
};


// Graph of compression stages (DAGNode) together with the buffers that connect
// them. Stages and their buffer dependencies are registered first, finalize()
// is called, and the graph is then run with execute().
class CompressionDAG {
public:
    CompressionDAG(MemoryPool* mem_pool, MemoryStrategy strategy = MemoryStrategy::MINIMAL);
    ~CompressionDAG();

    // Non-copyable. The object keeps raw DAGNode* entries, CUDA streams it may
    // own (owns_streams_) and color-region pointers that own their allocations;
    // a member-wise copy would leave two objects holding the same handles.
    CompressionDAG(const CompressionDAG&) = delete;
    CompressionDAG& operator=(const CompressionDAG&) = delete;

    // ── Construction ──────────────────────────────────────────────────────────

    // Registers a stage and returns the node created for it.
    DAGNode* addStage(Stage* stage, std::string name = "");

    // Records that `dependent` depends on output `output_index` of `dependency`.
    // NOTE(review): the returned int is presumably the id of the connecting
    // buffer — confirm against the implementation.
    int addDependency(DAGNode* dependent, DAGNode* dependency,
                      size_t buffer_size = 0, int output_index = 0);

    void setInputBuffer(DAGNode* node, size_t size, const std::string& tag = "input");
    void setOutputBuffer(DAGNode* node, size_t size, const std::string& tag = "output");

    int addUnconnectedOutput(DAGNode* node, size_t size, int output_index, const std::string& tag);

    bool connectExistingOutput(DAGNode* producer, DAGNode* consumer, int output_index);

    void updateBufferTag(int buffer_id, const std::string& tag);
    void setBufferPersistent(int buffer_id, bool persistent);

    void finalize();

    void configureStreams(int num_streams);

    // ── Execution ─────────────────────────────────────────────────────────────

    void execute(cudaStream_t stream);

    void preallocateBuffers(cudaStream_t stream = 0);

    void reset(cudaStream_t stream = 0);

    // ── Buffer access ─────────────────────────────────────────────────────────

    void* getBuffer(int buffer_id) const;

    void setExternalPointer(int buffer_id, void* external_ptr);

    void updateBufferSize(int buffer_id, size_t new_size);

    // ── Query & debug ─────────────────────────────────────────────────────────

    size_t getTotalBufferSize() const;

    size_t computeTopoPoolSize() const;

    // Plain accessors for the tracked memory counters.
    size_t getPeakMemoryUsage()    const { return peak_memory_usage_; }
    size_t getCurrentMemoryUsage() const { return current_memory_usage_; }
    size_t getBufferSize(int buffer_id) const;
    const BufferInfo& getBufferInfo(int buffer_id) const;
    const std::vector<std::vector<DAGNode*>>& getLevels() const { return levels_; }
    const std::vector<DAGNode*>& getNodes() const { return nodes_; }

    int getMaxParallelism() const;

    // Number of entries currently held in streams_.
    size_t getStreamCount() const { return streams_.size(); }

    void printDAG() const;
    void printBufferLifetimes() const;

    void enableBoundsCheck(bool enable) { bounds_check_enabled_ = enable; }
    bool isBoundsCheckEnabled() const   { return bounds_check_enabled_; }

    // Stores the inverse of `enable` in coloring_disabled_.
    void setColoringEnabled(bool enable) { coloring_disabled_ = !enable; }
    // NOTE(review): this reports coloring_applied_, not the flag written by
    // setColoringEnabled(). Presumably coloring_applied_ is only set once
    // coloring has actually run — confirm the asymmetry is intended.
    bool isColoringEnabled() const       { return coloring_applied_; }
    size_t getColorRegionCount() const   { return color_region_sizes_.size(); }

    void setCaptureMode(bool capture);
    bool isCaptureMode() const { return capture_mode_; }

    // ── Profiling ─────────────────────────────────────────────────────────────

    void enableProfiling(bool enable);
    bool isProfilingEnabled() const { return profiling_enabled_; }

    std::vector<StageTimingResult> collectTimings();

private:
    MemoryPool*     mem_pool_;
    MemoryStrategy  strategy_;

    std::vector<DAGNode*>           nodes_;
    std::unordered_map<int, BufferInfo> buffers_;

    int  next_buffer_id_;
    bool is_finalized_;

    std::vector<cudaStream_t> streams_;
    bool owns_streams_;

    std::vector<std::vector<DAGNode*>> levels_;
    int max_level_;

    size_t current_memory_usage_;
    size_t peak_memory_usage_;

    bool profiling_enabled_;
    bool bounds_check_enabled_;
    bool capture_mode_;

    // Buffer coloring (PREALLOCATE only). Non-overlapping buffers share a color
    // and are aliased into one pool region. color_region_ptrs_ owns the allocations.
    bool coloring_disabled_;
    bool coloring_applied_;
    std::unordered_map<int, int> buffer_color_;
    std::vector<size_t>          color_region_sizes_;
    std::vector<void*>           color_region_ptrs_;

    void assignLevels();
    void assignStreams();
    void allocateBuffer(int buffer_id, cudaStream_t stream);
    void freeBuffer(int buffer_id, cudaStream_t stream);
    void planPreallocation();
    void colorBuffers();
};


} // namespace fz

fz::CompressionDAG
Definition dag.h:92

fz::CompressionDAG::preallocateBuffers
void preallocateBuffers(cudaStream_t stream=0)

fz::CompressionDAG::addStage
DAGNode * addStage(Stage *stage, std::string name="")

fz::CompressionDAG::setCaptureMode
void setCaptureMode(bool capture)

fz::CompressionDAG::addDependency
int addDependency(DAGNode *dependent, DAGNode *dependency, size_t buffer_size=0, int output_index=0)

fz::CompressionDAG::setColoringEnabled
void setColoringEnabled(bool enable)
Definition dag.h:206

fz::CompressionDAG::computeTopoPoolSize
size_t computeTopoPoolSize() const

fz::CompressionDAG::getTotalBufferSize
size_t getTotalBufferSize() const

fz::CompressionDAG::addUnconnectedOutput
int addUnconnectedOutput(DAGNode *node, size_t size, int output_index, const std::string &tag)

fz::CompressionDAG::setExternalPointer
void setExternalPointer(int buffer_id, void *external_ptr)

fz::CompressionDAG::getStreamCount
size_t getStreamCount() const
Definition dag.h:188

fz::CompressionDAG::enableBoundsCheck
void enableBoundsCheck(bool enable)
Definition dag.h:198

fz::CompressionDAG::enableProfiling
void enableProfiling(bool enable)

fz::CompressionDAG::finalize
void finalize()

fz::CompressionDAG::collectTimings
std::vector< StageTimingResult > collectTimings()

fz::CompressionDAG::configureStreams
void configureStreams(int num_streams)

fz::CompressionDAG::getMaxParallelism
int getMaxParallelism() const

fz::CompressionDAG::connectExistingOutput
bool connectExistingOutput(DAGNode *producer, DAGNode *consumer, int output_index)

fz::CompressionDAG::reset
void reset(cudaStream_t stream=0)

fz::MemoryPool
Definition mempool.h:82

fz::Stage
Definition stage.h:30

fz
Definition fzm_format.h:25

fz::MemoryStrategy
MemoryStrategy
Definition dag.h:23

fz::MemoryStrategy::MINIMAL
@ MINIMAL
Allocate on-demand, free at last consumer. Lowest peak memory.

fz::MemoryStrategy::PREALLOCATE
@ PREALLOCATE
Allocate everything upfront at finalize(). Required for graph mode.

perf.h
Pipeline and per-stage profiling result types.

fz::BufferInfo
Definition dag.h:29

fz::BufferInfo::is_external
bool is_external
If true, pointer is caller-owned — DAG never allocs or frees.
Definition dag.h:43

fz::BufferInfo::is_persistent
bool is_persistent
If true, survives reset() until DAG destruction.
Definition dag.h:42

fz::DAGNode
Definition dag.h:52

fz::DAGNode::start_event
cudaEvent_t start_event
Non-null only when profiling is enabled.
Definition dag.h:70