FZGPUModules 1.0
GPU-accelerated modular compression pipeline
Loading...
Searching...
No Matches
fz::CompressionDAG Class Reference

#include <dag.h>

Public Member Functions

DAGNode * addStage (Stage *stage, const std::string &name="")
 
int addDependency (DAGNode *dependent, DAGNode *dependency, size_t buffer_size=0, int output_index=0)
 
int addUnconnectedOutput (DAGNode *node, size_t size, int output_index, const std::string &tag)
 
bool connectExistingOutput (DAGNode *producer, DAGNode *consumer, int output_index)
 
void finalize ()
 
void configureStreams (int num_streams)
 
void preallocateBuffers (cudaStream_t stream=0)
 
void reset (cudaStream_t stream=0)
 
void setExternalPointer (int buffer_id, void *external_ptr)
 
size_t getTotalBufferSize () const
 
size_t computeTopoPoolSize () const
 
int getMaxParallelism () const
 
size_t getStreamCount () const
 
void enableBoundsCheck (bool enable)
 
void setColoringEnabled (bool enable)
 
void setCaptureMode (bool capture)
 
void enableProfiling (bool enable)
 
std::vector< StageTimingResult > collectTimings ()
 

Detailed Description

Execution DAG for compression pipelines.

Manages buffer lifetimes, stream assignment, and level-based parallel execution. Pipeline uses this internally; prefer the Pipeline API over direct DAG access unless you need low-level control.

Note
Not thread-safe. All calls must originate from the same host thread.

Member Function Documentation

◆ addStage()

DAGNode * fz::CompressionDAG::addStage ( Stage *  stage,
const std::string &  name = "" 
)

Add a stage and return its node for wiring dependencies.

◆ addDependency()

int fz::CompressionDAG::addDependency ( DAGNode *  dependent,
DAGNode *  dependency,
size_t  buffer_size = 0,
int  output_index = 0 
)

Add a dependency between two nodes, creating an intermediate buffer.

Parameters
dependent — The node that consumes the output.
dependency — The node that produces the output.
buffer_size — Byte capacity of the intermediate buffer (0 = infer later).
output_index — Which output port of dependency to connect.
Returns
Buffer ID of the created intermediate buffer.

◆ addUnconnectedOutput()

int fz::CompressionDAG::addUnconnectedOutput ( DAGNode *  node,
size_t  size,
int  output_index,
const std::string &  tag 
)

Add a placeholder buffer for an output port that has no downstream consumer. The stage still needs a buffer for every declared output even if it's unused.

◆ connectExistingOutput()

bool fz::CompressionDAG::connectExistingOutput ( DAGNode *  producer,
DAGNode *  consumer,
int  output_index 
)

Promote an unconnected output buffer to a connected one when a consumer is wired. Reuses the existing allocation rather than creating a new buffer.

◆ finalize()

void fz::CompressionDAG::finalize ( )

Assign execution levels and streams. Must be called before execute().

◆ configureStreams()

void fz::CompressionDAG::configureStreams ( int  num_streams)

Set number of CUDA streams for parallel level execution.

◆ preallocateBuffers()

void fz::CompressionDAG::preallocateBuffers ( cudaStream_t  stream = 0)

Pre-allocate all buffers upfront. Called automatically by finalize() for PREALLOCATE strategy; call explicitly when input sizes change between runs.

◆ reset()

void fz::CompressionDAG::reset ( cudaStream_t  stream = 0)

Free non-persistent buffers and reset execution state.

◆ setExternalPointer()

void fz::CompressionDAG::setExternalPointer ( int  buffer_id,
void *  external_ptr 
)

Mark a buffer as externally managed — DAG will not allocate or free it. Use to pass user-owned device pointers directly into the DAG (zero-copy input).

◆ getTotalBufferSize()

size_t fz::CompressionDAG::getTotalBufferSize ( ) const

Used by printDAG(); not part of the stable public API.

◆ computeTopoPoolSize()

size_t fz::CompressionDAG::computeTopoPoolSize ( ) const

Peak bytes that must be held simultaneously in the pool.

PREALLOCATE: sum of all non-external buffer sizes (all live at once). MINIMAL: simulates level-by-level alloc/free to find peak concurrent live bytes.

Must be called after finalize() and propagateBufferSizes().

◆ getMaxParallelism()

int fz::CompressionDAG::getMaxParallelism ( ) const

Maximum nodes at any single level — useful for choosing stream count.

◆ getStreamCount()

size_t fz::CompressionDAG::getStreamCount ( ) const
inline

Used by Pipeline::finalize(); not part of the stable public API.

◆ enableBoundsCheck()

void fz::CompressionDAG::enableBoundsCheck ( bool  enable)
inline

Enable runtime buffer-overwrite detection. After each stage executes, checks actual output size ≤ allocated capacity. Always active in debug builds regardless of this flag.

◆ setColoringEnabled()

void fz::CompressionDAG::setColoringEnabled ( bool  enable)
inline

Enable or disable buffer coloring for PREALLOCATE mode (default: enabled). Coloring aliases non-overlapping buffers to reduce peak pool size. Must be called before finalize().

◆ setCaptureMode()

void fz::CompressionDAG::setCaptureMode ( bool  capture)

Enable or disable CUDA Graph capture mode.

When true, execute() suppresses host-synchronous operations so the call is safe inside a cudaStreamBeginCapture bracket. Throws if any stage returns false from isGraphCompatible().

◆ enableProfiling()

void fz::CompressionDAG::enableProfiling ( bool  enable)

Enable per-stage CUDA event profiling. Zero overhead when disabled. Call collectTimings() after execute() + stream sync to read results.

◆ collectTimings()

std::vector< StageTimingResult > fz::CompressionDAG::collectTimings ( )

Sync all DAG streams and collect per-stage timing results.