#include <dag.h>

Public Member Functions
DAGNode *	addStage (Stage *stage, std::string name="")

int	addDependency (DAGNode *dependent, DAGNode *dependency, size_t buffer_size=0, int output_index=0)

int	addUnconnectedOutput (DAGNode *node, size_t size, int output_index, const std::string &tag)

bool	connectExistingOutput (DAGNode *producer, DAGNode *consumer, int output_index)

void	finalize ()

void	configureStreams (int num_streams)

void	preallocateBuffers (cudaStream_t stream=0)

void	reset (cudaStream_t stream=0)

void	setExternalPointer (int buffer_id, void *external_ptr)

size_t	getTotalBufferSize () const

size_t	computeTopoPoolSize () const

int	getMaxParallelism () const

size_t	getStreamCount () const

void	enableBoundsCheck (bool enable)

void	setColoringEnabled (bool enable)

void	setCaptureMode (bool capture)

void	enableProfiling (bool enable)

std::vector< StageTimingResult >	collectTimings ()

Detailed Description

Execution DAG for compression pipelines.

Manages buffer lifetimes, stream assignment, and level-based parallel execution. Pipeline uses this internally; prefer the Pipeline API over direct DAG access unless you need low-level control.

Note: Not thread-safe. All calls must originate from the same host thread.

Member Function Documentation

◆ addStage()

DAGNode * fz::CompressionDAG::addStage	(	Stage *	stage,
		std::string	name = `""`
	)

Add a stage and return its node for wiring dependencies.

◆ addDependency()

int fz::CompressionDAG::addDependency	(	DAGNode *	dependent,
		DAGNode *	dependency,
		size_t	buffer_size = `0`,
		int	output_index = `0`
	)

Add a dependency between two nodes, creating an intermediate buffer.

Parameters

dependent	The node that consumes the output.
dependency	The node that produces the output.
buffer_size	Byte capacity of the intermediate buffer (0 = infer later).
output_index	Which output port of `dependency` to connect.

Returns: Buffer ID of the created intermediate buffer.

◆ addUnconnectedOutput()

int fz::CompressionDAG::addUnconnectedOutput	(	DAGNode *	node,
		size_t	size,
		int	output_index,
		const std::string &	tag
	)

Add a placeholder buffer for an output port that has no downstream consumer. The stage still needs a buffer for every declared output even if it's unused.

◆ connectExistingOutput()

bool fz::CompressionDAG::connectExistingOutput	(	DAGNode *	producer,
		DAGNode *	consumer,
		int	output_index
	)

Promote an unconnected output buffer to a connected one when a consumer is wired. Reuses the existing allocation rather than creating a new buffer.

◆ finalize()

void fz::CompressionDAG::finalize ( )

Assign execution levels and streams. Must be called before execute().

◆ configureStreams()

void fz::CompressionDAG::configureStreams ( int num_streams )

Set number of CUDA streams for parallel level execution.

◆ preallocateBuffers()

void fz::CompressionDAG::preallocateBuffers ( cudaStream_t stream = 0 )

Pre-allocate all buffers upfront. Called automatically by finalize() for PREALLOCATE strategy; call explicitly when input sizes change between runs.

◆ reset()

void fz::CompressionDAG::reset ( cudaStream_t stream = 0 )

Free non-persistent buffers and reset execution state.

◆ setExternalPointer()

void fz::CompressionDAG::setExternalPointer	(	int	buffer_id,
		void *	external_ptr
	)

Mark a buffer as externally managed — DAG will not allocate or free it. Use to pass user-owned device pointers directly into the DAG (zero-copy input).

◆ getTotalBufferSize()

size_t fz::CompressionDAG::getTotalBufferSize ( ) const

Used by printDAG(); not part of the stable public API.

◆ computeTopoPoolSize()

size_t fz::CompressionDAG::computeTopoPoolSize ( ) const

Peak bytes that must be held simultaneously in the pool.

PREALLOCATE: sum of all non-external buffer sizes (all live at once). MINIMAL: simulates level-by-level alloc/free to find peak concurrent live bytes.

Must be called after finalize() and propagateBufferSizes().

◆ getMaxParallelism()

int fz::CompressionDAG::getMaxParallelism ( ) const

Maximum nodes at any single level — useful for choosing stream count.

◆ getStreamCount()

size_t fz::CompressionDAG::getStreamCount ( ) const

inline

Used by Pipeline::finalize(); not part of the stable public API.

◆ enableBoundsCheck()

void fz::CompressionDAG::enableBoundsCheck ( bool enable )

inline

Enable runtime buffer-overwrite detection. After each stage executes, checks actual output size ≤ allocated capacity. Always active in debug builds regardless of this flag.

◆ setColoringEnabled()

void fz::CompressionDAG::setColoringEnabled ( bool enable )

inline

Enable or disable buffer coloring for PREALLOCATE mode (default: enabled). Coloring aliases non-overlapping buffers to reduce peak pool size. Must be called before finalize().

◆ setCaptureMode()

void fz::CompressionDAG::setCaptureMode ( bool capture )

Enable or disable CUDA Graph capture mode.

When true, execute() suppresses host-synchronous operations so the call is safe inside a cudaStreamBeginCapture bracket. Throws if any stage returns false from isGraphCompatible().

◆ enableProfiling()

void fz::CompressionDAG::enableProfiling ( bool enable )

Enable per-stage CUDA event profiling. Zero overhead when disabled. Call collectTimings() after execute() + stream sync to read results.

◆ collectTimings()

std::vector< StageTimingResult > fz::CompressionDAG::collectTimings ( )

Sync all DAG streams and collect per-stage timing results.

Public Member Functions

Detailed Description

Member Function Documentation

◆ addStage()

◆ addDependency()

◆ addUnconnectedOutput()

◆ connectExistingOutput()

◆ finalize()

◆ configureStreams()

◆ preallocateBuffers()

◆ reset()

◆ setExternalPointer()

◆ getTotalBufferSize()

◆ computeTopoPoolSize()

◆ getMaxParallelism()

◆ getStreamCount()

◆ enableBoundsCheck()

◆ setColoringEnabled()

◆ setCaptureMode()

◆ enableProfiling()

◆ collectTimings()