#include <lorenzo_quant.h>

Inheritance diagram for fz::LorenzoQuantStage< TInput, TCode >:

Classes
struct	Config

Public Member Functions
void	execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override

void	postStreamSync (cudaStream_t stream) override

void	onFinalize (size_t estimated_inlen, MemoryPool *pool) override

size_t	estimateDeviceFootprintBytes (size_t) const override

std::string	getName () const override

std::vector< std::string >	getOutputNames () const override

std::vector< size_t >	estimateOutputSizes (const std::vector< size_t > &input_sizes) const override

std::unordered_map< std::string, size_t >	getActualOutputSizesByName () const override

size_t	getActualOutputSize (int index) const override

void	saveState () override

void	setDims (const std::array< size_t, 3 > &dims) override

void	setErrorBoundMode (ErrorBoundMode mode)

int	ndim () const
	Returns the effective spatial dimensionality (1, 2, or 3).

void	setInverse (bool inverse)

uint16_t	getStageTypeId () const override

uint8_t	getOutputDataType (size_t output_index) const override

uint8_t	getInputDataType (size_t) const override

size_t	serializeHeader (size_t output_index, uint8_t *header_buffer, size_t max_size) const override

size_t	getMaxHeaderSize (size_t output_index) const override

void	deserializeHeader (const uint8_t *header_buffer, size_t size) override

Public Member Functions inherited from fz::Stage
virtual size_t	getRequiredInputAlignment () const

int	getOutputIndex (const std::string &name) const

virtual size_t	estimatePinnedFootprintBytes (size_t) const

virtual bool	isGraphCompatible () const

virtual size_t	estimateScratchBytes (const std::vector< size_t > &input_sizes) const

Detailed Description

template<typename TInput = float, typename TCode = uint16_t>
class fz::LorenzoQuantStage< TInput, TCode >

Lorenzo predictor with error-bounded quantization (1-D, 2-D, 3-D).

Note (prior work): the fused predictor+quantizer kernels and the multi-output design follow the cuSZ Lorenzo implementation (lrz_c.cuhip.inl, lrz_x.cuhip.inl) by the cuSZ team (BSD-3-Clause). See THIRD_PARTY.md.

Forward outputs (compression):

[0] codes — quantization codes for all elements (TCode)
[1] outlier_errors — prediction errors for outliers (TInput)
[2] outlier_indices — outlier element indices (uint32_t)

The outlier count is not a DAG output port — it lives in a stage-private 4-byte device scratch buffer (allocated via pool->allocatePersistentDevice, either in onFinalize() in PREALLOCATE mode or on the first compress execute() in MINIMAL mode), is copied device-to-host in postStreamSync(), and is serialized in the FZM header. The inverse path receives it as a uint32_t kernel-launch argument (read from the deserialized header), so the inverse kernel never has to dereference a device pointer to know its loop bound.

Inverse (decompression): takes the three forward outputs, produces the reconstructed data as a single TInput array.

Template Parameters

TInput	Floating-point input type (`float` or `double`).
TCode	Quantization code type (`uint8_t`, `uint16_t`, `uint32_t`).

Member Function Documentation

◆ execute()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::execute	(	cudaStream_t	stream,
		MemoryPool *	pool,
		const std::vector< void * > &	inputs,
		const std::vector< void * > &	outputs,
		const std::vector< size_t > &	sizes
	)

override virtual

Execute the stage. The entries of inputs and outputs are device pointers; the entries of sizes are byte counts.

Stages may call cudaStreamSynchronize(stream) or issue blocking D2H copies when the algorithm requires it (e.g. Huffman histogram readback for codebook construction, ANS renormalization tables). Such stages must return false from isGraphCompatible() and must document the sync points.

Note: the DAG dispatches sibling nodes (same topological level) via a sequential CPU loop, each enqueuing to its own stream. A sync inside execute() blocks the CPU from dispatching subsequent siblings until the synced stream is idle — this delays parallel branches in wide DAGs. In a linear pipeline there are no siblings and no extra cost.

Implements fz::Stage.

◆ postStreamSync()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::postStreamSync ( cudaStream_t stream )

override virtual

Reads back the actual outlier count from the device (4 bytes) and trims actual_output_sizes_ to the real values. Called by Pipeline::compress() after the stream is synchronized — avoids a mid-pipeline stall.

Reimplemented from fz::Stage.

◆ onFinalize()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::onFinalize	(	size_t	estimated_inlen,
		MemoryPool *	pool
	)

override virtual

Pre-allocate the stage-private 4-byte outlier-count device scratch (via pool->allocatePersistentDevice) in PREALLOCATE mode. In MINIMAL mode this is deferred to the first compress execute(). The 4-byte scratch lives for the stage's lifetime.

Reimplemented from fz::Stage.

◆ estimateDeviceFootprintBytes()

template<typename TInput = float, typename TCode = uint16_t>

size_t fz::LorenzoQuantStage< TInput, TCode >::estimateDeviceFootprintBytes ( size_t ) const

inline override virtual

Estimated persistent device memory this stage allocates outside the pool (via pool->allocatePersistentDevice). Used for total footprint reporting. Default: 0.

Reimplemented from fz::Stage.

◆ getName()

template<typename TInput = float, typename TCode = uint16_t>

std::string fz::LorenzoQuantStage< TInput, TCode >::getName ( ) const

inline override virtual

Human-readable name used in error messages and debug output.

Implements fz::Stage.

◆ getOutputNames()

template<typename TInput = float, typename TCode = uint16_t>

std::vector< std::string > fz::LorenzoQuantStage< TInput, TCode >::getOutputNames ( ) const

inline override virtual

Output port names in order. Default: single port named "output". Multi-output stages override this (e.g. Lorenzo: "codes", "outlier_errors", "outlier_indices").

Reimplemented from fz::Stage.

◆ estimateOutputSizes()

template<typename TInput = float, typename TCode = uint16_t>

std::vector< size_t > fz::LorenzoQuantStage< TInput, TCode >::estimateOutputSizes ( const std::vector< size_t > & input_sizes ) const

override virtual

Estimate output buffer sizes given input sizes. Used for buffer allocation planning in PREALLOCATE mode — must be a safe upper bound; under-estimation causes buffer overruns.

Implements fz::Stage.

◆ getActualOutputSizesByName()

template<typename TInput = float, typename TCode = uint16_t>

std::unordered_map< std::string, size_t > fz::LorenzoQuantStage< TInput, TCode >::getActualOutputSizesByName ( ) const

inline override virtual

Actual output sizes after execute(), keyed by output port name.

Implements fz::Stage.

◆ getActualOutputSize()

template<typename TInput = float, typename TCode = uint16_t>

size_t fz::LorenzoQuantStage< TInput, TCode >::getActualOutputSize ( int index ) const

inline override virtual

Actual size of a single output by index after execute(). Avoids constructing the map for the common single-output case. Default delegates to getActualOutputSizesByName(); override to return directly from an internal field.

Reimplemented from fz::Stage.

◆ saveState()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::saveState ( )

inline override virtual

Save/restore config state around a decompression pass. deserializeHeader() overwrites the stage's forward-pass config; saveState() is called before and restoreState() after so the stage returns to its original configuration.

Reimplemented from fz::Stage.

◆ setDims()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::setDims ( const std::array< size_t, 3 > & dims )

inline override virtual

Called once by Pipeline::finalize() so stages can react to the dataset dimensions set via Pipeline::setDims() after construction.

Parameters

dims	{x, y, z} extents (z==1 → 2-D; y==z==1 → 1-D)

Reimplemented from fz::Stage.

◆ setErrorBoundMode()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::setErrorBoundMode ( ErrorBoundMode mode )

inline

REL here is global-approximate (abs_eb = eb * max(|data|)), NOT the exact per-element PFPL bound — use QuantizerStage REL for that. See the error-bound mode notes in the file-level doc.

◆ setInverse()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::setInverse ( bool inverse )

inline virtual

Switch between forward (compression) and inverse (decompression) mode. Affects getNumInputs()/getNumOutputs() for stages with asymmetric port counts.

Reimplemented from fz::Stage.

◆ getStageTypeId()

template<typename TInput = float, typename TCode = uint16_t>

uint16_t fz::LorenzoQuantStage< TInput, TCode >::getStageTypeId ( ) const

inline override virtual

Stage type identifier written into the FZM file header.

Implements fz::Stage.

◆ getOutputDataType()

template<typename TInput = float, typename TCode = uint16_t>

uint8_t fz::LorenzoQuantStage< TInput, TCode >::getOutputDataType ( size_t output_index ) const

inline override virtual

DataType enum of the given output port.

Implements fz::Stage.

◆ getInputDataType()

template<typename TInput = float, typename TCode = uint16_t>

uint8_t fz::LorenzoQuantStage< TInput, TCode >::getInputDataType ( size_t ) const

inline override virtual

Expected DataType of the given input port.

Used by Pipeline::finalize() to detect type mismatches between connected stages before any execution. Return DataType::UNKNOWN to opt out of checking — byte-transparent stages (Bitshuffle, RZE, RRE) and mock stages must return UNKNOWN; finalize() skips any connection where either side is UNKNOWN.

Reimplemented from fz::Stage.

◆ serializeHeader()

template<typename TInput = float, typename TCode = uint16_t>

size_t fz::LorenzoQuantStage< TInput, TCode >::serializeHeader	(	size_t	output_index,
		uint8_t *	header_buffer,
		size_t	max_size
	)		const

inline override virtual

Serialize stage config into header_buffer (max 128 bytes) for the FZM file. Return the number of bytes written, or 0 if the stage has no config.

Reimplemented from fz::Stage.

◆ getMaxHeaderSize()

template<typename TInput = float, typename TCode = uint16_t>

size_t fz::LorenzoQuantStage< TInput, TCode >::getMaxHeaderSize ( size_t output_index ) const

inline override virtual

Maximum bytes this stage writes into its per-output FZM header slot.

Reimplemented from fz::Stage.

◆ deserializeHeader()

template<typename TInput = float, typename TCode = uint16_t>

void fz::LorenzoQuantStage< TInput, TCode >::deserializeHeader	(	const uint8_t *	header_buffer,
		size_t	size
	)

inline override virtual

Restore stage config from header_buffer during decompression.

Reimplemented from fz::Stage.

Classes

Public Member Functions

Detailed Description

Member Function Documentation

◆ execute()

◆ postStreamSync()

◆ onFinalize()

◆ estimateDeviceFootprintBytes()

◆ getName()

◆ getOutputNames()

◆ estimateOutputSizes()

◆ getActualOutputSizesByName()

◆ getActualOutputSize()

◆ saveState()

◆ setDims()

◆ setErrorBoundMode()

◆ setInverse()

◆ getStageTypeId()

◆ getOutputDataType()

◆ getInputDataType()

◆ serializeHeader()

◆ getMaxHeaderSize()

◆ deserializeHeader()