FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
fz::HuffmanStage< T > Class Template Reference

#include <huffman_stage.h>

+ Inheritance diagram for fz::HuffmanStage< T >:

Public Member Functions

void setBklen (uint32_t bklen)
 
void setEncodeMode (HuffmanEncodeMode mode)
 
void setInverse (bool inv) override
 
bool isGraphCompatible () const override
 
void onFinalize (size_t estimated_inlen, MemoryPool *pool) override
 
size_t estimateDeviceFootprintBytes (size_t inlen) const override
 
size_t estimatePinnedFootprintBytes (size_t inlen) const override
 
void execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
 
std::string getName () const override
 
std::vector< size_t > estimateOutputSizes (const std::vector< size_t > &input_sizes) const override
 
std::unordered_map< std::string, size_t > getActualOutputSizesByName () const override
 
size_t getActualOutputSize (int index) const override
 
uint16_t getStageTypeId () const override
 
uint8_t getOutputDataType (size_t) const override
 
uint8_t getInputDataType (size_t) const override
 
size_t serializeHeader (size_t, uint8_t *buf, size_t max_size) const override
 
void deserializeHeader (const uint8_t *buf, size_t size) override
 
size_t getMaxHeaderSize (size_t) const override
 
void saveState () override
 
- Public Member Functions inherited from fz::Stage
virtual size_t getRequiredInputAlignment () const
 
virtual std::vector< std::string > getOutputNames () const
 
int getOutputIndex (const std::string &name) const
 
virtual void setDims (const std::array< size_t, 3 > &dims)
 
virtual void postStreamSync (cudaStream_t stream)
 
virtual size_t estimateScratchBytes (const std::vector< size_t > &input_sizes) const
 

Detailed Description

template<typename T>
class fz::HuffmanStage< T >

Huffman entropy coding stage.

Forward: T[] → uint8_t[] (PHF-encoded bitstream with an embedded phf_header). Inverse: uint8_t[] → T[] (decoded symbol stream).

Note
Prior work: PHF source files (hf.h, hf_bk*.cc, hf_buf.cc, hf_canon.cc, hf_hl.cc, hf_kernels.cu, hf_impl.hh) are vendored and adapted from the cuSZ PHF codec (origin/v1.1.0_dev), by the cuSZ team (BSD-3-Clause). Changes are documented at the top of each file. See THIRD_PARTY.md.
Template Parameters
T — Input element type: uint8_t, uint16_t, or uint32_t.

Member Function Documentation

◆ setBklen()

template<typename T >
void fz::HuffmanStage< T >::setBklen ( uint32_t  bklen)
inline

Set the Huffman codebook length (number of distinct symbols).

Must be ≤ 2^(8*sizeof(T)). Typical values:
- uint8_t: 256 (covers all possible byte values)
- uint16_t: 1024 (covers quantization codes in [-512, 511])
- uint32_t: 1024 (must be set explicitly; 2^32 is too large for a codebook)

Set before the first compress() call. Changing bklen after the first execute() forces Buf reallocation on the next call (old buffers returned to pool, new ones allocated). Buf is also reallocated when inlen grows past the previously allocated capacity; shrinking inlen reuses the existing allocation. Default: 256 for uint8_t, 1024 for uint16_t/uint32_t.

◆ setEncodeMode()

template<typename T >
void fz::HuffmanStage< T >::setEncodeMode ( HuffmanEncodeMode  mode)
inline

Select the encode algorithm for the forward path.

Must be called before the first compress() / execute() call (or before the next one if changing mode at runtime — triggers Buf reallocation). Default: HuffmanEncodeMode::Coarse.

◆ setInverse()

template<typename T >
void fz::HuffmanStage< T >::setInverse ( bool  inverse)
inline override virtual

Switch between forward (compression) and inverse (decompression) mode. Affects getNumInputs()/getNumOutputs() for stages with asymmetric port counts.

Reimplemented from fz::Stage.

◆ isGraphCompatible()

template<typename T >
bool fz::HuffmanStage< T >::isGraphCompatible ( ) const
inline override virtual

Whether this stage is safe inside a CUDA Graph capture.

A stage is graph-compatible if execute() enqueues only device-side work (kernel launches, cudaMemcpyAsync D2D/H2D) and makes no host-synchronous calls. Override and return false if execute() contains D2H copies or dynamic decisions based on device data — the DAG will throw at setCaptureMode(true) time rather than producing a broken graph.

Default: true. Inverse-mode stages that do D2H reads (e.g. RZE inverse) must return false.

Reimplemented from fz::Stage.

◆ onFinalize()

template<typename T >
void fz::HuffmanStage< T >::onFinalize ( size_t  estimated_inlen,
MemoryPool *  pool 
)
override virtual

Called by Pipeline::finalize() after buffer-size propagation.

Pre-allocates phf::Buf<T> from the pool using the estimated input size so PREALLOCATE mode commits all memory at finalize time. If estimated_inlen is 0 (no size hint available), allocation is deferred to the first execute() call.

Reimplemented from fz::Stage.

◆ estimateDeviceFootprintBytes()

template<typename T >
size_t fz::HuffmanStage< T >::estimateDeviceFootprintBytes ( size_t  ) const
override virtual

Estimated persistent device memory this stage allocates outside the pool (via pool->allocatePersistentDevice). Used for total footprint reporting. Default: 0.

Reimplemented from fz::Stage.

◆ estimatePinnedFootprintBytes()

template<typename T >
size_t fz::HuffmanStage< T >::estimatePinnedFootprintBytes ( size_t  ) const
override virtual

Estimated persistent pinned-host memory this stage allocates outside the pool (via pool->allocatePersistentPinned). Used for total footprint reporting. Default: 0.

Reimplemented from fz::Stage.

◆ execute()

template<typename T >
void fz::HuffmanStage< T >::execute ( cudaStream_t  stream,
MemoryPool *  pool,
const std::vector< void * > &  inputs,
const std::vector< void * > &  outputs,
const std::vector< size_t > &  sizes 
)
override virtual

Execute the stage. inputs and outputs are device pointers; sizes are given in bytes.

Stages may call cudaStreamSynchronize(stream) or issue blocking D2H copies when the algorithm requires it (e.g. Huffman histogram readback for codebook construction, ANS renormalization tables). Such stages must return false from isGraphCompatible() and must document the sync points.

Note: the DAG dispatches sibling nodes (same topological level) via a sequential CPU loop, each enqueuing to its own stream. A sync inside execute() blocks the CPU from dispatching subsequent siblings until the synced stream is idle — this delays parallel branches in wide DAGs. In a linear pipeline there are no siblings and no extra cost.

Implements fz::Stage.

◆ getName()

template<typename T >
std::string fz::HuffmanStage< T >::getName ( ) const
inline override virtual

Human-readable name used in error messages and debug output.

Implements fz::Stage.

◆ estimateOutputSizes()

template<typename T >
std::vector< size_t > fz::HuffmanStage< T >::estimateOutputSizes ( const std::vector< size_t > &  input_sizes) const
inline override virtual

Estimate output buffer sizes given input sizes. Used for buffer allocation planning in PREALLOCATE mode — must be a safe upper bound; under-estimation causes buffer overruns.

Implements fz::Stage.

◆ getActualOutputSizesByName()

template<typename T >
std::unordered_map< std::string, size_t > fz::HuffmanStage< T >::getActualOutputSizesByName ( ) const
inline override virtual

Actual output sizes after execute(), keyed by output port name.

Implements fz::Stage.

◆ getActualOutputSize()

template<typename T >
size_t fz::HuffmanStage< T >::getActualOutputSize ( int  index) const
inline override virtual

Actual size of a single output by index after execute(). Avoids constructing the map for the common single-output case. Default delegates to getActualOutputSizesByName(); override to return directly from an internal field.

Reimplemented from fz::Stage.

◆ getStageTypeId()

template<typename T >
uint16_t fz::HuffmanStage< T >::getStageTypeId ( ) const
inline override virtual

Stage type identifier written into the FZM file header.

Implements fz::Stage.

◆ getOutputDataType()

template<typename T >
uint8_t fz::HuffmanStage< T >::getOutputDataType ( size_t  output_index) const
inline override virtual

DataType enum of the given output port.

Implements fz::Stage.

◆ getInputDataType()

template<typename T >
uint8_t fz::HuffmanStage< T >::getInputDataType ( size_t  ) const
inline override virtual

Expected DataType of the given input port.

Used by Pipeline::finalize() to detect type mismatches between connected stages before any execution. Return DataType::UNKNOWN to opt out of checking — byte-transparent stages (Bitshuffle, RZE) and mock stages must return UNKNOWN; finalize() skips any connection where either side is UNKNOWN.

Reimplemented from fz::Stage.

◆ serializeHeader()

template<typename T >
size_t fz::HuffmanStage< T >::serializeHeader ( size_t  output_index,
uint8_t *  header_buffer,
size_t  max_size 
) const
inline override virtual

Serialize stage config into header_buffer (max 128 bytes) for the FZM file. Return the number of bytes written, or 0 if the stage has no config.

Reimplemented from fz::Stage.

◆ deserializeHeader()

template<typename T >
void fz::HuffmanStage< T >::deserializeHeader ( const uint8_t *  header_buffer,
size_t  size 
)
inline override virtual

Restore stage config from header_buffer during decompression.

Reimplemented from fz::Stage.

◆ getMaxHeaderSize()

template<typename T >
size_t fz::HuffmanStage< T >::getMaxHeaderSize ( size_t  output_index) const
inline override virtual

Maximum bytes this stage writes into its per-output FZM header slot.

Reimplemented from fz::Stage.

◆ saveState()

template<typename T >
void fz::HuffmanStage< T >::saveState ( )
inline override virtual

Save/restore config state around a decompression pass. deserializeHeader() overwrites the stage's forward-pass config; saveState() is called before and restoreState() after so the stage returns to its original configuration.

Reimplemented from fz::Stage.