FZGPUModules 1.0
GPU-accelerated modular compression pipeline
Loading...
Searching...
No Matches
fz::RLEStage< T > Class Template Reference

#include <rle.h>

+ Inheritance diagram for fz::RLEStage< T >:

Public Member Functions

void setInverse (bool inverse) override
 
void execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
 
void postStreamSync (cudaStream_t stream) override
 
std::string getName () const override
 
size_t estimateScratchBytes (const std::vector< size_t > &input_sizes) const override
 
std::vector< size_t > estimateOutputSizes (const std::vector< size_t > &input_sizes) const override
 
std::unordered_map< std::string, size_t > getActualOutputSizesByName () const override
 
size_t getActualOutputSize (int index) const override
 
uint16_t getStageTypeId () const override
 
uint8_t getOutputDataType (size_t output_index) const override
 
uint8_t getInputDataType (size_t) const override
 
size_t serializeHeader (size_t output_index, uint8_t *header_buffer, size_t max_size) const override
 
void deserializeHeader (const uint8_t *header_buffer, size_t size) override
 
size_t getMaxHeaderSize (size_t output_index) const override
 
- Public Member Functions inherited from fz::Stage
virtual size_t getRequiredInputAlignment () const
 
virtual std::vector< std::string > getOutputNames () const
 
int getOutputIndex (const std::string &name) const
 
virtual void saveState ()
 
virtual void setDims (const std::array< size_t, 3 > &dims)
 
virtual bool isGraphCompatible () const
 

Detailed Description

template<typename T = uint16_t>
class fz::RLEStage< T >

Run-Length Encoding stage. Lossless; effective when data has long runs of identical values (e.g. quantized codes).

Forward wire format: [num_runs:u32][values:T×n (4B-aligned)][lengths:u32×n]

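Worst-case output (no repeated values, i.e. one run per element) follows from the wire format: 4 bytes + n × sizeof(T) (padded to 4 bytes) + n × 4 bytes for n input elements. That is 2× input + 4 bytes for a 4-byte T and proportionally more for narrower types (about 3× input for the default uint16_t), so RLE should follow a predictor/quantizer stage that creates repetition.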

Template Parameters
T: Element type (uint8_t, uint16_t, uint32_t, …). Run counts are always uint32_t.

Member Function Documentation

◆ setInverse()

template<typename T = uint16_t>
void fz::RLEStage< T >::setInverse ( bool  inverse)
inline override virtual

Switch between forward (compression) and inverse (decompression) mode. Affects getNumInputs()/getNumOutputs() for stages with asymmetric port counts.

Reimplemented from fz::Stage.

◆ execute()

template<typename T = uint16_t>
void fz::RLEStage< T >::execute ( cudaStream_t  stream,
MemoryPool *  pool,
const std::vector< void * > &  inputs,
const std::vector< void * > &  outputs,
const std::vector< size_t > &  sizes 
)
override virtual

Execute the stage. The entries of inputs and outputs are device pointers; the entries of sizes are byte counts.

Implements fz::Stage.

◆ postStreamSync()

template<typename T = uint16_t>
void fz::RLEStage< T >::postStreamSync ( cudaStream_t  stream)
override virtual

Completes the async D2H readback of num_runs started during forward execute() and sets actual_output_sizes_. Must be called after the stream passed to execute() has been synchronized.

Reimplemented from fz::Stage.

◆ getName()

template<typename T = uint16_t>
std::string fz::RLEStage< T >::getName ( ) const
inline override virtual

Human-readable name used in error messages and debug output.

Implements fz::Stage.

◆ estimateScratchBytes()

template<typename T = uint16_t>
size_t fz::RLEStage< T >::estimateScratchBytes ( const std::vector< size_t > &  input_sizes) const
inline override virtual

Persistent forward-path scratch:
  - d_is_boundary_: n bytes
  - d_boundary_scan_: n × u32
  - d_boundary_positions_: n × u32 (worst-case, avoids D2H for num_runs)
  - d_values_scratch_: n × T
  - d_lengths_scratch_: n × u32

All five arrays are sized to the largest n seen so far and reused across calls, eliminating per-call cudaMallocAsync overhead.

Reimplemented from fz::Stage.

◆ estimateOutputSizes()

template<typename T = uint16_t>
std::vector< size_t > fz::RLEStage< T >::estimateOutputSizes ( const std::vector< size_t > &  input_sizes) const
inline override virtual

Estimate output buffer sizes given input sizes. Used for buffer allocation planning in PREALLOCATE mode — must be a safe upper bound; under-estimation causes buffer overruns.

Implements fz::Stage.

◆ getActualOutputSizesByName()

template<typename T = uint16_t>
std::unordered_map< std::string, size_t > fz::RLEStage< T >::getActualOutputSizesByName ( ) const
inline override virtual

Actual output sizes after execute(), keyed by output port name.

Implements fz::Stage.

◆ getActualOutputSize()

template<typename T = uint16_t>
size_t fz::RLEStage< T >::getActualOutputSize ( int  index) const
inline override virtual

Actual size of a single output by index after execute(). Avoids constructing the map for the common single-output case. Default delegates to getActualOutputSizesByName(); override to return directly from an internal field.

Reimplemented from fz::Stage.

◆ getStageTypeId()

template<typename T = uint16_t>
uint16_t fz::RLEStage< T >::getStageTypeId ( ) const
inline override virtual

Stage type identifier written into the FZM file header.

Implements fz::Stage.

◆ getOutputDataType()

template<typename T = uint16_t>
uint8_t fz::RLEStage< T >::getOutputDataType ( size_t  output_index) const
inline override virtual

DataType enum of the given output port.

Implements fz::Stage.

◆ getInputDataType()

template<typename T = uint16_t>
uint8_t fz::RLEStage< T >::getInputDataType ( size_t  ) const
inline override virtual

Expected DataType of the given input port.

Used by Pipeline::finalize() to detect type mismatches between connected stages before any execution. Return DataType::UNKNOWN to opt out of checking — byte-transparent stages (Bitshuffle, RZE) and mock stages must return UNKNOWN; finalize() skips any connection where either side is UNKNOWN.

Reimplemented from fz::Stage.

◆ serializeHeader()

template<typename T = uint16_t>
size_t fz::RLEStage< T >::serializeHeader ( size_t  output_index,
uint8_t *  header_buffer,
size_t  max_size 
) const
inline override virtual

Serialize stage config into header_buffer (max 128 bytes) for the FZM file. Return the number of bytes written, or 0 if the stage has no config.

Reimplemented from fz::Stage.

◆ deserializeHeader()

template<typename T = uint16_t>
void fz::RLEStage< T >::deserializeHeader ( const uint8_t *  header_buffer,
size_t  size 
)
inline override virtual

Restore stage config from header_buffer during decompression.

Reimplemented from fz::Stage.

◆ getMaxHeaderSize()

template<typename T = uint16_t>
size_t fz::RLEStage< T >::getMaxHeaderSize ( size_t  output_index) const
inline override virtual

Maximum bytes this stage writes into its per-output FZM header slot.

Reimplemented from fz::Stage.