|
| void | setInverse (bool inv) override |
| |
| bool | isGraphCompatible () const override |
| |
| size_t | getRequiredInputAlignment () const override |
| |
| void | execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override |
| |
| void | postStreamSync (cudaStream_t stream) override |
| |
| std::string | getName () const override |
| |
| std::vector< size_t > | estimateOutputSizes (const std::vector< size_t > &input_sizes) const override |
| |
| std::unordered_map< std::string, size_t > | getActualOutputSizesByName () const override |
| |
| size_t | getActualOutputSize (int index) const override |
| |
| size_t | estimateScratchBytes (const std::vector< size_t > &input_sizes) const override |
| |
| uint16_t | getStageTypeId () const override |
| |
| uint8_t | getOutputDataType (size_t) const override |
| |
| size_t | serializeHeader (size_t output_index, uint8_t *buf, size_t max_size) const override |
| |
| void | deserializeHeader (const uint8_t *buf, size_t size) override |
| |
| size_t | getMaxHeaderSize (size_t) const override |
| |
| void | saveState () override |
| |
| virtual std::vector< std::string > | getOutputNames () const |
| |
| int | getOutputIndex (const std::string &name) const |
| |
| virtual uint8_t | getInputDataType (size_t) const |
| |
| virtual void | setDims (const std::array< size_t, 3 > &dims) |
| |
| virtual void | onFinalize (size_t, MemoryPool *) |
| |
| virtual size_t | estimateDeviceFootprintBytes (size_t) const |
| |
| virtual size_t | estimatePinnedFootprintBytes (size_t) const |
| |
Zero-Elimination Encoding stage.
setChunkSize(bytes) — chunk size (default 16384; only 16384 is supported). setWordSize(bytes) — word granularity 1/2/4/8 (default 1 = LC RZE_1).
- Note
- Prior work: the GPU kernels are a faithful port of d_RZE.h, d_zero_elimination.h, and d_repetition_elimination.h from the LC framework (Burtscher et al., BSD-3-Clause), shared with RREStage via modules/coders/lc_common/lc_chunk_components.cuh. See THIRD_PARTY.md.
-
CUDA Graph capture is supported for compression only. The inverse path reads the stream header with blocking D2H copies before it can launch the decode kernel (same constraint as RREStage).
| void fz::RZEStage::execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) |
|
override virtual
Execute the stage. The entries of inputs and outputs are device pointers; the entries of sizes are byte counts.
Stages may call cudaStreamSynchronize(stream) or issue blocking D2H copies when the algorithm requires it (e.g. Huffman histogram readback for codebook construction, ANS renormalization tables). Such stages must return false from isGraphCompatible() and must document the sync points.
Note: the DAG dispatches sibling nodes (same topological level) via a sequential CPU loop, each enqueuing to its own stream. A sync inside execute() blocks the CPU from dispatching subsequent siblings until the synced stream is idle — this delays parallel branches in wide DAGs. In a linear pipeline there are no siblings and no extra cost.
Implements fz::Stage.