|
| void | execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override |
| |
| void | postStreamSync (cudaStream_t stream) override |
| |
| void | onFinalize (size_t estimated_inlen, MemoryPool *pool) override |
| |
| size_t | estimateDeviceFootprintBytes (size_t) const override |
| |
| std::string | getName () const override |
| |
| std::vector< std::string > | getOutputNames () const override |
| |
| std::vector< size_t > | estimateOutputSizes (const std::vector< size_t > &input_sizes) const override |
| |
| std::unordered_map< std::string, size_t > | getActualOutputSizesByName () const override |
| |
| size_t | getActualOutputSize (int index) const override |
| |
| void | saveState () override |
| |
| void | setDims (const std::array< size_t, 3 > &dims) override |
| |
| void | setErrorBoundMode (ErrorBoundMode mode) |
| |
| int | ndim () const |
| | Returns the effective spatial dimensionality (1, 2, or 3). |
| |
| void | setInverse (bool inverse) |
| |
| uint16_t | getStageTypeId () const override |
| |
| uint8_t | getOutputDataType (size_t output_index) const override |
| |
| uint8_t | getInputDataType (size_t) const override |
| |
| size_t | serializeHeader (size_t output_index, uint8_t *header_buffer, size_t max_size) const override |
| |
| size_t | getMaxHeaderSize (size_t output_index) const override |
| |
| void | deserializeHeader (const uint8_t *header_buffer, size_t size) override |
| |
| virtual size_t | getRequiredInputAlignment () const |
| |
| int | getOutputIndex (const std::string &name) const |
| |
| virtual size_t | estimatePinnedFootprintBytes (size_t) const |
| |
| virtual bool | isGraphCompatible () const |
| |
| virtual size_t | estimateScratchBytes (const std::vector< size_t > &input_sizes) const |
| |
template<typename TInput = float, typename TCode = uint16_t>
class fz::LorenzoQuantStage< TInput, TCode >
Lorenzo predictor with error-bounded quantization (1-D, 2-D, 3-D).
- Note
- Prior work: the fused predictor+quantizer kernels and the multi-output design follow the cuSZ Lorenzo implementation (lrz_c.cuhip.inl, lrz_x.cuhip.inl) by the cuSZ team (BSD-3-Clause). See THIRD_PARTY.md.
Forward outputs (compression):
- [0] codes — quantization codes for all elements (TCode)
- [1] outlier_errors — prediction errors for outliers (TInput)
- [2] outlier_indices — outlier element indices (uint32_t)
The outlier count is not a DAG output port — it lives in a stage-private 4-byte device scratch buffer (allocated via pool->allocatePersistentDevice in onFinalize()), is copied device-to-host in postStreamSync(), and is serialized in the FZM header. The inverse path receives it as a uint32_t kernel-launch argument (read from the deserialized header), so the inverse kernel never has to dereference a device pointer to know its loop bound.
Inverse (decompression): takes the three forward outputs, produces the reconstructed data as a single TInput array.
- Template Parameters
-
| TInput | Floating-point input type (float or double). |
| TCode | Quantization code type (uint8_t, uint16_t, uint32_t). |
template<typename TInput = float, typename TCode = uint16_t>
void fz::LorenzoQuantStage< TInput, TCode >::execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes)
override virtual
Execute the stage. Inputs, outputs, and sizes are device pointers/bytes.
Stages may call cudaStreamSynchronize(stream) or issue blocking D2H copies when the algorithm requires it (e.g. Huffman histogram readback for codebook construction, ANS renormalization tables). Such stages must return false from isGraphCompatible() and must document the sync points.
Note: the DAG dispatches sibling nodes (same topological level) via a sequential CPU loop, each enqueuing to its own stream. A sync inside execute() blocks the CPU from dispatching subsequent siblings until the synced stream is idle — this delays parallel branches in wide DAGs. In a linear pipeline there are no siblings and no extra cost.
Implements fz::Stage.
template<typename TInput = float, typename TCode = uint16_t>
uint8_t fz::LorenzoQuantStage< TInput, TCode >::getInputDataType (size_t) const
override virtual
Expected DataType of the given input port.
Used by Pipeline::finalize() to detect type mismatches between connected stages before any execution. Return DataType::UNKNOWN to opt out of checking — byte-transparent stages (Bitshuffle, RZE, RRE) and mock stages must return UNKNOWN; finalize() skips any connection where either side is UNKNOWN.
Reimplemented from fz::Stage.