|
| void | setInverse (bool inv) override |
| |
| void | setDims (const std::array< size_t, 3 > &dims) override |
| |
| void | setTileShape (uint32_t tx, uint32_t ty=1, uint32_t tz=1) |
| |
|
uint32_t | getTileElems () const |
| | Number of elements per tile, i.e. the AdaptiveBitpack block_size that aligns blocks to tiles.
|
| |
| void | execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override |
| |
| std::string | getName () const override |
| |
| std::vector< size_t > | estimateOutputSizes (const std::vector< size_t > &input_sizes) const override |
| |
| std::unordered_map< std::string, size_t > | getActualOutputSizesByName () const override |
| |
| size_t | getActualOutputSize (int index) const override |
| |
| uint16_t | getStageTypeId () const override |
| |
| uint8_t | getOutputDataType (size_t) const override |
| |
| uint8_t | getInputDataType (size_t) const override |
| |
| size_t | serializeHeader (size_t, uint8_t *buf, size_t max_size) const override |
| |
| void | deserializeHeader (const uint8_t *buf, size_t size) override |
| |
| size_t | getMaxHeaderSize (size_t) const override |
| |
| virtual size_t | getRequiredInputAlignment () const |
| |
| virtual std::vector< std::string > | getOutputNames () const |
| |
| int | getOutputIndex (const std::string &name) const |
| |
| virtual void | saveState () |
| |
| virtual void | onFinalize (size_t, MemoryPool *) |
| |
| virtual size_t | estimateDeviceFootprintBytes (size_t) const |
| |
| virtual size_t | estimatePinnedFootprintBytes (size_t) const |
| |
| virtual void | postStreamSync (cudaStream_t stream) |
| |
| virtual bool | isGraphCompatible () const |
| |
| virtual size_t | estimateScratchBytes (const std::vector< size_t > &input_sizes) const |
| |
template<typename T>
class fz::TiledLorenzoStage< T >
Dimension-aware (tiled separable) Lorenzo predictor (cuSZp3). Lossless.
- Note
- Prior work: the dimension-aware separable delta is the cuSZp3 design (Yafan Huang et al., SC'25, BSD-3-Clause). This stage is a direct port of the cuSZp3 delta kernel logic (cuSZp_kernels_{2D,3D}_f32.cu); the tile-major modular decomposition, FZM header, and MemoryPool integration are FZGPUModules code. The cuSZp BSD-3-Clause copyright is reproduced in THIRD_PARTY.md. See also memory/cuszp_stages.md (Part 8).
- Template Parameters
-
| T | Signed element type: int16_t or int32_t. |
template<typename T >
| void fz::TiledLorenzoStage< T >::execute (cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) |
| |
|
override virtual
Execute the stage. inputs and outputs are device pointers; sizes are given in bytes.
Stages may call cudaStreamSynchronize(stream) or issue blocking D2H copies when the algorithm requires it (e.g. Huffman histogram readback for codebook construction, ANS renormalization tables). Such stages must return false from isGraphCompatible() and must document the sync points.
Note: the DAG dispatches sibling nodes (same topological level) via a sequential CPU loop, each enqueuing to its own stream. A sync inside execute() blocks the CPU from dispatching subsequent siblings until the synced stream is idle — this delays parallel branches in wide DAGs. In a linear pipeline there are no siblings and no extra cost.
Implements fz::Stage.