FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
lorenzo_quant.h
Go to the documentation of this file.
1
5#pragma once
6
7#include "stage/stage.h"
8#include "fzm_format.h"
9#include <cuda_runtime.h>
10#include <array>
11#include <cstdint>
12#include <cmath>
13#include <cstring>
14
15namespace fz {
16
17
/// Selects how the user-supplied error bound is interpreted.
/// The numeric values are stored as a single byte in LorenzoQuantConfig::eb_mode.
enum class ErrorBoundMode : uint8_t {
    ABS = 0,  ///< Absolute error bound.
    REL = 1,  ///< Global-approximate point-wise relative bound.
    NOA = 2,  ///< Value-range relative bound (norm-of-absolute).
};
35
// Serialized per-stage header fields for the Lorenzo quantization stage.
// NOTE(review): this listing is missing the struct's opening line, its first
// field (error_bound), the input_type/code_type fields and the first part of
// the constructor's initializer list; only the visible fields are annotated.

// Quantization radius.
45 uint32_t quant_radius;
// Total element count.
46 uint32_t num_elements;
// Actual number of outliers.
47 uint32_t outlier_count;
// Spatial dimensionality 1/2/3 (0 treated as 1).
50 uint8_t ndim;
// ErrorBoundMode cast to uint8_t.
51 uint8_t eb_mode;
// X (fast) dimension; 0 = infer from num_elements.
52 uint32_t dim_x;
// Y dimension (1 for 1-D).
53 uint32_t dim_y;
// Z dimension (1 for 1-D/2-D).
54 uint32_t dim_z;
// Original user-specified error bound value.
55 float user_eb;
// value_range (NOA) or max(|data|) (REL) used in conversion.
56 float value_base;
// 1 if codes are zigzag-encoded, else 0.
57 uint8_t zigzag_codes;
// Must be zero.
58 uint8_t reserved[3];
59
60 // Total: 44 bytes (fits easily in 128B stage_config)
61
// Tail of the default constructor's initializer list: float32 input, uint16
// codes, 1-D layout with dims (0, 1, 1), eb_mode 0, and all remaining fields zero.
64 input_type(DataType::FLOAT32), code_type(DataType::UINT16),
65 ndim(1), eb_mode(0), dim_x(0), dim_y(1), dim_z(1),
66 user_eb(0.0f), value_base(0.0f), zigzag_codes(0), reserved{0, 0, 0} {}
67};
// Compile-time guard: the serialized struct must fit in one per-stage config slot.
68static_assert(sizeof(LorenzoQuantConfig) <= FZM_STAGE_CONFIG_SIZE, "LorenzoQuantConfig must fit in FZM_STAGE_CONFIG_SIZE");
69
95template<typename TInput = float, typename TCode = uint16_t>
96class LorenzoQuantStage : public Stage {
97public:
99 struct Config {
100 float error_bound = 1e-3;
101 int quant_radius = 32768;
102 float outlier_capacity = 0.2f;
106 std::array<size_t, 3> dims = {0, 1, 1};
113 bool zigzag_codes = false;
114 Config() = default;
115 Config(TInput eb, TCode radius = 32768, float outlier_cap = 0.2f,
116 std::array<size_t, 3> d = {0, 1, 1})
117 : error_bound(eb), quant_radius(radius), outlier_capacity(outlier_cap),
118 dims(d) {}
119 };
120
121 explicit LorenzoQuantStage(const Config& config = Config());
122 ~LorenzoQuantStage() override;
123
125 cudaStream_t stream,
126 MemoryPool* pool,
127 const std::vector<void*>& inputs,
128 const std::vector<void*>& outputs,
129 const std::vector<size_t>& sizes
130 ) override;
131
137 void postStreamSync(cudaStream_t stream) override;
138
143 void onFinalize(size_t estimated_inlen, MemoryPool* pool) override;
144
145 size_t estimateDeviceFootprintBytes(size_t /*estimated_inlen*/) const override {
146 return sizeof(uint32_t);
147 }
148
149 std::string getName() const override { return "LorenzoQuant"; }
150 size_t getNumInputs() const override { return is_inverse_ ? 3 : 1; }
151 size_t getNumOutputs() const override { return is_inverse_ ? 1 : 3; }
152
153 std::vector<std::string> getOutputNames() const override {
154 return {"codes", "outlier_errors", "outlier_indices"};
155 }
156
157 std::vector<size_t> estimateOutputSizes(
158 const std::vector<size_t>& input_sizes
159 ) const override;
160
161 std::unordered_map<std::string, size_t> getActualOutputSizesByName() const override {
162 auto names = getOutputNames();
163 std::unordered_map<std::string, size_t> result;
164 for (size_t i = 0; i < names.size() && i < actual_output_sizes_.size(); i++) {
165 result[names[i]] = actual_output_sizes_[i];
166 }
167 return result;
168 }
169 size_t getActualOutputSize(int index) const override {
170 return (index >= 0 && index < static_cast<int>(actual_output_sizes_.size()))
171 ? actual_output_sizes_[index] : 0;
172 }
173
174 // Preserve the forward-mode actual_output_sizes_ across decompression passes.
175 // decompressMulti() calls saveState()/restoreState() around each inverse
176 // execute() to prevent the inverse pass from permanently corrupting the
177 // 4-element forward output-size vector (inverse sets it to a 1-element vector).
178 void saveState() override { saved_output_sizes_ = actual_output_sizes_; }
179 void restoreState() override { actual_output_sizes_ = saved_output_sizes_; }
180
181 // Configuration accessors
182 void setErrorBound(TInput error_bound) { config_.error_bound = error_bound; }
183 void setQuantRadius(TCode radius) { config_.quant_radius = radius; }
184 void setOutlierCapacity(float capacity) { config_.outlier_capacity = capacity; }
185 void setDims(const std::array<size_t, 3>& dims) override { config_.dims = dims; }
189 void setErrorBoundMode(ErrorBoundMode mode) { config_.eb_mode = mode; }
190 // Provide a pre-computed value_range (NOA) or max(|data|) (REL) to skip
191 // the internal data scan during execute(). Pass 0 to re-enable auto-scan.
192 void setValueBase(float value_base) { config_.precomputed_value_base = value_base; }
193 void setZigzagCodes(bool enable) { config_.zigzag_codes = enable; }
194 void setDims(size_t x, size_t y = 1, size_t z = 1) { config_.dims = {x, y, z}; }
195
196 TInput getErrorBound() const { return config_.error_bound; }
197 TCode getQuantRadius() const { return config_.quant_radius; }
198 float getOutlierCapacity() const { return config_.outlier_capacity; }
199 std::array<size_t, 3> getDims() const { return config_.dims; }
200 ErrorBoundMode getErrorBoundMode() const { return config_.eb_mode; }
201 float getValueBase() const { return config_.precomputed_value_base; }
202 bool getZigzagCodes() const { return config_.zigzag_codes; }
203
205 int ndim() const {
206 if (config_.dims[2] > 1) return 3;
207 if (config_.dims[1] > 1) return 2;
208 return 1;
209 }
210
211 void setInverse(bool inverse) { is_inverse_ = inverse; }
212 bool isInverse() const { return is_inverse_; }
213
214 // ── Serialization ─────────────────────────────────────────────────────────
215
216 uint16_t getStageTypeId() const override {
217 return static_cast<uint16_t>(StageType::LORENZO_QUANT);
218 }
219
220 uint8_t getOutputDataType(size_t output_index) const override {
221 switch (output_index) {
222 case 0: return static_cast<uint8_t>(getCodeDataType()); // codes
223 case 1: return static_cast<uint8_t>(getInputDataType()); // outlier_errors
224 case 2: return static_cast<uint8_t>(DataType::UINT32); // outlier_indices
225 default: return static_cast<uint8_t>(DataType::UINT8);
226 }
227 }
228
229 uint8_t getInputDataType(size_t /*input_index*/) const override {
230 return static_cast<uint8_t>(getInputDataType());
231 }
232
233 size_t serializeHeader(size_t output_index, uint8_t* header_buffer, size_t max_size) const override {
234 (void)output_index; // Lorenzo uses same header for all outputs
235
236 if (max_size < sizeof(LorenzoQuantConfig)) {
237 throw std::runtime_error("Insufficient buffer for Lorenzo config");
238 }
239
240 LorenzoQuantConfig config;
241 config.error_bound = static_cast<float>(computed_abs_eb_); // abs bound used by decompressor
242 config.quant_radius = static_cast<uint32_t>(config_.quant_radius);
243 config.num_elements = static_cast<uint32_t>(num_elements_);
244 config.outlier_count = actual_outlier_count_;
245 config.input_type = getInputDataType();
246 config.code_type = getCodeDataType();
247 config.ndim = static_cast<uint8_t>(ndim());
248 config.eb_mode = static_cast<uint8_t>(config_.eb_mode);
249 config.dim_x = static_cast<uint32_t>(config_.dims[0]);
250 config.dim_y = static_cast<uint32_t>(config_.dims[1]);
251 config.dim_z = static_cast<uint32_t>(config_.dims[2]);
252 config.user_eb = static_cast<float>(config_.error_bound); // original user-specified value
253 config.value_base = computed_value_base_;
254 config.zigzag_codes = config_.zigzag_codes ? uint8_t{1} : uint8_t{0};
255 config.reserved[0] = 0; config.reserved[1] = 0; config.reserved[2] = 0;
256
257 std::memcpy(header_buffer, &config, sizeof(LorenzoQuantConfig));
258 return sizeof(LorenzoQuantConfig);
259 }
260
261 size_t getMaxHeaderSize(size_t output_index) const override {
262 (void)output_index;
263 return sizeof(LorenzoQuantConfig);
264 }
265
266 void deserializeHeader(const uint8_t* header_buffer, size_t size) override {
267 // Minimum size is the original 32-byte layout (before user_eb/value_base were added).
268 constexpr size_t kLegacySize = 32;
269 if (size < kLegacySize) {
270 throw std::runtime_error("Invalid Lorenzo config size");
271 }
272
273 LorenzoQuantConfig config;
274 std::memcpy(&config, header_buffer, std::min(size, sizeof(LorenzoQuantConfig)));
275
276 // error_bound in the header is always the absolute bound used at compression.
277 config_.error_bound = config.error_bound;
278 computed_abs_eb_ = static_cast<TInput>(config.error_bound);
279 config_.quant_radius = static_cast<TCode>(config.quant_radius);
280 num_elements_ = config.num_elements;
281 actual_outlier_count_= config.outlier_count;
282 // New fields: present only in headers written by v1+ (≥40B, added user_eb/value_base/eb_mode).
283 constexpr size_t kV1Size = 40;
284 if (size >= kV1Size) {
285 config_.eb_mode = static_cast<ErrorBoundMode>(config.eb_mode);
286 config_.precomputed_value_base = config.value_base;
287 computed_value_base_ = config.value_base;
288 } else {
289 config_.eb_mode = ErrorBoundMode::ABS;
290 config_.precomputed_value_base = 0.0f;
291 computed_value_base_ = 0.0f;
292 }
293 // zigzag_codes field added in v2 (≥44B).
294 if (size >= sizeof(LorenzoQuantConfig)) {
295 config_.zigzag_codes = (config.zigzag_codes != 0);
296 } else {
297 config_.zigzag_codes = false;
298 }
299
300 // Restore spatial dimensions; handle old (pre-dims) files gracefully
301 int eff_ndim = (config.ndim == 0) ? 1 : static_cast<int>(config.ndim);
302 // dim_x: stored explicitly; fall back to derivation for old files
303 if (config.dim_x > 0) {
304 config_.dims[0] = config.dim_x;
305 } else if (config.num_elements > 0) {
306 size_t yz = std::max<size_t>(1, config.dim_y) * std::max<size_t>(1, config.dim_z);
307 config_.dims[0] = config.num_elements / yz;
308 } else {
309 config_.dims[0] = 0;
310 }
311 if (eff_ndim >= 2) {
312 config_.dims[1] = (config.dim_y > 0) ? config.dim_y : 1;
313 } else {
314 config_.dims[1] = 1;
315 }
316 if (eff_ndim >= 3) {
317 config_.dims[2] = (config.dim_z > 0) ? config.dim_z : 1;
318 } else {
319 config_.dims[2] = 1;
320 }
321 }
322
323private:
324 Config config_;
325 std::vector<size_t> actual_output_sizes_;
326 std::vector<size_t> saved_output_sizes_; // saved by saveState(), restored by restoreState()
327 size_t num_elements_ = 0; // Track for header
328 uint32_t actual_outlier_count_ = 0; // Track for header
329 bool is_inverse_ = false; // false = compress, true = decompress
333 TInput computed_abs_eb_ = 0;
336 float computed_value_base_ = 0.0f;
343 uint32_t* d_outlier_count_scratch_ = nullptr;
346 MemoryPool* persistent_pool_ = nullptr;
347
350 void initOutlierCountScratch(MemoryPool* pool);
351
352 DataType getInputDataType() const {
353 if (std::is_same<TInput, float>::value) return DataType::FLOAT32;
354 if (std::is_same<TInput, double>::value) return DataType::FLOAT64;
355 return DataType::FLOAT32;
356 }
357
358 DataType getCodeDataType() const {
359 if (std::is_same<TCode, uint8_t>::value) return DataType::UINT8;
360 if (std::is_same<TCode, uint16_t>::value) return DataType::UINT16;
361 if (std::is_same<TCode, uint32_t>::value) return DataType::UINT32;
362 return DataType::UINT16;
363 }
364
365 size_t getMaxOutlierCount(size_t num_elements) const {
366 return static_cast<size_t>(std::ceil(num_elements * config_.outlier_capacity));
367 }
368};
369
370extern template class LorenzoQuantStage<float, uint16_t>;
371extern template class LorenzoQuantStage<float, uint8_t>;
372extern template class LorenzoQuantStage<double, uint16_t>;
373extern template class LorenzoQuantStage<double, uint32_t>;
374
375// Kernel launcher declarations — defined in lorenzo.cu.
376
377template<typename TInput, typename TCode>
378void launchLorenzoKernel(
379 const TInput* d_input, size_t n,
380 TInput ebx2_r, TCode quant_radius,
381 TCode* d_codes, TInput* d_outlier_errors,
382 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
383 size_t max_outliers, int grid_size,
384 bool zigzag_codes,
385 cudaStream_t stream
386);
387
388template<typename TInput, typename TCode>
389void launchLorenzoInverseKernel(
390 const TCode* d_codes,
391 const TInput* d_outlier_errors, const uint32_t* d_outlier_indices,
392 uint32_t outlier_n,
393 size_t n,
394 TInput ebx2, TCode quant_radius,
395 TInput* d_output,
396 bool zigzag_codes,
397 cudaStream_t stream, MemoryPool* pool
398);
399
401template<typename TInput, typename TCode>
403 const TInput* d_input, size_t nx, size_t ny,
404 TInput ebx2_r, TCode quant_radius,
405 TCode* d_codes, TInput* d_outlier_errors,
406 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
407 size_t max_outliers,
408 bool zigzag_codes,
409 cudaStream_t stream
410);
411
413template<typename TInput, typename TCode>
415 const TCode* d_codes,
416 const TInput* d_outlier_errors, const uint32_t* d_outlier_indices,
417 uint32_t outlier_n,
418 size_t nx, size_t ny,
419 TInput ebx2, TCode quant_radius,
420 TInput* d_output,
421 bool zigzag_codes,
422 cudaStream_t stream, MemoryPool* pool
423);
424
426template<typename TInput, typename TCode>
428 const TInput* d_input, size_t nx, size_t ny, size_t nz,
429 TInput ebx2_r, TCode quant_radius,
430 TCode* d_codes, TInput* d_outlier_errors,
431 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
432 size_t max_outliers,
433 bool zigzag_codes,
434 cudaStream_t stream
435);
436
438template<typename TInput, typename TCode>
440 const TCode* d_codes,
441 const TInput* d_outlier_errors, const uint32_t* d_outlier_indices,
442 uint32_t outlier_n,
443 size_t nx, size_t ny, size_t nz,
444 TInput ebx2, TCode quant_radius,
445 TInput* d_output,
446 bool zigzag_codes,
447 cudaStream_t stream, MemoryPool* pool
448);
449
450} // namespace fz
LorenzoQuantStage
Definition lorenzo_quant.h:96
std::unordered_map< std::string, size_t > getActualOutputSizesByName() const override
Definition lorenzo_quant.h:161
int ndim() const
Returns the effective spatial dimensionality (1, 2, or 3).
Definition lorenzo_quant.h:205
uint8_t getOutputDataType(size_t output_index) const override
Definition lorenzo_quant.h:220
uint16_t getStageTypeId() const override
Definition lorenzo_quant.h:216
void setErrorBoundMode(ErrorBoundMode mode)
Definition lorenzo_quant.h:189
size_t getActualOutputSize(int index) const override
Definition lorenzo_quant.h:169
void saveState() override
Definition lorenzo_quant.h:178
size_t serializeHeader(size_t output_index, uint8_t *header_buffer, size_t max_size) const override
Definition lorenzo_quant.h:233
size_t getMaxHeaderSize(size_t output_index) const override
Definition lorenzo_quant.h:261
size_t estimateDeviceFootprintBytes(size_t) const override
Definition lorenzo_quant.h:145
std::string getName() const override
Definition lorenzo_quant.h:149
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
uint8_t getInputDataType(size_t) const override
Definition lorenzo_quant.h:229
void deserializeHeader(const uint8_t *header_buffer, size_t size) override
Definition lorenzo_quant.h:266
std::vector< std::string > getOutputNames() const override
Definition lorenzo_quant.h:153
void onFinalize(size_t estimated_inlen, MemoryPool *pool) override
void postStreamSync(cudaStream_t stream) override
void setDims(const std::array< size_t, 3 > &dims) override
Definition lorenzo_quant.h:185
void setInverse(bool inverse)
Definition lorenzo_quant.h:211
std::vector< size_t > estimateOutputSizes(const std::vector< size_t > &input_sizes) const override
MemoryPool
Definition mempool.h:82
Stage
Definition stage.h:30
FZM binary file format definitions — structs, enums, and helpers.
Definition fzm_format.h:25
void launchLorenzoInverseKernel3D(const TCode *d_codes, const TInput *d_outlier_errors, const uint32_t *d_outlier_indices, uint32_t outlier_n, size_t nx, size_t ny, size_t nz, TInput ebx2, TCode quant_radius, TInput *d_output, bool zigzag_codes, cudaStream_t stream, MemoryPool *pool)
3-D inverse Lorenzo kernel launcher.
ErrorBoundMode
Definition lorenzo_quant.h:30
@ NOA
Value-range relative bound (norm-of-absolute).
@ ABS
Absolute error bound.
@ REL
Global-approximate point-wise relative bound.
void launchLorenzoKernel3D(const TInput *d_input, size_t nx, size_t ny, size_t nz, TInput ebx2_r, TCode quant_radius, TCode *d_codes, TInput *d_outlier_errors, uint32_t *d_outlier_indices, uint32_t *d_outlier_count, size_t max_outliers, bool zigzag_codes, cudaStream_t stream)
3-D forward Lorenzo kernel launcher.
constexpr size_t FZM_STAGE_CONFIG_SIZE
Per-stage serialized config slot (bytes)
Definition fzm_format.h:65
void launchLorenzoKernel2D(const TInput *d_input, size_t nx, size_t ny, TInput ebx2_r, TCode quant_radius, TCode *d_codes, TInput *d_outlier_errors, uint32_t *d_outlier_indices, uint32_t *d_outlier_count, size_t max_outliers, bool zigzag_codes, cudaStream_t stream)
2-D forward Lorenzo kernel launcher. nx is the fast (x) dimension.
DataType
Element data type identifiers used in buffer and stage descriptors.
Definition fzm_format.h:109
void launchLorenzoInverseKernel2D(const TCode *d_codes, const TInput *d_outlier_errors, const uint32_t *d_outlier_indices, uint32_t outlier_n, size_t nx, size_t ny, TInput ebx2, TCode quant_radius, TInput *d_output, bool zigzag_codes, cudaStream_t stream, MemoryPool *pool)
2-D inverse Lorenzo kernel launcher.
Base class interface for all compression stages.
LorenzoQuantConfig
Definition lorenzo_quant.h:43
uint8_t zigzag_codes
1 if codes are zigzag-encoded, else 0.
Definition lorenzo_quant.h:57
float value_base
value_range (NOA) or max(|data|) (REL) used in conversion.
Definition lorenzo_quant.h:56
DataType input_type
Original input type (1B).
Definition lorenzo_quant.h:48
uint32_t quant_radius
Quantization radius.
Definition lorenzo_quant.h:45
uint8_t reserved[3]
Must be zero.
Definition lorenzo_quant.h:58
float error_bound
Absolute bound after mode conversion (used by decompressor).
Definition lorenzo_quant.h:44
uint8_t eb_mode
ErrorBoundMode cast to uint8_t.
Definition lorenzo_quant.h:51
uint32_t num_elements
Total element count.
Definition lorenzo_quant.h:46
uint8_t ndim
Spatial dimensionality 1/2/3 (0 treated as 1).
Definition lorenzo_quant.h:50
uint32_t dim_z
Z dimension (1 for 1-D/2-D).
Definition lorenzo_quant.h:54
DataType code_type
Quantization code type (1B).
Definition lorenzo_quant.h:49
uint32_t dim_y
Y dimension (1 for 1-D).
Definition lorenzo_quant.h:53
uint32_t outlier_count
Actual number of outliers.
Definition lorenzo_quant.h:47
float user_eb
Original user-specified error bound value.
Definition lorenzo_quant.h:55
uint32_t dim_x
X (fast) dimension; 0 = infer from num_elements.
Definition lorenzo_quant.h:52
LorenzoQuantStage::Config
Definition lorenzo_quant.h:99
int quant_radius
Quantization radius (2^15 for uint16_t).
Definition lorenzo_quant.h:101
float error_bound
Error bound (interpretation depends on eb_mode).
Definition lorenzo_quant.h:100
bool zigzag_codes
Definition lorenzo_quant.h:113
float outlier_capacity
Definition lorenzo_quant.h:102
float precomputed_value_base
Definition lorenzo_quant.h:110
std::array< size_t, 3 > dims
Definition lorenzo_quant.h:106