9#include <cuda_runtime.h>
/// Compile-time guard: the serialized LorenzoQuantConfig structure must not be
/// larger than FZM_STAGE_CONFIG_SIZE, the per-stage serialized config slot
/// (in bytes). The build fails with the message below if it ever outgrows it.
68static_assert(
sizeof(LorenzoQuantConfig) <=
FZM_STAGE_CONFIG_SIZE,
"LorenzoQuantConfig must fit in FZM_STAGE_CONFIG_SIZE");
89template<
typename TInput =
float,
typename TCode = u
int16_t>
100 std::array<size_t, 3>
dims = {0, 1, 1};
109 Config(TInput eb, TCode radius = 32768,
float outlier_cap = 0.2f,
110 std::array<size_t, 3> d = {0, 1, 1})
/// Constructs the stage from a Config.
/// @param config Stage settings; a default-constructed Config is used when
///               the argument is omitted.
/// Marked explicit, so a Config is never implicitly converted into a stage.
115 explicit LorenzoQuantStage(
const Config& config = Config());
120 const std::vector<void*>& inputs,
121 const std::vector<void*>& outputs,
122 const std::vector<size_t>& sizes
/// Returns the name of this stage, always the literal "LorenzoQuant".
132 std::string
getName()
const override {
return "LorenzoQuant"; }
/// Number of input buffers the stage consumes.
/// @return 4 when is_inverse_ is set, otherwise 1.
133 size_t getNumInputs()
const override {
return is_inverse_ ? 4 : 1; }
/// Number of output buffers the stage produces; mirrors getNumInputs().
/// @return 1 when is_inverse_ is set, otherwise 4.
134 size_t getNumOutputs()
const override {
return is_inverse_ ? 1 : 4; }
137 return {
"codes",
"outlier_errors",
"outlier_indices",
"outlier_count"};
141 const std::vector<size_t>& input_sizes
146 std::unordered_map<std::string, size_t> result;
147 for (
size_t i = 0; i < names.size() && i < actual_output_sizes_.size(); i++) {
148 result[names[i]] = actual_output_sizes_[i];
153 return (index >= 0 && index <
static_cast<int>(actual_output_sizes_.size()))
154 ? actual_output_sizes_[index] : 0;
/// Copies actual_output_sizes_ into saved_output_sizes_ so that
/// restoreState() can bring the recorded sizes back later.
161 void saveState()
override { saved_output_sizes_ = actual_output_sizes_; }
/// Overwrites actual_output_sizes_ with the copy held in saved_output_sizes_
/// (the snapshot written by saveState()).
162 void restoreState()
override { actual_output_sizes_ = saved_output_sizes_; }
/// Stores the error bound in config_.error_bound.
/// @param error_bound New bound; its interpretation depends on the
///                    configured error-bound mode (eb_mode).
165 void setErrorBound(TInput error_bound) { config_.
error_bound = error_bound; }
/// Stores the quantization radius in config_.quant_radius.
/// @param radius New quantization radius, given in the code type TCode.
166 void setQuantRadius(TCode radius) { config_.
quant_radius = radius; }
/// Stores the outlier capacity in config_.outlier_capacity.
/// @param capacity Factor that getMaxOutlierCount() multiplies with the
///                 element count to obtain the maximum outlier count.
167 void setOutlierCapacity(
float capacity) { config_.
outlier_capacity = capacity; }
/// Replaces config_.dims with the given three-element array (override of the
/// base-class hook).
/// @param dims Extents indexed as [0] = x, [1] = y, [2] = z.
168 void setDims(
const std::array<size_t, 3>& dims)
override { config_.
dims = dims; }
/// Stores the error-bound mode in config_.eb_mode.
/// @param mode One of the ErrorBoundMode enumerators (NOA, ABS, REL).
169 void setErrorBoundMode(
ErrorBoundMode mode) { config_.eb_mode = mode; }
/// Stores the zigzag flag in config_.zigzag_codes.
/// @param enable true to request zigzag-encoded codes, false otherwise.
173 void setZigzagCodes(
bool enable) { config_.
zigzag_codes = enable; }
/// Convenience overload that sets config_.dims to {x, y, z}.
/// @param x Extent stored in dims[0].
/// @param y Extent stored in dims[1]; defaults to 1.
/// @param z Extent stored in dims[2]; defaults to 1.
174 void setDims(
size_t x,
size_t y = 1,
size_t z = 1) { config_.
dims = {x, y, z}; }
/// Returns the error bound currently held in config_.error_bound.
176 TInput getErrorBound()
const {
return config_.
error_bound; }
/// Returns the quantization radius held in config_.quant_radius, converted to
/// the code type TCode by the return statement.
177 TCode getQuantRadius()
const {
return config_.
quant_radius; }
/// Returns a copy of config_.dims (three extents, [0] = x, [1] = y, [2] = z).
179 std::array<size_t, 3> getDims()
const {
return config_.
dims; }
/// Returns the error-bound mode held in config_.eb_mode.
180 ErrorBoundMode getErrorBoundMode()
const {
return config_.eb_mode; }
/// Returns the zigzag flag held in config_.zigzag_codes.
182 bool getZigzagCodes()
const {
return config_.
zigzag_codes; }
186 if (config_.
dims[2] > 1)
return 3;
187 if (config_.
dims[1] > 1)
return 2;
/// Returns the is_inverse_ flag, which also selects the input/output counts
/// reported by getNumInputs() and getNumOutputs().
192 bool isInverse()
const {
return is_inverse_; }
197 return static_cast<uint16_t
>(StageType::LORENZO_QUANT);
201 switch (output_index) {
202 case 0:
return static_cast<uint8_t
>(getCodeDataType());
204 case 2:
return static_cast<uint8_t
>(DataType::UINT32);
205 case 3:
return static_cast<uint8_t
>(DataType::UINT32);
206 default:
return static_cast<uint8_t
>(DataType::UINT8);
214 size_t serializeHeader(
size_t output_index, uint8_t* header_buffer,
size_t max_size)
const override {
218 throw std::runtime_error(
"Insufficient buffer for Lorenzo config");
222 config.
error_bound =
static_cast<float>(computed_abs_eb_);
224 config.
num_elements =
static_cast<uint32_t
>(num_elements_);
228 config.
ndim =
static_cast<uint8_t
>(
ndim());
229 config.
eb_mode =
static_cast<uint8_t
>(config_.eb_mode);
230 config.
dim_x =
static_cast<uint32_t
>(config_.
dims[0]);
231 config.
dim_y =
static_cast<uint32_t
>(config_.
dims[1]);
232 config.
dim_z =
static_cast<uint32_t
>(config_.
dims[2]);
249 constexpr size_t kLegacySize = 32;
250 if (size < kLegacySize) {
251 throw std::runtime_error(
"Invalid Lorenzo config size");
259 computed_abs_eb_ =
static_cast<TInput
>(config.
error_bound);
264 constexpr size_t kV1Size = 40;
265 if (size >= kV1Size) {
272 computed_value_base_ = 0.0f;
282 int eff_ndim = (config.
ndim == 0) ? 1 :
static_cast<int>(config.
ndim);
284 if (config.
dim_x > 0) {
287 size_t yz = std::max<size_t>(1, config.
dim_y) * std::max<size_t>(1, config.
dim_z);
// Per-output sizes reported by getActualOutputSize() and
// getActualOutputSizesByName().
306 std::vector<size_t> actual_output_sizes_;
// Snapshot of actual_output_sizes_ written by saveState() and read back by
// restoreState().
307 std::vector<size_t> saved_output_sizes_;
// Element count; serializeHeader() writes it (narrowed to uint32_t) into the
// header's num_elements field.
308 size_t num_elements_ = 0;
// NOTE(review): presumably the outlier count observed for the last run —
// its writer is not visible here; confirm against execute()/postStreamSync().
309 uint32_t actual_outlier_count_ = 0;
// Selects inverse operation; flips the counts returned by getNumInputs() and
// getNumOutputs().
310 bool is_inverse_ =
false;
// Error bound written to the header's error_bound field by serializeHeader()
// and reloaded from it by deserializeHeader().
314 TInput computed_abs_eb_ = 0;
// NOTE(review): likely the value_base recorded in the header (value range or
// max |data|, depending on mode) — confirm where it is assigned.
317 float computed_value_base_ = 0.0f;
// NOTE(review): the d_ prefix suggests a device-side pointer to the outlier
// counter; ownership and lifetime are not visible here — confirm.
320 const void* d_outlier_count_ptr_ =
nullptr;
323 if (std::is_same<TInput, float>::value)
return DataType::FLOAT32;
324 if (std::is_same<TInput, double>::value)
return DataType::FLOAT64;
325 return DataType::FLOAT32;
329 if (std::is_same<TCode, uint8_t>::value)
return DataType::UINT8;
330 if (std::is_same<TCode, uint16_t>::value)
return DataType::UINT16;
331 if (std::is_same<TCode, uint32_t>::value)
return DataType::UINT32;
332 return DataType::UINT16;
/// Upper bound on the number of outliers for a given element count.
/// @param num_elements Number of elements being processed.
/// @return ceil(num_elements * config_.outlier_capacity), converted to size_t.
// NOTE(review): outlier_capacity is a float, so the product below is evaluated
// in single precision; for element counts above 2^24 the operand is rounded
// before std::ceil is applied, which can shift the result by a few elements.
335 size_t getMaxOutlierCount(
size_t num_elements)
const {
336 return static_cast<size_t>(std::ceil(num_elements * config_.outlier_capacity));
// Explicit instantiation declarations for the four (TInput, TCode) pairs below.
// They stop translation units that include this header from implicitly
// instantiating these specializations; the matching explicit instantiation
// definitions are expected in a separate source file — TODO confirm location.
340extern template class LorenzoQuantStage<float, uint16_t>;
341extern template class LorenzoQuantStage<float, uint8_t>;
342extern template class LorenzoQuantStage<double, uint16_t>;
343extern template class LorenzoQuantStage<double, uint32_t>;
347template<
typename TInput,
typename TCode>
348void launchLorenzoKernel(
349 const TInput* d_input,
size_t n,
350 TInput ebx2_r, TCode quant_radius,
351 TCode* d_codes, TInput* d_outlier_errors,
352 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
353 size_t max_outliers,
int grid_size,
358template<
typename TInput,
typename TCode>
359void launchLorenzoInverseKernel(
360 const TCode* d_codes,
361 const TInput* d_outlier_errors,
const uint32_t* d_outlier_indices,
362 const uint32_t* d_outlier_count,
363 size_t n,
size_t max_outliers,
364 TInput ebx2, TCode quant_radius,
367 cudaStream_t stream, MemoryPool* pool
371template<
typename TInput,
typename TCode>
373 const TInput* d_input,
size_t nx,
size_t ny,
374 TInput ebx2_r, TCode quant_radius,
375 TCode* d_codes, TInput* d_outlier_errors,
376 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
383template<
typename TInput,
typename TCode>
385 const TCode* d_codes,
386 const TInput* d_outlier_errors,
const uint32_t* d_outlier_indices,
387 const uint32_t* d_outlier_count,
388 size_t nx,
size_t ny,
size_t max_outliers,
389 TInput ebx2, TCode quant_radius,
396template<
typename TInput,
typename TCode>
398 const TInput* d_input,
size_t nx,
size_t ny,
size_t nz,
399 TInput ebx2_r, TCode quant_radius,
400 TCode* d_codes, TInput* d_outlier_errors,
401 uint32_t* d_outlier_indices, uint32_t* d_outlier_count,
408template<
typename TInput,
typename TCode>
410 const TCode* d_codes,
411 const TInput* d_outlier_errors,
const uint32_t* d_outlier_indices,
412 const uint32_t* d_outlier_count,
413 size_t nx,
size_t ny,
size_t nz,
size_t max_outliers,
414 TInput ebx2, TCode quant_radius,
Definition lorenzo_quant.h:90
std::unordered_map< std::string, size_t > getActualOutputSizesByName() const override
Definition lorenzo_quant.h:144
int ndim() const
Returns the effective spatial dimensionality (1, 2, or 3).
Definition lorenzo_quant.h:185
uint8_t getOutputDataType(size_t output_index) const override
Definition lorenzo_quant.h:200
uint16_t getStageTypeId() const override
Definition lorenzo_quant.h:196
size_t getActualOutputSize(int index) const override
Definition lorenzo_quant.h:152
void saveState() override
Definition lorenzo_quant.h:161
size_t serializeHeader(size_t output_index, uint8_t *header_buffer, size_t max_size) const override
Definition lorenzo_quant.h:214
size_t getMaxHeaderSize(size_t output_index) const override
Definition lorenzo_quant.h:242
std::string getName() const override
Definition lorenzo_quant.h:132
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
uint8_t getInputDataType(size_t) const override
Definition lorenzo_quant.h:210
void deserializeHeader(const uint8_t *header_buffer, size_t size) override
Definition lorenzo_quant.h:247
std::vector< std::string > getOutputNames() const override
Definition lorenzo_quant.h:136
void postStreamSync(cudaStream_t stream) override
void setDims(const std::array< size_t, 3 > &dims) override
Definition lorenzo_quant.h:168
void setInverse(bool inverse)
Definition lorenzo_quant.h:191
std::vector< size_t > estimateOutputSizes(const std::vector< size_t > &input_sizes) const override
Definition fzm_format.h:25
void launchLorenzoInverseKernel3D(const TCode *d_codes, const TInput *d_outlier_errors, const uint32_t *d_outlier_indices, const uint32_t *d_outlier_count, size_t nx, size_t ny, size_t nz, size_t max_outliers, TInput ebx2, TCode quant_radius, TInput *d_output, bool zigzag_codes, cudaStream_t stream, MemoryPool *pool)
3-D inverse Lorenzo kernel launcher.
ErrorBoundMode
Definition lorenzo_quant.h:30
@ NOA
Value-range relative bound (norm-of-absolute).
@ ABS
Absolute error bound.
@ REL
Global-approximate point-wise relative bound.
void launchLorenzoKernel3D(const TInput *d_input, size_t nx, size_t ny, size_t nz, TInput ebx2_r, TCode quant_radius, TCode *d_codes, TInput *d_outlier_errors, uint32_t *d_outlier_indices, uint32_t *d_outlier_count, size_t max_outliers, bool zigzag_codes, cudaStream_t stream)
3-D forward Lorenzo kernel launcher.
void launchLorenzoInverseKernel2D(const TCode *d_codes, const TInput *d_outlier_errors, const uint32_t *d_outlier_indices, const uint32_t *d_outlier_count, size_t nx, size_t ny, size_t max_outliers, TInput ebx2, TCode quant_radius, TInput *d_output, bool zigzag_codes, cudaStream_t stream, MemoryPool *pool)
2-D inverse Lorenzo kernel launcher.
constexpr size_t FZM_STAGE_CONFIG_SIZE
Per-stage serialized config slot (bytes)
Definition fzm_format.h:65
void launchLorenzoKernel2D(const TInput *d_input, size_t nx, size_t ny, TInput ebx2_r, TCode quant_radius, TCode *d_codes, TInput *d_outlier_errors, uint32_t *d_outlier_indices, uint32_t *d_outlier_count, size_t max_outliers, bool zigzag_codes, cudaStream_t stream)
2-D forward Lorenzo kernel launcher. nx is the fast (x) dimension.
DataType
Element data type identifiers used in buffer and stage descriptors.
Definition fzm_format.h:104
Base class interface for all compression stages.
Definition lorenzo_quant.h:43
uint8_t zigzag_codes
1 if codes are zigzag-encoded, else 0.
Definition lorenzo_quant.h:57
float value_base
value_range (NOA) or max(|data|) (REL) used in conversion.
Definition lorenzo_quant.h:56
DataType input_type
Original input type (1B).
Definition lorenzo_quant.h:48
uint32_t quant_radius
Quantization radius.
Definition lorenzo_quant.h:45
uint8_t reserved[3]
Must be zero.
Definition lorenzo_quant.h:58
float error_bound
Absolute bound after mode conversion (used by decompressor).
Definition lorenzo_quant.h:44
uint8_t eb_mode
ErrorBoundMode cast to uint8_t.
Definition lorenzo_quant.h:51
uint32_t num_elements
Total element count.
Definition lorenzo_quant.h:46
uint8_t ndim
Spatial dimensionality 1/2/3 (0 treated as 1).
Definition lorenzo_quant.h:50
uint32_t dim_z
Z dimension (1 for 1-D/2-D).
Definition lorenzo_quant.h:54
DataType code_type
Quantization code type (1B).
Definition lorenzo_quant.h:49
uint32_t dim_y
Y dimension (1 for 1-D).
Definition lorenzo_quant.h:53
uint32_t outlier_count
Actual number of outliers.
Definition lorenzo_quant.h:47
float user_eb
Original user-specified error bound value.
Definition lorenzo_quant.h:55
uint32_t dim_x
X (fast) dimension; 0 = infer from num_elements.
Definition lorenzo_quant.h:52
Definition lorenzo_quant.h:93
int quant_radius
Quantization radius (2^15 for uint16_t).
Definition lorenzo_quant.h:95
float error_bound
Error bound (interpretation depends on eb_mode).
Definition lorenzo_quant.h:94
bool zigzag_codes
Definition lorenzo_quant.h:107
float outlier_capacity
Definition lorenzo_quant.h:96
float precomputed_value_base
Definition lorenzo_quant.h:104
std::array< size_t, 3 > dims
Definition lorenzo_quant.h:100