FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
adaptive_bitpack_stage.h
Go to the documentation of this file.
1#pragma once
2
11#include "stage/stage.h"
12#include "fzm_format.h"
13#include <cuda_runtime.h>
14#include <cstdint>
15#include <cstring>
16#include <stdexcept>
17#include <string>
18#include <type_traits>
19#include <unordered_map>
20#include <vector>
21
22namespace fz {
23
39static_assert(sizeof(AdaptiveBitpackConfig) <= FZM_STAGE_CONFIG_SIZE,
40 "AdaptiveBitpackConfig must fit in FZM_STAGE_CONFIG_SIZE");
41
74template<typename T>
76 static_assert(std::is_same_v<T, int16_t> || std::is_same_v<T, int32_t>,
77 "AdaptiveBitpackStage: T must be int16_t or int32_t.");
78public:
79 AdaptiveBitpackStage() = default;
80 ~AdaptiveBitpackStage() override;
81
82 // ── Stage control ──────────────────────────────────────────────────────
83 void setInverse(bool inv) override { is_inverse_ = inv; }
84 bool isInverse() const override { return is_inverse_; }
85 // Forward (compress) is graph-capturable: execute() enqueues only
86 // device-side work and the data-dependent compressed-size readback is
87 // deferred to postStreamSync() (run after the launch, outside any capture
88 // window). The inverse path keeps a per-execute layout and is left out of
89 // graph capture, mirroring RZEStage.
90 bool isGraphCompatible() const override { return !is_inverse_; }
91
94 void setBlockSize(uint32_t n) {
95 if (n == 0 || n > 1024)
96 throw std::invalid_argument(
97 "AdaptiveBitpackStage::setBlockSize: n must be in [1, 1024], got "
98 + std::to_string(n));
99 block_size_ = n;
100 }
101 uint32_t getBlockSize() const { return block_size_; }
102
107 void setOutlierSelection(bool enable) { outlier_selection_ = enable; }
108 bool getOutlierSelection() const { return outlier_selection_; }
109
110 // ── Execution ──────────────────────────────────────────────────────────
112 cudaStream_t stream,
113 MemoryPool* pool,
114 const std::vector<void*>& inputs,
115 const std::vector<void*>& outputs,
116 const std::vector<size_t>& sizes
117 ) override;
118
123 void postStreamSync(cudaStream_t stream) override;
124
125 // ── Metadata ───────────────────────────────────────────────────────────
126 std::string getName() const override { return "AdaptiveBitpack"; }
127 size_t getNumInputs() const override { return 1; }
128 size_t getNumOutputs() const override { return 1; }
129
130 std::vector<size_t> estimateOutputSizes(
131 const std::vector<size_t>& input_sizes
132 ) const override;
133
139 const std::vector<size_t>& input_sizes
140 ) const override;
141
142 std::unordered_map<std::string, size_t>
143 getActualOutputSizesByName() const override {
144 return {{"output", actual_output_size_}};
145 }
146 size_t getActualOutputSize(int index) const override {
147 return (index == 0) ? actual_output_size_ : 0;
148 }
149
150 uint16_t getStageTypeId() const override {
151 return static_cast<uint16_t>(StageType::ADAPTIVE_BITPACK);
152 }
153
154 // Forward: signed codes -> uint8 archive. Inverse: archive -> signed codes.
155 uint8_t getOutputDataType(size_t) const override {
156 return static_cast<uint8_t>(is_inverse_ ? getElementDataType()
157 : DataType::UINT8);
158 }
159 uint8_t getInputDataType(size_t) const override {
160 return static_cast<uint8_t>(is_inverse_ ? DataType::UINT8
161 : getElementDataType());
162 }
163
164 // ── Serialization ──────────────────────────────────────────────────────
165 size_t serializeHeader(size_t, uint8_t* buf, size_t max_size) const override {
166 if (max_size < sizeof(AdaptiveBitpackConfig)) return 0;
168 cfg.data_type = getElementDataType();
169 cfg.outlier_selection = outlier_selection_ ? uint8_t{1} : uint8_t{0};
170 cfg.block_size = block_size_;
171 cfg.num_elements = static_cast<uint64_t>(num_elements_);
172 std::memcpy(buf, &cfg, sizeof(cfg));
173 return sizeof(cfg);
174 }
175 void deserializeHeader(const uint8_t* buf, size_t size) override {
176 if (size < sizeof(AdaptiveBitpackConfig))
177 throw std::runtime_error("AdaptiveBitpackStage: header too small");
179 std::memcpy(&cfg, buf, sizeof(cfg));
180 block_size_ = cfg.block_size ? cfg.block_size : 32u;
181 num_elements_ = static_cast<size_t>(cfg.num_elements);
182 outlier_selection_ = (cfg.outlier_selection != 0);
183 }
184 size_t getMaxHeaderSize(size_t) const override {
185 return sizeof(AdaptiveBitpackConfig);
186 }
187
188 void saveState() override {
189 saved_block_size_ = block_size_;
190 saved_num_elements_ = num_elements_;
191 saved_actual_size_ = actual_output_size_;
192 saved_outlier_select_ = outlier_selection_;
193 }
194 void restoreState() override {
195 block_size_ = saved_block_size_;
196 num_elements_ = saved_num_elements_;
197 actual_output_size_ = saved_actual_size_;
198 outlier_selection_ = saved_outlier_select_;
199 }
200
201 size_t getNumElements() const { return num_elements_; }
202
203private:
204 bool is_inverse_ = false;
205 uint32_t block_size_ = 32;
206 bool outlier_selection_ = false;
207 size_t num_elements_ = 0;
208 size_t actual_output_size_ = 0;
209
210 // Forward-path persistent scratch (kept alive across execute() so the
211 // compressed-size readback can be deferred to postStreamSync(), and so no
212 // allocation happens inside a captured graph replay). Grown lazily when a
213 // larger input is seen; freed in the destructor. Pool-managed (persistent).
214 uint32_t* d_cost_ = nullptr;
215 uint32_t* d_offset_ = nullptr;
216 size_t scratch_blocks_ = 0;
217 MemoryPool* scratch_pool_ = nullptr;
218 size_t fwd_num_blocks_ = 0;
219 size_t fwd_meta_region_ = 0;
220
221 uint32_t saved_block_size_ = 32;
222 bool saved_outlier_select_ = false;
223 size_t saved_num_elements_ = 0;
224 size_t saved_actual_size_ = 0;
225
226 static DataType getElementDataType() {
227 if (std::is_same<T, int16_t>::value) return DataType::INT16;
228 return DataType::INT32;
229 }
230};
231
232extern template class AdaptiveBitpackStage<int16_t>;
233extern template class AdaptiveBitpackStage<int32_t>;
234
235} // namespace fz
Definition adaptive_bitpack_stage.h:75
void setBlockSize(uint32_t n)
Definition adaptive_bitpack_stage.h:94
void saveState() override
Definition adaptive_bitpack_stage.h:188
size_t serializeHeader(size_t, uint8_t *buf, size_t max_size) const override
Definition adaptive_bitpack_stage.h:165
uint8_t getInputDataType(size_t) const override
Definition adaptive_bitpack_stage.h:159
void deserializeHeader(const uint8_t *buf, size_t size) override
Definition adaptive_bitpack_stage.h:175
std::string getName() const override
Definition adaptive_bitpack_stage.h:126
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
uint16_t getStageTypeId() const override
Definition adaptive_bitpack_stage.h:150
size_t getMaxHeaderSize(size_t) const override
Definition adaptive_bitpack_stage.h:184
uint8_t getOutputDataType(size_t) const override
Definition adaptive_bitpack_stage.h:155
std::unordered_map< std::string, size_t > getActualOutputSizesByName() const override
Definition adaptive_bitpack_stage.h:143
size_t getActualOutputSize(int index) const override
Definition adaptive_bitpack_stage.h:146
size_t estimateScratchBytes(const std::vector< size_t > &input_sizes) const override
bool isGraphCompatible() const override
Definition adaptive_bitpack_stage.h:90
void setInverse(bool inv) override
Definition adaptive_bitpack_stage.h:83
std::vector< size_t > estimateOutputSizes(const std::vector< size_t > &input_sizes) const override
void postStreamSync(cudaStream_t stream) override
void setOutlierSelection(bool enable)
Definition adaptive_bitpack_stage.h:107
Definition mempool.h:82
Definition stage.h:30
FZM binary file format definitions — structs, enums, and helpers.
Definition fzm_format.h:25
constexpr size_t FZM_STAGE_CONFIG_SIZE
Per-stage serialized config slot (bytes)
Definition fzm_format.h:65
@ ADAPTIVE_BITPACK
Per-block adaptive fixed-rate bit-plane coder (cuSZp plain mode)
DataType
Element data type identifiers used in buffer and stage descriptors.
Definition fzm_format.h:109
Base class interface for all compression stages.
Definition adaptive_bitpack_stage.h:28
uint8_t _pad[2]
Must be zero.
Definition adaptive_bitpack_stage.h:31
uint8_t outlier_selection
1 = cuSZp2 per-block plain/outlier selection.
Definition adaptive_bitpack_stage.h:30
uint32_t block_size
Elements per logical block (reset period).
Definition adaptive_bitpack_stage.h:32
DataType data_type
Signed element type (1B): INT16 / INT32.
Definition adaptive_bitpack_stage.h:29
uint64_t num_elements
Original element count (sizes the inverse output).
Definition adaptive_bitpack_stage.h:33