FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
lorenzo_stage.h
Go to the documentation of this file.
1#pragma once
2
8#include "stage/stage.h"
9#include "fzm_format.h"
10#include <cuda_runtime.h>
11#include <algorithm>
12#include <array>
13#include <cstdint>
14#include <cstring>
15#include <stdexcept>
16#include <string>
17#include <type_traits>
18#include <unordered_map>
19#include <vector>
20
21namespace fz {
22
30 uint8_t ndim;
31 uint8_t reserved[2];
32 uint32_t dim_x;
33 uint32_t dim_y;
34 uint32_t dim_z;
35 uint32_t block_size;
36
38 : data_type(DataType::INT32), ndim(1), reserved{0, 0},
39 dim_x(0), dim_y(1), dim_z(1), block_size(0) {}
40};
// Compile-time guard: the serialized LorenzoConfig record must fit inside the
// per-stage config slot of FZM_STAGE_CONFIG_SIZE bytes.
41static_assert(sizeof(LorenzoConfig) <= FZM_STAGE_CONFIG_SIZE,
42 "LorenzoConfig must fit in FZM_STAGE_CONFIG_SIZE");
43
52template<typename T>
53class LorenzoStage : public Stage {
54 static_assert(std::is_integral<T>::value && std::is_signed<T>::value,
55 "LorenzoStage requires a signed integer type");
56public:
57 LorenzoStage() = default;
58
59 void setInverse(bool inv) override { is_inverse_ = inv; }
60 bool isInverse() const override { return is_inverse_; }
61
62 void setDims(const std::array<size_t, 3>& dims) override { dims_ = dims; }
63 void setDims(size_t x, size_t y = 1, size_t z = 1) { dims_ = {x, y, z}; }
64 std::array<size_t, 3> getDims() const { return dims_; }
65
78 void setBlockSize(uint32_t n) {
79 if (n > 1024)
80 throw std::invalid_argument(
81 "LorenzoStage::setBlockSize: n must be in [0, 1024], got "
82 + std::to_string(n));
83 block_size_ = n;
84 }
85 uint32_t getBlockSize() const { return block_size_; }
86
87 int ndim() const {
88 if (dims_[2] > 1) return 3;
89 if (dims_[1] > 1) return 2;
90 return 1;
91 }
92
93 void execute(
94 cudaStream_t stream,
95 MemoryPool* pool,
96 const std::vector<void*>& inputs,
97 const std::vector<void*>& outputs,
98 const std::vector<size_t>& sizes
99 ) override;
100
101 std::string getName() const override { return "Lorenzo"; }
102 size_t getNumInputs() const override { return 1; }
103 size_t getNumOutputs() const override { return 1; }
104
105 std::vector<size_t> estimateOutputSizes(
106 const std::vector<size_t>& input_sizes
107 ) const override {
108 return {input_sizes.empty() ? 0 : input_sizes[0]};
109 }
110
111 std::unordered_map<std::string, size_t>
112 getActualOutputSizesByName() const override {
113 return {{"output", actual_output_size_}};
114 }
115
116 size_t getActualOutputSize(int index) const override {
117 return (index == 0) ? actual_output_size_ : 0;
118 }
119
120 uint16_t getStageTypeId() const override {
121 return static_cast<uint16_t>(StageType::LORENZO);
122 }
123
124 uint8_t getOutputDataType(size_t /*output_index*/) const override {
125 return static_cast<uint8_t>(getElementDataType());
126 }
127
128 uint8_t getInputDataType(size_t /*input_index*/) const override {
129 return static_cast<uint8_t>(getElementDataType());
130 }
131
132 size_t serializeHeader(size_t /*output_index*/, uint8_t* buf, size_t max_size) const override {
133 if (max_size < sizeof(LorenzoConfig))
134 throw std::runtime_error("LorenzoStage: header buffer too small");
135 LorenzoConfig cfg;
136 cfg.data_type = getElementDataType();
137 cfg.ndim = static_cast<uint8_t>(ndim());
138 cfg.dim_x = static_cast<uint32_t>(dims_[0]);
139 cfg.dim_y = static_cast<uint32_t>(dims_[1]);
140 cfg.dim_z = static_cast<uint32_t>(dims_[2]);
141 cfg.block_size = block_size_;
142 std::memcpy(buf, &cfg, sizeof(LorenzoConfig));
143 return sizeof(LorenzoConfig);
144 }
145
146 void deserializeHeader(const uint8_t* buf, size_t size) override {
147 // Accept legacy 16-byte headers (no block_size field).
148 constexpr size_t kMinSize = 16;
149 if (size < kMinSize)
150 throw std::runtime_error("LorenzoStage: header too small");
151 LorenzoConfig cfg; // default-constructed: block_size = 0
152 std::memcpy(&cfg, buf, std::min(size, sizeof(LorenzoConfig)));
153 int eff_ndim = (cfg.ndim == 0) ? 1 : static_cast<int>(cfg.ndim);
154 dims_[0] = cfg.dim_x;
155 dims_[1] = (eff_ndim >= 2) ? cfg.dim_y : 1;
156 dims_[2] = (eff_ndim >= 3) ? cfg.dim_z : 1;
157 block_size_ = (size >= sizeof(LorenzoConfig)) ? cfg.block_size : 0;
158 }
159
160 size_t getMaxHeaderSize(size_t /*output_index*/) const override {
161 return sizeof(LorenzoConfig);
162 }
163
164private:
165 bool is_inverse_ = false;
166 size_t actual_output_size_ = 0;
167 std::array<size_t, 3> dims_ = {0, 1, 1};
168 uint32_t block_size_ = 0;
169
170 static DataType getElementDataType() {
171 if (std::is_same<T, int8_t>::value) return DataType::INT8;
172 if (std::is_same<T, int16_t>::value) return DataType::INT16;
173 if (std::is_same<T, int32_t>::value) return DataType::INT32;
174 if (std::is_same<T, int64_t>::value) return DataType::INT64;
175 return DataType::INT32;
176 }
177};
178
179extern template class LorenzoStage<int8_t>;
180extern template class LorenzoStage<int16_t>;
181extern template class LorenzoStage<int32_t>;
182extern template class LorenzoStage<int64_t>;
183
184// Kernel launcher declarations — defined in lorenzo_stage.cu.
185
186template<typename T>
187void launchLorenzoDeltaKernel1D(
188 const T* d_input, T* d_output, size_t n, cudaStream_t stream,
189 unsigned block_threads = 256);
190
191template<typename T>
192void launchLorenzoPrefixSumKernel1D(
193 const T* d_input, T* d_output, size_t n, cudaStream_t stream,
194 unsigned block_threads = 256);
195
196template<typename T>
197void launchLorenzoDeltaKernel2D(
198 const T* d_input, T* d_output, size_t nx, size_t ny, cudaStream_t stream);
199
200template<typename T>
201void launchLorenzoPrefixSumKernel2D(
202 const T* d_input, T* d_output, size_t nx, size_t ny, cudaStream_t stream);
203
204template<typename T>
205void launchLorenzoDeltaKernel3D(
206 const T* d_input, T* d_output, size_t nx, size_t ny, size_t nz, cudaStream_t stream);
207
208template<typename T>
209void launchLorenzoPrefixSumKernel3D(
210 const T* d_input, T* d_output, size_t nx, size_t ny, size_t nz, cudaStream_t stream);
211
212} // namespace fz
Definition lorenzo_stage.h:53
void setBlockSize(uint32_t n)
Definition lorenzo_stage.h:78
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
std::vector< size_t > estimateOutputSizes(const std::vector< size_t > &input_sizes) const override
Definition lorenzo_stage.h:105
size_t getMaxHeaderSize(size_t) const override
Definition lorenzo_stage.h:160
std::string getName() const override
Definition lorenzo_stage.h:101
size_t getActualOutputSize(int index) const override
Definition lorenzo_stage.h:116
uint16_t getStageTypeId() const override
Definition lorenzo_stage.h:120
void setInverse(bool inv) override
Definition lorenzo_stage.h:59
size_t serializeHeader(size_t, uint8_t *buf, size_t max_size) const override
Definition lorenzo_stage.h:132
std::unordered_map< std::string, size_t > getActualOutputSizesByName() const override
Definition lorenzo_stage.h:112
uint8_t getInputDataType(size_t) const override
Definition lorenzo_stage.h:128
uint8_t getOutputDataType(size_t) const override
Definition lorenzo_stage.h:124
void deserializeHeader(const uint8_t *buf, size_t size) override
Definition lorenzo_stage.h:146
void setDims(const std::array< size_t, 3 > &dims) override
Definition lorenzo_stage.h:62
Definition mempool.h:82
Definition stage.h:30
FZM binary file format definitions — structs, enums, and helpers.
Definition fzm_format.h:25
constexpr size_t FZM_STAGE_CONFIG_SIZE
Per-stage serialized config slot (bytes)
Definition fzm_format.h:65
DataType
Element data type identifiers used in buffer and stage descriptors.
Definition fzm_format.h:109
Base class interface for all compression stages.
Definition lorenzo_stage.h:28
uint32_t dim_z
Z dimension (1 for 1-D/2-D).
Definition lorenzo_stage.h:34
DataType data_type
Signed integer element type (1B).
Definition lorenzo_stage.h:29
uint32_t dim_y
Y dimension (1 for 1-D).
Definition lorenzo_stage.h:33
uint8_t ndim
Spatial dimensionality 1/2/3 (0 treated as 1).
Definition lorenzo_stage.h:30
uint32_t dim_x
X (fast) dimension.
Definition lorenzo_stage.h:32
uint8_t reserved[2]
Must be zero.
Definition lorenzo_stage.h:31
uint32_t block_size
1-D block-local reset period; 0 = default N-D behavior.
Definition lorenzo_stage.h:35