27#include <cuda_runtime.h>
32#include <unordered_map>
57 , actual_output_size_(0)
58 , cached_orig_bytes_(0)
63 , d_sizes_dev_(
nullptr)
64 , d_clean_dev_(
nullptr)
65 , d_dst_off_dev_(
nullptr)
66 , d_inv_in_off_(
nullptr)
67 , d_inv_comp_sz_(
nullptr)
68 , d_inv_out_off_(
nullptr)
69 , d_inv_orig_sz_(
nullptr)
70 , scratch_capacity_(0)
77 void setInverse(
bool inv)
override { is_inverse_ = inv; }
78 bool isInverse()
const override {
return is_inverse_; }
95 void setChunkSize(
size_t bytes) { chunk_size_ =
static_cast<uint32_t
>(bytes); }
96 void setLevels(
int n) { levels_ =
static_cast<uint8_t
>(n); }
98 size_t getChunkSize()
const {
return chunk_size_; }
100 int getLevels()
const {
return static_cast<int>(levels_); }
101 uint32_t getCachedOrigBytes()
const {
return cached_orig_bytes_; }
107 const std::vector<void*>& inputs,
108 const std::vector<void*>& outputs,
109 const std::vector<size_t>& sizes
114 std::string
getName()
const override {
return "RZE"; }
115 size_t getNumInputs()
const override {
return 1; }
116 size_t getNumOutputs()
const override {
return 1; }
119 const std::vector<size_t>& input_sizes
125 if (cached_orig_bytes_ > 0)
126 return {
static_cast<size_t>(cached_orig_bytes_)};
130 return {input_sizes.empty() ? 0 : input_sizes[0]};
134 const size_t n_bytes = input_sizes.empty() ? 0 : input_sizes[0];
135 const size_t n_chunks = (n_bytes + chunk_size_ - 1) / chunk_size_;
136 const size_t hdr = 4 + 4 + 4 * n_chunks;
137 return {n_bytes + hdr};
140 std::unordered_map<std::string, size_t>
156 const std::vector<size_t>& input_sizes
158 if (is_inverse_ || input_sizes.empty())
return 0;
159 const size_t in_bytes = input_sizes[0];
160 const size_t n_chunks = (in_bytes + chunk_size_ - 1) / chunk_size_;
161 return n_chunks * (
static_cast<size_t>(chunk_size_) + 3 *
sizeof(uint32_t));
165 return static_cast<uint16_t
>(StageType::RZE);
169 return static_cast<uint8_t
>(DataType::UINT8);
180 size_t output_index, uint8_t* buf,
size_t max_size
183 if (max_size < 9)
return 0;
184 std::memcpy(buf, &chunk_size_,
sizeof(uint32_t));
186 std::memcpy(buf + 5, &cached_orig_bytes_,
sizeof(uint32_t));
191 if (size >= 4) std::memcpy(&chunk_size_, buf,
sizeof(uint32_t));
192 if (size >= 5) levels_ = buf[4];
193 if (size >= 9) std::memcpy(&cached_orig_bytes_, buf + 5,
sizeof(uint32_t));
199 saved_chunk_size_ = chunk_size_;
200 saved_levels_ = levels_;
201 saved_cached_orig_bytes_ = cached_orig_bytes_;
204 void restoreState()
override {
205 chunk_size_ = saved_chunk_size_;
206 levels_ = saved_levels_;
207 cached_orig_bytes_ = saved_cached_orig_bytes_;
212 uint32_t chunk_size_;
213 uint32_t saved_chunk_size_ = 0;
215 uint8_t saved_levels_ = 0;
216 size_t actual_output_size_;
221 uint32_t cached_orig_bytes_ = 0;
222 uint32_t saved_cached_orig_bytes_ = 0;
234 uint32_t* d_sizes_dev_;
235 uint32_t* d_clean_dev_;
236 uint32_t* d_dst_off_dev_;
237 uint32_t* d_inv_in_off_;
238 uint32_t* d_inv_comp_sz_;
239 uint32_t* d_inv_out_off_;
240 uint32_t* d_inv_orig_sz_;
241 mutable bool tail_readback_pending_ =
false;
242 mutable cudaStream_t tail_readback_stream_ =
nullptr;
243 mutable uint32_t tail_last_index_ = 0;
246 mutable uint8_t* tail_output_ptr_ =
nullptr;
247 size_t scratch_capacity_;
248 size_t inv_capacity_;
249 MemoryPool* scratch_pool_owner_ =
nullptr;
250 MemoryPool* inv_pool_owner_ =
nullptr;
251 bool scratch_from_pool_ =
false;
252 bool inv_from_pool_ =
false;
Definition rze_stage.h:51
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override
uint8_t getOutputDataType(size_t) const override
Definition rze_stage.h:168
void deserializeHeader(const uint8_t *buf, size_t size) override
Definition rze_stage.h:190
bool isGraphCompatible() const override
Definition rze_stage.h:93
void setInverse(bool inv) override
Definition rze_stage.h:77
size_t getMaxHeaderSize(size_t) const override
Definition rze_stage.h:196
size_t serializeHeader(size_t output_index, uint8_t *buf, size_t max_size) const override
Definition rze_stage.h:179
uint16_t getStageTypeId() const override
Definition rze_stage.h:164
std::unordered_map< std::string, size_t > getActualOutputSizesByName() const override
size_t getRequiredInputAlignment() const override
Definition rze_stage.h:99
void saveState() override
Definition rze_stage.h:198
std::string getName() const override
Definition rze_stage.h:114
std::vector< size_t > estimateOutputSizes(const std::vector< size_t > &input_sizes) const override
Definition rze_stage.h:118
size_t getActualOutputSize(int index) const override
size_t estimateScratchBytes(const std::vector< size_t > &input_sizes) const override
Definition rze_stage.h:155
void postStreamSync(cudaStream_t stream) override
Definition fzm_format.h:25
Base class interface for all compression stages.