36 void setInverse(
bool inverse)
override { is_inverse_ = inverse; }
37 bool isInverse()
const override {
return is_inverse_; }
42 const std::vector<void*>& inputs,
43 const std::vector<void*>& outputs,
44 const std::vector<size_t>& sizes
54 std::string
getName()
const override {
return "RLE"; }
55 size_t getNumInputs()
const override {
return 1; }
56 size_t getNumOutputs()
const override {
return 1; }
69 const std::vector<size_t>& input_sizes
71 if (is_inverse_ || input_sizes.empty())
return 0;
72 const size_t n = input_sizes[0] /
sizeof(T);
75 return n * (1 + 4 + 4 +
sizeof(T) + 4);
79 const std::vector<size_t>& input_sizes
85 if (cached_num_elements_ > 0)
86 return {
static_cast<size_t>(cached_num_elements_) *
sizeof(T)};
87 return {input_sizes[0] * 2};
94 size_t n = input_sizes[0] /
sizeof(T);
95 size_t values_bytes = n *
sizeof(T);
96 size_t values_aligned = (values_bytes + 3u) & ~3u;
97 return {
sizeof(uint32_t) + values_aligned + n *
sizeof(uint32_t)};
102 completePendingSync();
103 return {{
"output", actual_output_sizes_.empty() ? 0 : actual_output_sizes_[0]}};
106 completePendingSync();
107 return (index == 0 && !actual_output_sizes_.empty()) ? actual_output_sizes_[0] : 0;
116 return static_cast<uint8_t
>(getDataTypeEnum());
120 return static_cast<uint8_t
>(getDataTypeEnum());
123 size_t serializeHeader(
size_t output_index, uint8_t* header_buffer,
size_t max_size)
const override {
125 const size_t needed =
sizeof(
DataType) +
sizeof(uint32_t);
126 if (max_size < needed)
return 0;
128 std::memcpy(header_buffer, &dt,
sizeof(
DataType));
129 std::memcpy(header_buffer +
sizeof(
DataType), &cached_num_elements_,
sizeof(uint32_t));
134 if (size >=
sizeof(
DataType) +
sizeof(uint32_t))
135 std::memcpy(&cached_num_elements_, header_buffer +
sizeof(
DataType),
sizeof(uint32_t));
140 return sizeof(
DataType) +
sizeof(uint32_t);
149 uint32_t cached_num_elements_ = 0;
153 uint8_t* d_is_boundary_ =
nullptr;
154 uint32_t* d_boundary_scan_ =
nullptr;
155 uint32_t* d_boundary_positions_ =
nullptr;
156 T* d_values_scratch_ =
nullptr;
157 uint32_t* d_lengths_scratch_ =
nullptr;
158 size_t fwd_scratch_n_ = 0;
160 bool fwd_from_pool_ =
false;
165 mutable uint32_t* h_num_runs_ =
nullptr;
166 mutable bool fwd_sync_pending_ =
false;
167 mutable cudaStream_t fwd_last_stream_ =
nullptr;
168 mutable std::vector<size_t> actual_output_sizes_;
173 void completePendingSync()
const {
174 if (!fwd_sync_pending_)
return;
175 cudaStreamSynchronize(fwd_last_stream_);
176 const uint32_t num_runs = *h_num_runs_;
177 const size_t values_bytes = num_runs *
sizeof(T);
178 const size_t values_aligned = (values_bytes + 3) & ~3;
179 actual_output_sizes_ = {
180 sizeof(uint32_t) + values_aligned + num_runs *
sizeof(uint32_t)
182 fwd_sync_pending_ =
false;
184 const size_t in_bytes =
static_cast<size_t>(cached_num_elements_) *
sizeof(T);
185 const size_t out_bytes = actual_output_sizes_[0];
186 const float ratio = in_bytes > 0
187 ?
static_cast<float>(in_bytes) /
static_cast<float>(out_bytes) : 0.0f;
188 FZ_LOG(
DEBUG,
"RLE encode: %u runs / %u elems %.1f KB -> %.1f KB ratio %.2fx",
189 num_runs, cached_num_elements_,
190 in_bytes / 1024.0f, out_bytes / 1024.0f, ratio);
195 if (std::is_same<T, uint8_t>::value)
return DataType::UINT8;
196 if (std::is_same<T, uint16_t>::value)
return DataType::UINT16;
197 if (std::is_same<T, uint32_t>::value)
return DataType::UINT32;
198 if (std::is_same<T, uint64_t>::value)
return DataType::UINT64;
199 if (std::is_same<T, int8_t>::value)
return DataType::INT8;
200 if (std::is_same<T, int16_t>::value)
return DataType::INT16;
201 if (std::is_same<T, int32_t>::value)
return DataType::INT32;
202 if (std::is_same<T, int64_t>::value)
return DataType::INT64;
203 if (std::is_same<T, float>::value)
return DataType::FLOAT32;
204 if (std::is_same<T, double>::value)
return DataType::FLOAT64;
205 return DataType::UINT8;
void execute(cudaStream_t stream, MemoryPool *pool, const std::vector< void * > &inputs, const std::vector< void * > &outputs, const std::vector< size_t > &sizes) override