FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
adaptive_bitpack_kernels.h
Go to the documentation of this file.
1#pragma once
2
21#include <cstddef>
22#include <cstdint>
23#include <cuda_runtime.h>
24
25namespace fz {
26namespace adaptive_bitpack {
27
/// Block geometry of an adaptive bit-packing archive.
struct Config {
    size_t   num_elements = 0;   ///< element count as passed to configure()
    uint32_t block_size   = 32;  ///< block size as passed to configure()
    uint32_t word_bytes   = 4;   ///< ceil(block_size / 8)
    size_t   num_blocks   = 0;   ///< ceil(num_elements / block_size)
    uint32_t meta_bytes   = 1;   ///< per-block metadata stride (1 plain, 2 outlier)
};

/// Derives the block geometry for `num_elements` inputs split into blocks of
/// `block_size` elements.
///
/// @param num_elements       total number of input elements
/// @param block_size         elements per block; 0 yields `num_blocks == 0`
///                           and `word_bytes == 0`
/// @param outlier_selection  when true, `meta_bytes` is 2 instead of 1
/// @return the populated Config
inline Config configure(size_t num_elements, uint32_t block_size,
                        bool outlier_selection = false) {
    Config c;
    c.num_elements = num_elements;
    c.block_size   = block_size;

    // Ceil-divide as quotient + (remainder != 0). The usual `(x + d - 1) / d`
    // form wraps around for x close to the type's maximum and then yields a
    // count that is far too small.
    c.word_bytes = block_size / 8u + ((block_size % 8u) != 0u ? 1u : 0u);
    c.num_blocks = (block_size == 0)
        ? 0
        : num_elements / block_size
              + ((num_elements % block_size) != 0 ? 1 : 0);

    c.meta_bytes = outlier_selection ? 2u : 1u;
    return c;
}
51
55inline size_t maxArchiveBytes(const Config& c, unsigned bits_per_elem) {
56 return c.num_blocks * static_cast<size_t>(c.meta_bytes)
57 + c.num_blocks * static_cast<size_t>(c.word_bytes) * (bits_per_elem + 1);
58}
59
60// ── Encode ────────────────────────────────────────────────────────────────
61// Pass A: write the per-block rate byte into `d_rate` (the archive's rate
62// region) and the per-block payload byte cost into `d_cost`.
// Declaration only; the definition is not visible in this header.
//   d_in   - input elements of type T (read-only)
//   c      - block geometry (see Config)
//   d_rate - output: one rate byte per block
//   d_cost - output: payload byte cost of each block
//   stream - presumably the CUDA stream the pass is enqueued on (TODO confirm)
// NOTE(review): the d_ prefix suggests device pointers; confirm at call sites.
63template<typename T>
64void launchEncodeRate(const T* d_in, const Config& c,
65 uint8_t* d_rate, uint32_t* d_cost, cudaStream_t stream);
66
67// Pass B: pack sign + bit-planes for each block at `d_payload + d_offset[b]`,
68// where `d_offset` is the exclusive scan of `d_cost`.
// Declaration only; the definition is not visible in this header.
//   d_in      - input elements of type T (read-only)
//   c         - block geometry (see Config)
//   d_rate    - per-block rate bytes (read-only)
//   d_offset  - exclusive scan of the per-block costs; block b is written at
//               d_payload + d_offset[b]
//   d_payload - output: packed payload bytes
//   stream    - presumably the CUDA stream the pass is enqueued on (TODO confirm)
69template<typename T>
70void launchEncodePack(const T* d_in, const Config& c,
71 const uint8_t* d_rate, const uint32_t* d_offset,
72 uint8_t* d_payload, cudaStream_t stream);
73
74// ── Decode ────────────────────────────────────────────────────────────────
75// Pass A: per-block payload byte cost from the rate region (for the scan).
// Declaration only; the definition is not visible in this header.
//   d_rate - rate region of the archive (read-only)
//   c      - block geometry (see Config)
//   d_cost - output: payload byte cost of each block, to be scanned into offsets
//   stream - presumably the CUDA stream the pass is enqueued on (TODO confirm)
76void launchDecodeCost(const uint8_t* d_rate, const Config& c,
77 uint32_t* d_cost, cudaStream_t stream);
78
79// Pass B: unpack each block from `d_payload + d_offset[b]` into `d_out`.
// Declaration only; the definition is not visible in this header.
//   d_rate    - per-block rate bytes (read-only)
//   d_offset  - per-block byte offsets into d_payload (read-only)
//   d_payload - packed payload bytes (read-only); block b starts at
//               d_payload + d_offset[b]
//   c         - block geometry (see Config)
//   d_out     - output: unpacked elements of type T
//   stream    - presumably the CUDA stream the pass is enqueued on (TODO confirm)
80template<typename T>
81void launchDecodeUnpack(const uint8_t* d_rate, const uint32_t* d_offset,
82 const uint8_t* d_payload, const Config& c,
83 T* d_out, cudaStream_t stream);
84
85// ── Outlier-selection mode (cuSZp2) ─────────────────────────────────────────
86// Per block, choose the cheaper of (a) plain packing of all elements or
87// (b) extracting element 0 as a 1..sizeof(T)-byte raw outlier and packing only
88// elements 1..n-1. Metadata is 2 bytes per block: [rate][sel], where sel bit0 =
89// is_outlier and (when set) sel bits1-2 = outlier_byte_num - 1.
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably the outlier-mode counterpart of launchEncodeRate,
// with d_meta (non-const) receiving the 2-byte [rate][sel] metadata per block
// and d_cost receiving the per-block payload byte cost; confirm against the
// definition.
90template<typename T>
91void launchEncodeRateOutlier(const T* d_in, const Config& c,
92 uint8_t* d_meta, uint32_t* d_cost, cudaStream_t stream);
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably the outlier-mode counterpart of launchEncodePack,
// reading the per-block metadata from d_meta and the per-block byte offsets
// from d_offset, and writing packed bytes into d_payload; confirm against the
// definition.
93template<typename T>
94void launchEncodePackOutlier(const T* d_in, const Config& c,
95 const uint8_t* d_meta, const uint32_t* d_offset,
96 uint8_t* d_payload, cudaStream_t stream);
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably the outlier-mode counterpart of launchDecodeCost,
// deriving each block's payload byte cost (d_cost, output) from the per-block
// metadata in d_meta (read-only); confirm against the definition.
97void launchDecodeCostOutlier(const uint8_t* d_meta, const Config& c,
98 uint32_t* d_cost, cudaStream_t stream);
// Declaration only; the definition is not visible in this header.
// NOTE(review): presumably the outlier-mode counterpart of launchDecodeUnpack,
// reading d_meta, d_offset and d_payload (all const) and writing the decoded
// elements of type T to d_out; confirm against the definition.
99template<typename T>
100void launchDecodeUnpackOutlier(const uint8_t* d_meta, const uint32_t* d_offset,
101 const uint8_t* d_payload, const Config& c,
102 T* d_out, cudaStream_t stream);
103
104} // namespace adaptive_bitpack
105} // namespace fz
size_t maxArchiveBytes(const Config &c, unsigned bits_per_elem)
Definition adaptive_bitpack_kernels.h:55
Definition fzm_format.h:25
Definition adaptive_bitpack_kernels.h:31
uint32_t meta_bytes
per-block metadata stride (1 plain, 2 outlier)
Definition adaptive_bitpack_kernels.h:36
size_t num_blocks
ceil(num_elements / block_size)
Definition adaptive_bitpack_kernels.h:35
uint32_t word_bytes
ceil(block_size / 8)
Definition adaptive_bitpack_kernels.h:34