FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
hf_buf.h
// Adapted from PHF reference (origin/v1.1.0_dev:modules/codec/huffman/hf_hl.hh)
// Changes:
// - Replaced #include "mem/cxx_smart_ptr.h" with local RAII wrappers below.
// - Removed #include "err.hh" (not needed in header; hf_buf.cc uses cuda_check.h).
// - Removed HuffmanCodec<E> class (not used; we use Buf<E> + high_level<E> directly).
// - Removed timer/io includes (not needed).
// - Buf<E> refactored to use pool-managed raw pointers instead of unique_ptr members.
//   Constructor takes MemoryPool* and allocates via allocatePersistentDevice/Pinned.
//   Destructor returns all allocations to the pool via freePersistentDevice/Pinned.
//   The pool is the sole owner; Buf<E> holds non-owning raw pointers.
11
12#pragma once
13
14#include <cuda_runtime.h>
15#include <cstddef>
16#include <cstdint>
17
18#include "hf.h"
19#include "hf_impl.hh"
20#include "mem/mempool.h"
21
22// ── HuffmanHelper (used by hf_kernels.cu) ────────────────────────────────────
23
/// Compile-time tuning constants for the Huffman kernels.
///
/// The section header in this file states these are consumed by
/// hf_kernels.cu. All members are integral constant expressions, so they can
/// be used as array bounds, template arguments, and launch dimensions.
struct HuffmanHelper {
    /// NOTE(review): presumably threads per block for the encode kernel —
    /// confirm against the launch site in hf_kernels.cu.
    static constexpr int BLOCK_DIM_ENCODE = 256;

    /// NOTE(review): presumably threads per block for the deflate kernel —
    /// confirm against the launch site in hf_kernels.cu.
    static constexpr int BLOCK_DIM_DEFLATE = 256;

    /// NOTE(review): looks like a per-thread work factor for encoding —
    /// verify its exact meaning where it is used.
    static constexpr int ENC_SEQUENTIALITY = 4;

    /// NOTE(review): tuning factor for the deflate stage — verify its exact
    /// meaning where it is used.
    static constexpr int DEFLATE_CONSTANT = 4;
};
30
31// ── HF_SPACE / HF_STREAM convenience macros ──────────────────────────────────
32
// HF_SPACE expands to phf::Buf<E>; it is only usable where a type named `E`
// (normally the enclosing template's symbol-type parameter) is in scope.
#define HF_SPACE phf::Buf<E>

// HF_STREAM is the opaque stream handle type taken by the phf::high_level<E>
// entry points. NOTE(review): presumably a CUDA stream passed type-erased as
// void* — confirm against the implementation.
// Caution: this is a textual macro, so `HF_STREAM a, b;` declares `b` as
// plain `void`, not `void*`; declare one variable per statement.
#define HF_STREAM void*
35
36// ── phf::Buf<E> ──────────────────────────────────────────────────────────────
37
38namespace phf {
39
40template <typename E>
41struct Buf {
42 using H4 = uint32_t;
43 using M = PHF_METADATA;
44
45 struct RC {
46 static const int SCRATCH = 0;
47 static const int FREQ = 1;
48 static const int BK = 2;
49 static const int REVBK = 3;
50 static const int PAR_NBIT = 4;
51 static const int PAR_NCELL = 5;
52 static const int PAR_ENTRY = 6;
53 static const int BITSTREAM = 7;
54 static const int END = 8;
55 };
56
57 struct memcpy_helper {
58 void* const ptr;
59 size_t const nbyte;
60 size_t const dst;
61 };
62
63 using SYM = E;
64 using Header = phf_header;
65
66 // ── Fields ────────────────────────────────────────────────────────────────
67 const size_t len;
68 size_t pardeg;
69 size_t sublen;
70 const size_t bklen;
71 const bool use_HFR;
72 const size_t revbk4_bytes;
73 const size_t bitstream_max_len;
74
75 uint16_t rt_bklen;
76 int numSMs;
77 size_t total_footprint_d = 0;
78 size_t total_footprint_h = 0;
79
80 // Device scratch — raw pointers, allocated from pool_ on construction
81 H4* d_scratch4;
82 H4* h_scratch4;
83 PHF_BYTE* d_encoded; // alias into d_scratch4 (not a separate allocation)
84 PHF_BYTE* h_encoded; // alias into h_scratch4 (not a separate allocation)
85
86 H4* d_bitstream4;
87 H4* h_bitstream4;
88
89 H4* d_bk4;
90 H4* h_bk4;
91 PHF_BYTE* d_revbk4;
92 PHF_BYTE* h_revbk4;
93
94 // Per-partition metadata
95 M* d_par_nbit;
96 M* h_par_nbit;
97 M* d_par_ncell;
98 M* h_par_ncell;
99 M* d_par_entry;
100 M* h_par_entry;
101
102 // Histogram buffers — pre-allocated for forward execute; size = bklen each
103 uint32_t* d_freq;
104 uint32_t* h_freq;
105
106 // Fine-path async totals: populated after GPU_fine_encode; read after caller sync.
107 // Null when use_HFR is false.
108 uint64_t* d_total_nbit;
109 uint64_t* d_total_ncell;
110 uint64_t* h_total_nbit;
111 uint64_t* h_total_ncell;
112
113 // CUB temp storage for GPU_encode_scan (ExclusiveSum). Null when use_HFR is false.
114 uint8_t* d_cub_temp;
115 size_t cub_temp_bytes;
116
117 // ── Static helpers ────────────────────────────────────────────────────────
118 static int _revbk4_bytes(int bklen);
119 static int _revbk8_bytes(int bklen);
120
121 // Non-copyable, non-movable
122 Buf(const Buf&) = delete;
123 Buf& operator=(const Buf&) = delete;
124 Buf(Buf&&) = delete;
125 Buf& operator=(Buf&&) = delete;
126
127 // ── Constructor / destructor ──────────────────────────────────────────────
128
134 Buf(size_t inlen, size_t _bklen, fz::MemoryPool* pool,
135 int _pardeg = -1, bool _use_HFR = false);
136 ~Buf();
137
138 // ── Mutators ──────────────────────────────────────────────────────────────
139 void register_runtime_bklen(int _rt_bklen) { rt_bklen = _rt_bklen; }
140
141 void memcpy_merge(phf_header& header, phf_stream_t stream);
142 void clear_buffer();
143
144private:
145 fz::MemoryPool* pool_; // non-owning; used only in destructor to return allocations
146};
147
148// ── phf::high_level<E> ───────────────────────────────────────────────────────
149
150template <typename E>
151struct high_level {
152 // Build codebook from host histogram; H2D copies codebook and revbook.
153 static int build_book(Buf<E>* buf, uint32_t* h_hist,
154 uint16_t rt_bklen, HF_STREAM stream);
155
156 // GPU coarse encode: histogram must already be done (histogram D2H happened
157 // outside this function to fill h_hist before build_book was called).
158 // Output lives at buf->d_encoded; *outlen = phf_encoded_bytes(&header).
159 static int encode(Buf<E>* buf, E* in_data, size_t data_len,
160 uint8_t** out_encoded, size_t* encoded_len,
161 phf_header& header, HF_STREAM stream);
162
163 // GPU coarse decode: reads phf_header from in_encoded[0..127],
164 // reconstructs symbols into out_decoded.
165 static int decode(Buf<E>* buf, phf_header& header,
166 PHF_BYTE* in_encoded, E* out_decoded, HF_STREAM stream);
167};
168
169} // namespace phf
Definition mempool.h:82
Stream-ordered CUDA memory pool for pipeline buffer management.