FZGPUModules 2.0
GPU-accelerated modular compression pipelines
Loading...
Searching...
No Matches
adm_kernels.h
1#pragma once
2// Internal header — not part of the public API.
3// Shared constants, scratch-pointer struct, and host wrapper declarations
4// for the ADM (Adaptive Data Mapping) encode/decode kernels.
5
6#include <cuda_runtime.h>
7#include <cstdint>
8#include <cstddef>
9
10namespace fz {
11namespace adm {
12
13// ── Shared constants ──────────────────────────────────────────────────────────
14
// Warp-block geometry: one warp of 32 threads, 16 elements handled per thread,
// giving 512 elements per warp-block.
static constexpr int kBlockThreads{32};
static constexpr int kChunk{16};
// NOTE(review): presumably the per-thread chunk on the decode side — confirm
// against the decompression kernels.
static constexpr int kDecmpChunk{32};
static constexpr int kBlockElems{kChunk * kBlockThreads};  // 512

// Upper limit (in warps) for the decoupled look-back prefix sum; larger grids
// take the Thrust fallback instead.
static constexpr int kDecoupledMaxGsize{1024};

// Shift applied to center-relative codes (with 1, code 1 means "equal to center").
static constexpr int kShift{1};

// Largest number of signal bytes a single input element can need.
static constexpr int kMaxSignalBytesU16{2};
static constexpr int kMaxSignalBytesU32{4};
30
31// ── Size helpers ──────────────────────────────────────────────────────────────
32
33inline size_t adm_gsize(size_t n) {
34 return (n + static_cast<size_t>(kBlockElems) - 1) / kBlockElems;
35}
36
// Number of bytes needed to hold gsize bits, packed 8 per byte and rounded up.
// Returns 0 when gsize == 0.
//
// Uses quotient plus a remainder test instead of (gsize + 7) / 8 so the result
// stays correct for gsize within 7 of the size_t maximum, where the addition
// would wrap around.
inline size_t adm_flags_bytes(size_t gsize) {
    const size_t whole_bytes = gsize / 8;
    const size_t has_partial = (gsize % 8 != 0) ? 1 : 0;
    return whole_bytes + has_partial;
}
40
41inline size_t adm_flags_words(size_t gsize) {
42 return (adm_flags_bytes(gsize) + sizeof(uint32_t) - 1) / sizeof(uint32_t);
43}
44
45// ── Scratch pointer bundle ────────────────────────────────────────────────────
// Bundle of scratch-buffer pointers handed to the ADM wrappers. Every member
// is a pool-managed device allocation: the stage sets them up once in
// onFinalize()/initScratch() and reuses them on each execute() call.
// The comment above each member gives the extent the buffer is expected to have.
struct AdmScratch {
    // gsize × sizeof(int)
    int* d_signal_length;

    // (gsize+1) × sizeof(int)
    int* d_output_lengths;

    // gsize × sizeof(uint16_t or uint32_t)
    void* d_centers;

    // adm_flags_words(gsize) × sizeof(uint32_t)
    uint32_t* d_block_flags;

    // num_elements × 1
    uint8_t* d_codes;

    // num_elements × kMaxSignalBytes
    uint8_t* d_concat_signals;

    // num_elements × kMaxSignalBytes (thrust path)
    uint8_t* d_bit_signals;

    // (gsize+1) × sizeof(int) (decoupled path)
    int* d_loc_offset;

    // (gsize+1) × sizeof(int) (decoupled path)
    int* d_prefix_state;

    // 1 word; written by kernels only in debug builds
    unsigned int* d_overflow_flag;
};
61
62// ── u16 wrappers ──────────────────────────────────────────────────────────────
63
64void compress_u16(
65 const uint16_t* d_input, size_t num_elements,
66 uint8_t* d_output, size_t& output_size,
67 const AdmScratch& s, cudaStream_t stream);
68
69void decompress_u16(
70 const uint8_t* d_input, size_t input_size,
71 uint16_t* d_output, size_t num_elements,
72 const AdmScratch& s, cudaStream_t stream);
73
// Declaration only. Judging by the name, returns an upper bound in bytes on the
// payload for num_elements 16-bit inputs — confirm against the definition.
size_t get_max_u16_payload_bytes(size_t num_elements);
75
76// ── u32 wrappers ──────────────────────────────────────────────────────────────
77
78void compress_u32(
79 const uint32_t* d_input, size_t num_elements,
80 uint8_t* d_output, size_t& output_size,
81 const AdmScratch& s, cudaStream_t stream);
82
83void decompress_u32(
84 const uint8_t* d_input, size_t input_size,
85 uint32_t* d_output, size_t num_elements,
86 const AdmScratch& s, cudaStream_t stream);
87
// Declaration only. Judging by the name, returns an upper bound in bytes on the
// payload for num_elements 32-bit inputs — confirm against the definition.
size_t get_max_u32_payload_bytes(size_t num_elements);
89
90} // namespace adm
91} // namespace fz
Definition fzm_format.h:25