// Extracted from Doxygen page: FZGPUModules/adm__kernels_8h_source.html

#pragma once

// Internal header — not part of the public API.

// Shared constants, scratch-pointer struct, and host wrapper declarations

// for the ADM (Adaptive Data Mapping) encode/decode kernels.


#include <cuda_runtime.h>

#include <cstdint>

#include <cstddef>


namespace fz {

namespace adm {


// ── Shared constants ──────────────────────────────────────────────────────────


// Block structure. One warp-block is a 32-thread warp in which each thread
// owns 16 elements, giving 32 × 16 = 512 elements per warp-block.
static constexpr int kBlockThreads = 32;
static constexpr int kChunk = 16;
// NOTE(review): the name suggests a decompression-side chunk size; confirm
// against the kernels, which are not visible from this header.
static constexpr int kDecmpChunk = 32;
static constexpr int kBlockElems = kChunk * kBlockThreads;  // = 512

// Upper bound, counted in warps, on the grid size served by the decoupled
// look-back prefix sum. Larger grids take the Thrust fallback instead.
static constexpr int kDecoupledMaxGsize = 1024;

// Shift applied to center-relative codes: with a shift of 1, the code value 1
// stands for "equal to center".
static constexpr int kShift = 1;

// Upper bound on the number of signal bytes produced per input element,
// for 16-bit and 32-bit inputs respectively.
static constexpr int kMaxSignalBytesU16 = 2;
static constexpr int kMaxSignalBytesU32 = 4;


// ── Size helpers ──────────────────────────────────────────────────────────────


inline size_t adm_gsize(size_t n) {

    return (n + static_cast<size_t>(kBlockElems) - 1) / kBlockElems;

}


// Number of bytes required to hold `gsize` one-bit flags, i.e. ceil(gsize / 8).
// Returns 0 for gsize == 0.
//
// Uses quotient plus a remainder test so that the result stays correct for
// every size_t input; the (gsize + 7) / 8 form wraps around to 0 when gsize is
// within 7 of SIZE_MAX.
inline size_t adm_flags_bytes(size_t gsize) {
    return gsize / 8 + (gsize % 8 != 0 ? 1 : 0);
}


// Number of 32-bit words needed to cover adm_flags_bytes(gsize) bytes,
// rounded up to a whole word.
inline size_t adm_flags_words(size_t gsize) {
    constexpr size_t word_bytes = sizeof(uint32_t);
    const size_t flag_bytes = adm_flags_bytes(gsize);
    return (flag_bytes + word_bytes - 1) / word_bytes;
}


// ── Scratch pointer bundle ────────────────────────────────────────────────────

// Bundle of scratch pointers handed to the ADM host wrappers.
// Every member refers to a pool-managed device allocation. The stage sets
// them up ahead of time in onFinalize()/initScratch() and keeps reusing the
// same buffers on each execute() call. The note above each member gives the
// buffer's expected size.
struct AdmScratch {
    // gsize × sizeof(int)
    int* d_signal_length;

    // (gsize+1) × sizeof(int)
    int* d_output_lengths;

    // gsize × sizeof(uint16_t or uint32_t)
    void* d_centers;

    // adm_flags_words(gsize) × sizeof(uint32_t)
    uint32_t* d_block_flags;

    // num_elements × 1
    uint8_t* d_codes;

    // num_elements × kMaxSignalBytes
    uint8_t* d_concat_signals;

    // num_elements × kMaxSignalBytes (thrust path)
    uint8_t* d_bit_signals;

    // (gsize+1) × sizeof(int) (decoupled path)
    int* d_loc_offset;

    // (gsize+1) × sizeof(int) (decoupled path)
    int* d_prefix_state;

    // 1 word; written by kernels only in debug builds
    unsigned int* d_overflow_flag;
};


// ── u16 wrappers ──────────────────────────────────────────────────────────────


// Host wrapper declaration for the ADM encode of 16-bit elements; the
// definition is not part of this header.
//   d_input, num_elements : input values and how many there are.
//   d_output              : destination byte buffer.
//   output_size           : passed by non-const reference; presumably receives
//                           the number of bytes produced (TODO confirm against
//                           the definition).
//   s                     : scratch buffers, see AdmScratch.
//   stream                : CUDA stream handed to the wrapper.
// NOTE(review): the d_ prefix suggests d_input/d_output are device pointers;
// confirm against the definition.
void compress_u16(

    const uint16_t* d_input, size_t num_elements,

    uint8_t* d_output, size_t& output_size,

    const AdmScratch& s, cudaStream_t stream);


// Host wrapper declaration for the ADM decode into 16-bit elements; the
// definition is not part of this header.
//   d_input, input_size   : encoded byte buffer and its size
//                           (presumably in bytes; TODO confirm).
//   d_output, num_elements: destination buffer and the element count.
//   s                     : scratch buffers, see AdmScratch.
//   stream                : CUDA stream handed to the wrapper.
// NOTE(review): the d_ prefix suggests d_input/d_output are device pointers;
// confirm against the definition.
void decompress_u16(

    const uint8_t* d_input, size_t input_size,

    uint16_t* d_output, size_t num_elements,

    const AdmScratch& s, cudaStream_t stream);


// Declaration only; takes the element count of a 16-bit input.
// NOTE(review): by its name this returns an upper bound, in bytes, on the
// encoded payload for that many elements; confirm what "payload" covers
// against the definition.
size_t get_max_u16_payload_bytes(size_t num_elements);


// ── u32 wrappers ──────────────────────────────────────────────────────────────


// Host wrapper declaration for the ADM encode of 32-bit elements; the
// definition is not part of this header.
//   d_input, num_elements : input values and how many there are.
//   d_output              : destination byte buffer.
//   output_size           : passed by non-const reference; presumably receives
//                           the number of bytes produced (TODO confirm against
//                           the definition).
//   s                     : scratch buffers, see AdmScratch.
//   stream                : CUDA stream handed to the wrapper.
// NOTE(review): the d_ prefix suggests d_input/d_output are device pointers;
// confirm against the definition.
void compress_u32(

    const uint32_t* d_input, size_t num_elements,

    uint8_t* d_output, size_t& output_size,

    const AdmScratch& s, cudaStream_t stream);


// Host wrapper declaration for the ADM decode into 32-bit elements; the
// definition is not part of this header.
//   d_input, input_size   : encoded byte buffer and its size
//                           (presumably in bytes; TODO confirm).
//   d_output, num_elements: destination buffer and the element count.
//   s                     : scratch buffers, see AdmScratch.
//   stream                : CUDA stream handed to the wrapper.
// NOTE(review): the d_ prefix suggests d_input/d_output are device pointers;
// confirm against the definition.
void decompress_u32(

    const uint8_t* d_input, size_t input_size,

    uint32_t* d_output, size_t num_elements,

    const AdmScratch& s, cudaStream_t stream);


// Declaration only; takes the element count of a 32-bit input.
// NOTE(review): by its name this returns an upper bound, in bytes, on the
// encoded payload for that many elements; confirm what "payload" covers
// against the definition.
size_t get_max_u32_payload_bytes(size_t num_elements);


} // namespace adm

} // namespace fz

// Doxygen cross-reference: fz — Definition fzm_format.h:25