23#include <cuda_runtime.h>
26namespace adaptive_bitpack {
32 size_t num_elements = 0;
33 uint32_t block_size = 32;
39inline Config configure(
size_t num_elements, uint32_t block_size,
40 bool outlier_selection =
false) {
42 c.num_elements = num_elements;
43 c.block_size = block_size;
47 : (num_elements + block_size - 1) / block_size;
64void launchEncodeRate(
const T* d_in,
const Config& c,
65 uint8_t* d_rate, uint32_t* d_cost, cudaStream_t stream);
70void launchEncodePack(
const T* d_in,
const Config& c,
71 const uint8_t* d_rate,
const uint32_t* d_offset,
72 uint8_t* d_payload, cudaStream_t stream);
76void launchDecodeCost(
const uint8_t* d_rate,
const Config& c,
77 uint32_t* d_cost, cudaStream_t stream);
81void launchDecodeUnpack(
const uint8_t* d_rate,
const uint32_t* d_offset,
82 const uint8_t* d_payload,
const Config& c,
83 T* d_out, cudaStream_t stream);
91void launchEncodeRateOutlier(
const T* d_in,
const Config& c,
92 uint8_t* d_meta, uint32_t* d_cost, cudaStream_t stream);
94void launchEncodePackOutlier(
const T* d_in,
const Config& c,
95 const uint8_t* d_meta,
const uint32_t* d_offset,
96 uint8_t* d_payload, cudaStream_t stream);
97void launchDecodeCostOutlier(
const uint8_t* d_meta,
const Config& c,
98 uint32_t* d_cost, cudaStream_t stream);
100void launchDecodeUnpackOutlier(
const uint8_t* d_meta,
const uint32_t* d_offset,
101 const uint8_t* d_payload,
const Config& c,
102 T* d_out, cudaStream_t stream);
size_t maxArchiveBytes(const Config &c, unsigned bits_per_elem)
Definition adaptive_bitpack_kernels.h:55
Definition fzm_format.h:25
Definition adaptive_bitpack_kernels.h:31
uint32_t meta_bytes
per-block metadata stride (1 plain, 2 outlier)
Definition adaptive_bitpack_kernels.h:36
size_t num_blocks
ceil(num_elements / block_size)
Definition adaptive_bitpack_kernels.h:35
uint32_t word_bytes
ceil(block_size / 8)
Definition adaptive_bitpack_kernels.h:34