Skip to content

Commit

Permalink
feat: separate Compressor and Decompressor (#11)
Browse files Browse the repository at this point in the history
As part of implementing the vortex `FSSTArray`, we should separate the
data needed for compression (symbol table, hash table, `codes_twobyte`
vector) from the data needed for decompression (just the symbol table).

This PR splits the previous `SymbolTable` type into two types: a
`Compressor` and a `Decompressor<'a>`. A `Compressor` can be trained on a
sample text, as before, but it no longer has decompression methods;
those now live on the new `Decompressor` type. A `Decompressor` can either
be built directly from a `Compressor` or built by directly
wrapping a `&[Symbol]`. A `Compressor` also allows slice access to its
symbol table, e.g. for serialization.
  • Loading branch information
a10y committed Aug 16, 2024
1 parent 7ca1002 commit 812a42d
Show file tree
Hide file tree
Showing 10 changed files with 168 additions and 110 deletions.
3 changes: 3 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@ license = "Apache-2.0"
repository = "https://github.com/spiraldb/fsst"
edition = "2021"

[lib]
name = "fsst"

[lints.rust]
warnings = "deny"
missing_docs = "deny"
Expand Down
15 changes: 8 additions & 7 deletions benches/compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use core::str;

use criterion::{black_box, criterion_group, criterion_main, Criterion};

use fsst_rs::{train, ESCAPE_CODE};
use fsst::{Compressor, ESCAPE_CODE};

const CORPUS: &str = include_str!("dracula.txt");
const TEST: &str = "I found my smattering of German very useful here";
Expand All @@ -17,31 +17,32 @@ fn bench_fsst(c: &mut Criterion) {
let mut group = c.benchmark_group("fsst");
group.bench_function("train", |b| {
let corpus = CORPUS.as_bytes();
b.iter(|| black_box(train(black_box(corpus))));
b.iter(|| black_box(Compressor::train(black_box(corpus))));
});

let table = train(CORPUS);
let compressor = Compressor::train(CORPUS);
let plaintext = TEST.as_bytes();

let compressed = table.compress(plaintext);
let compressed = compressor.compress(plaintext);
let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count();
let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
println!(
"Escapes = {escape_count}/{}, compression_ratio = {ratio}",
compressed.len()
);

let decompressed = table.decompress(&compressed);
let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();
println!("DECODED: {}", decompressed);
assert_eq!(decompressed, TEST);

group.bench_function("compress-single", |b| {
b.iter(|| black_box(table.compress(black_box(plaintext))));
b.iter(|| black_box(compressor.compress(black_box(plaintext))));
});

group.bench_function("decompress-single", |b| {
b.iter(|| black_box(table.decompress(black_box(&compressed))));
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
}

Expand Down
4 changes: 3 additions & 1 deletion examples/file_compressor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use std::{
path::Path,
};

use fsst::Compressor;

fn main() {
let args: Vec<_> = std::env::args().skip(1).collect();
assert!(args.len() >= 2, "args TRAINING and FILE must be provided");
Expand All @@ -33,7 +35,7 @@ fn main() {
}

println!("building the compressor from {train_path:?}...");
let compressor = fsst_rs::train(&train_bytes);
let compressor = Compressor::train(&train_bytes);

println!("compressing blocks of {input_path:?} with compressor...");

Expand Down
6 changes: 4 additions & 2 deletions examples/round_trip.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,16 @@

use core::str;

use fsst::Compressor;

fn main() {
// Train on a sample.
let sample = "the quick brown fox jumped over the lazy dog";
let trained = fsst_rs::train(sample.as_bytes());
let trained = Compressor::train(sample.as_bytes());
let compressed = trained.compress(sample.as_bytes());
println!("compressed: {} => {}", sample.len(), compressed.len());
// decompress now
let decode = trained.decompress(&compressed);
let decode = trained.decompressor().decompress(&compressed);
let output = str::from_utf8(&decode).unwrap();
println!(
"decoded to the original: len={} text='{}'",
Expand Down
7 changes: 4 additions & 3 deletions fuzz/fuzz_targets/fuzz_compress.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes());
let compress = table.compress(data);
let decompress = table.decompress(&compress);
let compressor =
fsst::Compressor::train("the quick brown fox jumped over the lazy dog".as_bytes());
let compress = compressor.compress(data);
let decompress = compressor.decompressor().decompress(&compress);
assert_eq!(&decompress, data);
});
2 changes: 1 addition & 1 deletion fuzz/fuzz_targets/fuzz_train.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
let _ = fsst_rs::train(data);
let _ = fsst::Compressor::train(data);
});
58 changes: 30 additions & 28 deletions src/builder.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//! Functions and types used for building a [`SymbolTable`] from a corpus of text.
//! Functions and types used for building a [`Compressor`] from a corpus of text.
//!
//! This module implements the logic from Algorithm 3 of the [FSST Paper].
//!
Expand All @@ -8,7 +8,7 @@ use std::cmp::Ordering;
use std::collections::BinaryHeap;

use crate::find_longest::FindLongestSymbol;
use crate::{Symbol, SymbolTable, MAX_CODE};
use crate::{Compressor, Symbol, MAX_CODE};

#[derive(Debug, Clone)]
struct Counter {
Expand Down Expand Up @@ -53,31 +53,33 @@ impl Counter {
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub const MAX_GENERATIONS: usize = 5;

/// Build and train a `SymbolTable` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
/// code).
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable {
let mut table = SymbolTable::default();
// TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
let sample = corpus.as_ref();
if sample.is_empty() {
return table;
}
for _generation in 0..MAX_GENERATIONS {
let counter = table.compress_count(sample);
table = table.optimize(counter);
}
impl Compressor {
/// Build and train a `Compressor` from a sample corpus of text.
///
/// This function implements the generational algorithm described in the [FSST paper] Section
/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
/// code).
///
/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
pub fn train(corpus: impl AsRef<[u8]>) -> Self {
let mut compressor = Self::default();
        // TODO(aduffy): handle truncating/sampling if the corpus is larger than the required sample size.
let sample = corpus.as_ref();
if sample.is_empty() {
return compressor;
}
for _generation in 0..MAX_GENERATIONS {
let counter = compressor.compress_count(sample);
compressor = compressor.optimize(counter);
}

table
compressor
}
}

impl SymbolTable {
impl Compressor {
/// Compress the text using the current symbol table. Count the code occurrences
/// and code-pair occurrences to allow us to calculate apparent gain.
fn compress_count(&self, sample: &[u8]) -> Counter {
Expand All @@ -101,7 +103,7 @@ impl SymbolTable {
/// Using a set of counters and the existing set of symbols, build a new
/// set of symbols/codes that optimizes the gain over the distribution in `counter`.
fn optimize(&self, counters: Counter) -> Self {
let mut res = SymbolTable::default();
let mut res = Compressor::default();
let mut pqueue = BinaryHeap::new();
for code1 in 0u16..(256u16 + self.n_symbols as u16) {
let symbol1 = self.symbols[code1 as usize];
Expand Down Expand Up @@ -186,13 +188,13 @@ impl Ord for Candidate {

#[cfg(test)]
mod test {
use crate::{train, ESCAPE_CODE};
use crate::{Compressor, ESCAPE_CODE};

#[test]
fn test_builder() {
// Train a SymbolTable on the toy string
// Train a Compressor on the toy string
let text = "hello world";
let table = train(text.as_bytes());
let table = Compressor::train(text.as_bytes());

// Use the table to compress a string, see the values
let compressed = table.compress(text.as_bytes());
Expand Down
4 changes: 2 additions & 2 deletions src/find_longest/naive.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use crate::find_longest::FindLongestSymbol;
use crate::SymbolTable;
use crate::Compressor;

// Find the code that maps to a symbol with longest-match to a piece of text.
//
// This is the naive algorithm that just scans the whole table and is very slow.

impl FindLongestSymbol for SymbolTable {
impl FindLongestSymbol for Compressor {
// NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles.
#[inline(never)]
fn find_longest_symbol(&self, text: &[u8]) -> u16 {
Expand Down
Loading

0 comments on commit 812a42d

Please sign in to comment.