feat: separate Compressor and Decompressor (#11)

As part of implementing the Vortex `FSSTArray`, we should separate the data needed for compression (symbol table, hash table, `codes_twobyte` vector) from the data needed for decompression (just the symbol table). This PR splits the previous `SymbolTable` type into two types: a `Compressor` and a `Decompressor<'a>`. A `Compressor` can be trained on a sample text, as before, but it no longer has decompression methods; those now live on the new `Decompressor` type. A `Decompressor` can be built either directly from a `Compressor` or by directly wrapping a `&[Symbol]`. A `Compressor` also allows slice access to its symbol table, e.g. for serialization.
spiraldb · Aug 16, 2024 · 812a42d · 812a42d
1 parent 7ca1002
commit 812a42d
Show file tree

Hide file tree

Showing 10 changed files with 168 additions and 110 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -7,6 +7,9 @@ license = "Apache-2.0"
 repository = "https://github.com/spiraldb/fsst"
 edition = "2021"
 
+[lib]
+name = "fsst"
+
 [lints.rust]
 warnings = "deny"
 missing_docs = "deny"

diff --git a/benches/compress.rs b/benches/compress.rs
@@ -8,7 +8,7 @@ use core::str;
 
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
-use fsst_rs::{train, ESCAPE_CODE};
+use fsst::{Compressor, ESCAPE_CODE};
 
 const CORPUS: &str = include_str!("dracula.txt");
 const TEST: &str = "I found my smattering of German very useful here";
@@ -17,31 +17,32 @@ fn bench_fsst(c: &mut Criterion) {
     let mut group = c.benchmark_group("fsst");
     group.bench_function("train", |b| {
         let corpus = CORPUS.as_bytes();
-        b.iter(|| black_box(train(black_box(corpus))));
+        b.iter(|| black_box(Compressor::train(black_box(corpus))));
     });
 
-    let table = train(CORPUS);
+    let compressor = Compressor::train(CORPUS);
     let plaintext = TEST.as_bytes();
 
-    let compressed = table.compress(plaintext);
+    let compressed = compressor.compress(plaintext);
     let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count();
     let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
     println!(
         "Escapes = {escape_count}/{}, compression_ratio = {ratio}",
         compressed.len()
     );
 
-    let decompressed = table.decompress(&compressed);
+    let decompressor = compressor.decompressor();
+    let decompressed = decompressor.decompress(&compressed);
     let decompressed = str::from_utf8(&decompressed).unwrap();
     println!("DECODED: {}", decompressed);
     assert_eq!(decompressed, TEST);
 
     group.bench_function("compress-single", |b| {
-        b.iter(|| black_box(table.compress(black_box(plaintext))));
+        b.iter(|| black_box(compressor.compress(black_box(plaintext))));
     });
 
     group.bench_function("decompress-single", |b| {
-        b.iter(|| black_box(table.decompress(black_box(&compressed))));
+        b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
     });
 }
 

diff --git a/examples/file_compressor.rs b/examples/file_compressor.rs
@@ -19,6 +19,8 @@ use std::{
     path::Path,
 };
 
+use fsst::Compressor;
+
 fn main() {
     let args: Vec<_> = std::env::args().skip(1).collect();
     assert!(args.len() >= 2, "args TRAINING and FILE must be provided");
@@ -33,7 +35,7 @@ fn main() {
     }
 
     println!("building the compressor from {train_path:?}...");
-    let compressor = fsst_rs::train(&train_bytes);
+    let compressor = Compressor::train(&train_bytes);
 
     println!("compressing blocks of {input_path:?} with compressor...");
 

diff --git a/examples/round_trip.rs b/examples/round_trip.rs
@@ -2,14 +2,16 @@
 
 use core::str;
 
+use fsst::Compressor;
+
 fn main() {
     // Train on a sample.
     let sample = "the quick brown fox jumped over the lazy dog";
-    let trained = fsst_rs::train(sample.as_bytes());
+    let trained = Compressor::train(sample.as_bytes());
     let compressed = trained.compress(sample.as_bytes());
     println!("compressed: {} => {}", sample.len(), compressed.len());
     // decompress now
-    let decode = trained.decompress(&compressed);
+    let decode = trained.decompressor().decompress(&compressed);
     let output = str::from_utf8(&decode).unwrap();
     println!(
         "decoded to the original: len={} text='{}'",

diff --git a/fuzz/fuzz_targets/fuzz_compress.rs b/fuzz/fuzz_targets/fuzz_compress.rs
@@ -3,8 +3,9 @@
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
-    let table = fsst_rs::train("the quick brown fox jumped over the lazy dog".as_bytes());
-    let compress = table.compress(data);
-    let decompress = table.decompress(&compress);
+    let compressor =
+        fsst::Compressor::train("the quick brown fox jumped over the lazy dog".as_bytes());
+    let compress = compressor.compress(data);
+    let decompress = compressor.decompressor().decompress(&compress);
     assert_eq!(&decompress, data);
 });
diff --git a/fuzz/fuzz_targets/fuzz_train.rs b/fuzz/fuzz_targets/fuzz_train.rs
@@ -3,5 +3,5 @@
 use libfuzzer_sys::fuzz_target;
 
 fuzz_target!(|data: &[u8]| {
-    let _ = fsst_rs::train(data);
+    let _ = fsst::Compressor::train(data);
 });
diff --git a/src/builder.rs b/src/builder.rs
@@ -1,4 +1,4 @@
-//! Functions and types used for building a [`SymbolTable`] from a corpus of text.
+//! Functions and types used for building a [`Compressor`] from a corpus of text.
 //!
 //! This module implements the logic from Algorithm 3 of the [FSST Paper].
 //!
@@ -8,7 +8,7 @@ use std::cmp::Ordering;
 use std::collections::BinaryHeap;
 
 use crate::find_longest::FindLongestSymbol;
-use crate::{Symbol, SymbolTable, MAX_CODE};
+use crate::{Compressor, Symbol, MAX_CODE};
 
 #[derive(Debug, Clone)]
 struct Counter {
@@ -53,31 +53,33 @@ impl Counter {
 /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
 pub const MAX_GENERATIONS: usize = 5;
 
-/// Build and train a `SymbolTable` from a sample corpus of text.
-///
-/// This function implements the generational algorithm described in the [FSST paper] Section
-/// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
-/// to merge symbols when doing so would yield better compression than leaving them unmerged. The
-/// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
-/// code).
-///
-/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
-pub fn train(corpus: impl AsRef<[u8]>) -> SymbolTable {
-    let mut table = SymbolTable::default();
-    // TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
-    let sample = corpus.as_ref();
-    if sample.is_empty() {
-        return table;
-    }
-    for _generation in 0..MAX_GENERATIONS {
-        let counter = table.compress_count(sample);
-        table = table.optimize(counter);
-    }
+impl Compressor {
+    /// Build and train a `Compressor` from a sample corpus of text.
+    ///
+    /// This function implements the generational algorithm described in the [FSST paper] Section
+    /// 4.3. Starting with an empty symbol table, it iteratively compresses the corpus, then attempts
+    /// to merge symbols when doing so would yield better compression than leaving them unmerged. The
+    /// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape
+    /// code).
+    ///
+    /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
+    pub fn train(corpus: impl AsRef<[u8]>) -> Self {
+        let mut compressor = Self::default();
+        // TODO(aduffy): handle truncating/sampling if corpus > requires sample size.
+        let sample = corpus.as_ref();
+        if sample.is_empty() {
+            return compressor;
+        }
+        for _generation in 0..MAX_GENERATIONS {
+            let counter = compressor.compress_count(sample);
+            compressor = compressor.optimize(counter);
+        }
 
-    table
+        compressor
+    }
 }
 
-impl SymbolTable {
+impl Compressor {
     /// Compress the text using the current symbol table. Count the code occurrences
     /// and code-pair occurrences to allow us to calculate apparent gain.
     fn compress_count(&self, sample: &[u8]) -> Counter {
@@ -101,7 +103,7 @@ impl SymbolTable {
     /// Using a set of counters and the existing set of symbols, build a new
     /// set of symbols/codes that optimizes the gain over the distribution in `counter`.
     fn optimize(&self, counters: Counter) -> Self {
-        let mut res = SymbolTable::default();
+        let mut res = Compressor::default();
         let mut pqueue = BinaryHeap::new();
         for code1 in 0u16..(256u16 + self.n_symbols as u16) {
             let symbol1 = self.symbols[code1 as usize];
@@ -186,13 +188,13 @@ impl Ord for Candidate {
 
 #[cfg(test)]
 mod test {
-    use crate::{train, ESCAPE_CODE};
+    use crate::{Compressor, ESCAPE_CODE};
 
     #[test]
     fn test_builder() {
-        // Train a SymbolTable on the toy string
+        // Train a Compressor on the toy string
         let text = "hello world";
-        let table = train(text.as_bytes());
+        let table = Compressor::train(text.as_bytes());
 
         // Use the table to compress a string, see the values
         let compressed = table.compress(text.as_bytes());

diff --git a/src/find_longest/naive.rs b/src/find_longest/naive.rs
@@ -1,11 +1,11 @@
 use crate::find_longest::FindLongestSymbol;
-use crate::SymbolTable;
+use crate::Compressor;
 
 // Find the code that maps to a symbol with longest-match to a piece of text.
 //
 // This is the naive algorithm that just scans the whole table and is very slow.
 
-impl FindLongestSymbol for SymbolTable {
+impl FindLongestSymbol for Compressor {
     // NOTE(aduffy): if you don't disable inlining, this function won't show up in profiles.
     #[inline(never)]
     fn find_longest_symbol(&self, text: &[u8]) -> u16 {