Skip to content

Commit

Permalink
feat: port in more from the C++ code (#24)
Browse files Browse the repository at this point in the history
This PR ports in some more functionality based on the MIT-licensed C++
code from CWI.

In particular, it implements the following:

* The `makeSample` function from C++ to build a sample of ~16KB from the
input data
* The `suffix limit` optimization and its corresponding `finalize`
method needed when building the symbol table, including changes to our
`compress_word` function so that it more directly corresponds to
`compressVariant` from the C++ code
* The `byteCodes` from C++, which we implement here as `codes_one_byte`.
Note that before this PR, one-byte codes would not be found unless the
byte occurred at the end of the plaintext string
* Separates the `Compressor` build state into a new `CompressorBuilder`
struct, which has all methods that take `&mut self`. This also means
that we can in theory construct a `Compressor` now from a symbol table,
though that logic is not implemented.

Additional things in this PR:

* Added a micro benchmark for `compress_word` method comparing the
relative speeds of both code paths, see
#24 (comment)
* Removed many of the old small-data benchmarks. I've added several of
the `dbtext` compression benchmarks from the CWI paper. Here's a table
of the compression factors:

dbtext | c++ compress factor | fsst-rs compress factor
-------|-----|-------
l_comment | 2.73 | 2.69
urls | 2.33 | 2.27
wikipedia | 1.81 | 1.75

I'll follow up to figure out how to close the gap on those 1–2%
differences.
  • Loading branch information
a10y committed Sep 3, 2024
1 parent 38017d0 commit c944de6
Show file tree
Hide file tree
Showing 15 changed files with 1,204 additions and 505 deletions.
107 changes: 107 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use_debug = { level = "deny" }

[dev-dependencies]
criterion = "0.5"
curl = "*"

[[example]]
name = "round_trip"
Expand All @@ -37,6 +38,10 @@ test = false
name = "compress"
harness = false

[[bench]]
name = "micro"
harness = false

[[test]]
name = "correctness"
test = true
Expand Down
1 change: 1 addition & 0 deletions benches/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/
150 changes: 108 additions & 42 deletions benches/compress.rs
Original file line number Diff line number Diff line change
@@ -1,56 +1,122 @@
//! Benchmarks for FSST compression, decompression, and symbol table training.
//!
//! We use the dbtext data at https://github.com/cwida/fsst/tree/master/paper/dbtext
#![allow(missing_docs)]
use core::str;
use std::{
error::Error,
fs::{self, DirBuilder, File},
io::{Read, Write},
path::Path,
};

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};

use fsst::{Compressor, ESCAPE_CODE};
use curl::easy::Easy;
use fsst::Compressor;

const CORPUS: &str = include_str!("dracula.txt");
const TEST: &str = "I found my smattering of German very useful here";
fn download_dataset(url: &str, path: impl AsRef<Path>) -> Result<(), Box<dyn Error>> {
let target = path.as_ref();

fn bench_fsst(c: &mut Criterion) {
let mut group = c.benchmark_group("fsst");
group.bench_function("train", |b| {
let corpus = CORPUS.as_bytes();
b.iter(|| black_box(Compressor::train(black_box(corpus))));
});
let mut dir_builder = DirBuilder::new();
dir_builder.recursive(true);

let compressor = Compressor::train(CORPUS);
let plaintext = TEST.as_bytes();
dir_builder.create(target.parent().unwrap())?;

let compressed = compressor.compress(plaintext);
let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count();
let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
println!(
"Escapes = {escape_count}/{}, compression_ratio = {ratio}",
compressed.len()
// Avoid downloading the file twice.
if target.exists() {
return Ok(());
}

let mut handle = Easy::new();

let mut buffer = Vec::new();
handle.url(url)?;
{
let mut transfer = handle.transfer();
transfer.write_function(|data| {
buffer.extend_from_slice(data);

Ok(data.len())
})?;
transfer.perform()?;
}

let mut output = File::create(target)?;
match output.write_all(&buffer) {
Ok(()) => {}
Err(err) => {
// cleanup in case of failure
fs::remove_file(target).unwrap();

return Err(Box::new(err));
}
}

Ok(())
}

#[allow(clippy::use_debug)]
fn bench_dbtext(c: &mut Criterion) {
fn run_dataset_bench(name: &str, url: &str, path: &str, c: &mut Criterion) {
let mut group = c.benchmark_group(name);
download_dataset(url, path).unwrap();

let mut buf = Vec::new();
{
let mut file = File::open(path).unwrap();
file.read_to_end(&mut buf).unwrap();
}

group.bench_function("train-and-compress", |b| {
b.iter_with_large_drop(|| {
let compressor = Compressor::train(&vec![&buf]);
compressor.compress_bulk(std::hint::black_box(&vec![&buf]))
});
});

let compressor = Compressor::train(&vec![&buf]);
let mut buffer = Vec::with_capacity(200 * 1024 * 1024);
group.throughput(Throughput::Bytes(buf.len() as u64));
group.bench_function("compress-only", |b| {
b.iter(|| unsafe { compressor.compress_into(&buf, &mut buffer) });
});

group.finish();

// Report the compression factor for this dataset.
let uncompressed_size = buf.len();
let compressor = Compressor::train(&vec![&buf]);

let compressed = compressor.compress_bulk(&vec![&buf]);
let compressed_size = compressed.iter().map(|l| l.len()).sum::<usize>();
let cf = (uncompressed_size as f64) / (compressed_size as f64);
println!(
"compressed {name} {uncompressed_size} => {compressed_size}B (compression factor {cf:.2}:1)"
)
}

run_dataset_bench(
"dbtext/wikipedia",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/wikipedia",
"benches/data/wikipedia",
c,
);

run_dataset_bench(
"dbtext/l_comment",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/l_comment",
"benches/data/l_comment",
c,
);

let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();

group.throughput(Throughput::Elements(1));
group.bench_function("compress-word", |b| {
let mut out = vec![0u8; 8];
let out_ptr = out.as_mut_ptr();
let front = &TEST.as_bytes()[0..8];
let word = u64::from_le_bytes(front.try_into().unwrap());

b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
});

group.throughput(Throughput::Bytes(CORPUS.len() as u64));
group.bench_function("compress-single", |b| {
b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes()))));
});

group.throughput(Throughput::Bytes(decompressed.len() as u64));
group.bench_function("decompress-single", |b| {
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
run_dataset_bench(
"dbtext/urls",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/urls",
"benches/data/urls",
c,
);
}

criterion_group!(compress_bench, bench_fsst);
criterion_group!(compress_bench, bench_dbtext);
criterion_main!(compress_bench);
1 change: 0 additions & 1 deletion benches/dracula.txt

This file was deleted.

Loading

0 comments on commit c944de6

Please sign in to comment.