Skip to content

Commit

Permalink
feat: port in more from the C++ code (#24)
Browse files Browse the repository at this point in the history
This PR ports in some more functionality based on the MIT-licensed C++
code from CWI.

In particular, it implements the following:

* The `makeSample` function from C++ to build a sample of ~16KB from the
input data
* The `suffix limit` optimization and its corresponding `finalize`
method needed when building the symbol table, including changes to our
`compress_word` function so that it more directly corresponds to
`compressVariant` from the C++ code
* The `byteCodes` from C++, which we implement here as `codes_one_byte`.
Note that before this PR, one-byte codes would not be found unless the
byte occurred at the end of the plaintext string
* Separates the `Compressor` build state into a new `CompressorBuilder`
struct, which has all methods that take `&mut self`. This also means
that we can in theory construct a `Compressor` now from a symbol table,
though that logic is not implemented.

Additional things in this PR:

* Added a micro benchmark for `compress_word` method comparing the
relative speeds of both code paths, see
#24 (comment)
* Removed many of the old small-data benchmarks. I've added several of
the `dbtext` compression benchmarks from the CWI paper. Here's a table
of the compression factors:

dbtext | c++ compress factor | fsst-rs compress factor
-------|-----|-------
l_comment | 2.73 | 2.69
urls | 2.33 | 2.27
wikipedia | 1.81 | 1.75

I'll follow up to figure out how to close the gap on those 1–2%
differences.
  • Loading branch information
a10y committed Sep 3, 2024
1 parent 38017d0 commit c944de6
Show file tree
Hide file tree
Showing 15 changed files with 1,204 additions and 505 deletions.
107 changes: 107 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ use_debug = { level = "deny" }

[dev-dependencies]
criterion = "0.5"
curl = "*"

[[example]]
name = "round_trip"
Expand All @@ -37,6 +38,10 @@ test = false
name = "compress"
harness = false

[[bench]]
name = "micro"
harness = false

[[test]]
name = "correctness"
test = true
Expand Down
1 change: 1 addition & 0 deletions benches/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/
150 changes: 108 additions & 42 deletions benches/compress.rs
Original file line number Diff line number Diff line change
@@ -1,56 +1,122 @@
//! Benchmarks for FSST compression, decompression, and symbol table training.
//!
//! We use the dbtext data at https://github.com/cwida/fsst/tree/master/paper/dbtext
#![allow(missing_docs)]
use core::str;
use std::{
error::Error,
fs::{self, DirBuilder, File},
io::{Read, Write},
path::Path,
};

use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};

use fsst::{Compressor, ESCAPE_CODE};
use curl::easy::Easy;
use fsst::Compressor;

const CORPUS: &str = include_str!("dracula.txt");
const TEST: &str = "I found my smattering of German very useful here";
fn download_dataset(url: &str, path: impl AsRef<Path>) -> Result<(), Box<dyn Error>> {
let target = path.as_ref();

fn bench_fsst(c: &mut Criterion) {
let mut group = c.benchmark_group("fsst");
group.bench_function("train", |b| {
let corpus = CORPUS.as_bytes();
b.iter(|| black_box(Compressor::train(black_box(corpus))));
});
let mut dir_builder = DirBuilder::new();
dir_builder.recursive(true);

let compressor = Compressor::train(CORPUS);
let plaintext = TEST.as_bytes();
dir_builder.create(target.parent().unwrap())?;

let compressed = compressor.compress(plaintext);
let escape_count = compressed.iter().filter(|b| **b == ESCAPE_CODE).count();
let ratio = (plaintext.len() as f64) / (compressed.len() as f64);
println!(
"Escapes = {escape_count}/{}, compression_ratio = {ratio}",
compressed.len()
// Avoid downloading the file twice.
if target.exists() {
return Ok(());
}

let mut handle = Easy::new();

let mut buffer = Vec::new();
handle.url(url)?;
{
let mut transfer = handle.transfer();
transfer.write_function(|data| {
buffer.extend_from_slice(data);

Ok(data.len())
})?;
transfer.perform()?;
}

let mut output = File::create(target)?;
match output.write_all(&buffer) {
Ok(()) => {}
Err(err) => {
// cleanup in case of failure
fs::remove_file(target).unwrap();

return Err(Box::new(err));
}
}

Ok(())
}

#[allow(clippy::use_debug)]
fn bench_dbtext(c: &mut Criterion) {
fn run_dataset_bench(name: &str, url: &str, path: &str, c: &mut Criterion) {
let mut group = c.benchmark_group(name);
download_dataset(url, path).unwrap();

let mut buf = Vec::new();
{
let mut file = File::open(path).unwrap();
file.read_to_end(&mut buf).unwrap();
}

group.bench_function("train-and-compress", |b| {
b.iter_with_large_drop(|| {
let compressor = Compressor::train(&vec![&buf]);
compressor.compress_bulk(std::hint::black_box(&vec![&buf]))
});
});

let compressor = Compressor::train(&vec![&buf]);
let mut buffer = Vec::with_capacity(200 * 1024 * 1024);
group.throughput(Throughput::Bytes(buf.len() as u64));
group.bench_function("compress-only", |b| {
b.iter(|| unsafe { compressor.compress_into(&buf, &mut buffer) });
});

group.finish();

// Report the compression factor for this dataset.
let uncompressed_size = buf.len();
let compressor = Compressor::train(&vec![&buf]);

let compressed = compressor.compress_bulk(&vec![&buf]);
let compressed_size = compressed.iter().map(|l| l.len()).sum::<usize>();
let cf = (uncompressed_size as f64) / (compressed_size as f64);
println!(
"compressed {name} {uncompressed_size} => {compressed_size}B (compression factor {cf:.2}:1)"
)
}

run_dataset_bench(
"dbtext/wikipedia",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/wikipedia",
"benches/data/wikipedia",
c,
);

run_dataset_bench(
"dbtext/l_comment",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/l_comment",
"benches/data/l_comment",
c,
);

let decompressor = compressor.decompressor();
let decompressed = decompressor.decompress(&compressed);
let decompressed = str::from_utf8(&decompressed).unwrap();

group.throughput(Throughput::Elements(1));
group.bench_function("compress-word", |b| {
let mut out = vec![0u8; 8];
let out_ptr = out.as_mut_ptr();
let front = &TEST.as_bytes()[0..8];
let word = u64::from_le_bytes(front.try_into().unwrap());

b.iter(|| black_box(unsafe { compressor.compress_word(word, out_ptr) }));
});

group.throughput(Throughput::Bytes(CORPUS.len() as u64));
group.bench_function("compress-single", |b| {
b.iter(|| black_box(compressor.compress(black_box(CORPUS.as_bytes()))));
});

group.throughput(Throughput::Bytes(decompressed.len() as u64));
group.bench_function("decompress-single", |b| {
b.iter(|| black_box(decompressor.decompress(black_box(&compressed))));
});
run_dataset_bench(
"dbtext/urls",
"https://raw.githubusercontent.com/cwida/fsst/4e188a/paper/dbtext/urls",
"benches/data/urls",
c,
);
}

criterion_group!(compress_bench, bench_fsst);
criterion_group!(compress_bench, bench_dbtext);
criterion_main!(compress_bench);
1 change: 0 additions & 1 deletion benches/dracula.txt

This file was deleted.

Loading

0 comments on commit c944de6

Please sign in to comment.