Benchmark this implementation against the standard library's validator

milkey-mouse · milkey-mouse · commit 1496561069f8 · 2020-06-29T21:42:56.000-07:00
Two sets of benchmarks are now run: one with this validator and one
with std::str::from_utf8 (a thin, inlined wrapper around the internal
run_utf8_validation function). This commit also changes the benchmark
to use functions instead of macros, slightly improving readability.
The new benchmark functions no longer pin the benchmark thread to a
core via core_affinity, which the removed macros did.
diff --git a/Cargo.toml b/Cargo.toml
@@ -15,4 +15,4 @@ core_affinity = "*"
 
 [[bench]]
 name = "criterion_bench"
-harness = false
+harness = false
diff --git a/benches/criterion_bench.rs b/benches/criterion_bench.rs
@@ -6,119 +6,46 @@ use mimalloc::MiMalloc;
 #[global_allocator]
 static GLOBAL: MiMalloc = MiMalloc;
 
-use criterion::{BatchSize, Criterion, ParameterizedBenchmark, Throughput};
-use std::fs::File;
-use std::io::Read;
-
-macro_rules! bench_file {
-    ($name:ident) => {
-        fn $name(c: &mut Criterion) {
-            let core_ids = core_affinity::get_core_ids().unwrap();
-            core_affinity::set_for_current(core_ids[0]);
-
-            let mut vec = Vec::new();
-            File::open(concat!("data/", stringify!($name), ".data"))
-                .unwrap()
-                .read_to_end(&mut vec)
-                .unwrap();
-
-            let b = ParameterizedBenchmark::new(
-                "faster_utf8_validator",
-                |b, data| {
-                    b.iter_batched(
-                        || data,
-                        |bytes| {
-                            assert!(faster_utf8_validator::validate(&bytes));
-                        },
-                        BatchSize::SmallInput,
-                    )
-                },
-                vec![vec],
-            );
-            c.bench(
-                stringify!($name),
-                b.throughput(|data| Throughput::Bytes(data.len() as u64)),
-            );
-        }
-    };
+use criterion::{measurement::Measurement, Criterion, Throughput};
+use std::{fs, str};
+
+fn bench_file<T: Measurement>(c: &mut Criterion<T>, name: &str, is_valid: bool) {
+    let buf = fs::read(format!("data/{}.data", name)).unwrap();
+
+    let mut group = c.benchmark_group(name);
+    group.throughput(Throughput::Bytes(buf.len() as u64));
+    group.bench_function("std_utf8_validator", |b| {
+        b.iter(|| assert!(str::from_utf8(&buf).is_ok() == is_valid))
+    });
+    group.bench_function("faster_utf8_validator", |b| {
+        b.iter(|| assert!(faster_utf8_validator::validate(&buf) == is_valid))
+    });
+
+    group.finish();
 }
 
-macro_rules! bench_file_bad {
-    ($name:ident) => {
-        fn $name(c: &mut Criterion) {
-            let core_ids = core_affinity::get_core_ids().unwrap();
-            core_affinity::set_for_current(core_ids[0]);
-
-            let mut vec = Vec::new();
-            File::open(concat!("data/", stringify!($name), ".data"))
-                .unwrap()
-                .read_to_end(&mut vec)
-                .unwrap();
-
-            let b = ParameterizedBenchmark::new(
-                "faster_utf8_validator",
-                |b, data| {
-                    b.iter_batched(
-                        || data,
-                        |bytes| {
-                            assert!(!faster_utf8_validator::validate(&bytes));
-                        },
-                        BatchSize::SmallInput,
-                    )
-                },
-                vec![vec],
-            );
-            c.bench(
-                stringify!($name),
-                b.throughput(|data| Throughput::Bytes(data.len() as u64)),
-            );
-        }
-    };
+fn bench_all<T: Measurement>(c: &mut Criterion<T>) {
+    bench_file(c, "apache_builds", true);
+    bench_file(c, "canada", true);
+    bench_file(c, "citm_catalog", true);
+    bench_file(c, "github_events", true);
+    bench_file(c, "gsoc_2018", true);
+    bench_file(c, "instruments", true);
+    bench_file(c, "log", true);
+    bench_file(c, "marine_ik", true);
+    bench_file(c, "mesh", true);
+    bench_file(c, "numbers", true);
+    bench_file(c, "random", true);
+    bench_file(c, "twitterescaped", true);
+    bench_file(c, "twitter", true);
+    bench_file(c, "update_center", true);
+    bench_file(c, "mostly_ascii_sample_ok", true);
+    bench_file(c, "random_bytes", false);
+    bench_file(c, "utf8_characters_0_0x10ffff", true);
+    bench_file(c, "utf8_characters_0_0x10ffff_with_garbage", false);
+    bench_file(c, "utf8_sample_ok", true);
+    bench_file(c, "ascii_sample_ok", true);
 }
 
-bench_file!(apache_builds);
-bench_file!(canada);
-bench_file!(citm_catalog);
-bench_file!(github_events);
-bench_file!(gsoc_2018);
-bench_file!(instruments);
-bench_file!(log);
-bench_file!(marine_ik);
-bench_file!(mesh);
-bench_file!(numbers);
-bench_file!(random);
-bench_file!(twitterescaped);
-bench_file!(twitter);
-bench_file!(update_center);
-bench_file!(mostly_ascii_sample_ok);
-bench_file_bad!(random_bytes);
-bench_file!(utf8_characters_0_0x10ffff);
-bench_file_bad!(utf8_characters_0_0x10ffff_with_garbage);
-bench_file!(utf8_sample_ok);
-bench_file!(ascii_sample_ok);
-
-criterion_group!(
-    benches,
-    mostly_ascii_sample_ok,
-    ascii_sample_ok,
-    random_bytes,
-    utf8_characters_0_0x10ffff,
-    utf8_characters_0_0x10ffff_with_garbage,
-    utf8_sample_ok,
-    apache_builds,
-    canada,
-    citm_catalog,
-    github_events,
-    gsoc_2018,
-    instruments,
-    log,
-    marine_ik,
-    mesh,
-    numbers,
-    random,
-    twitterescaped,
-    twitter,
-    update_center
-);
-
+criterion_group!(benches, bench_all);
 criterion_main!(benches);