@@ -6,8 +6,10 @@ use dbsp::{
     Circuit, OrdIndexedZSet, OrdZSet, Stream,
 };
 use indicatif::{HumanBytes, ProgressBar, ProgressState, ProgressStyle};
+use reqwest::header::CONTENT_LENGTH;
 use std::{
     cmp::Reverse,
+    collections::HashMap,
     ffi::OsStr,
     fmt::{self, Debug},
     fs::{self, File, OpenOptions},
@@ -115,8 +117,37 @@ pub(crate) fn list_downloaded_benchmarks() {
 }
 
 pub(crate) fn list_datasets() {
+    let cache_file = Path::new(DATA_PATH).join("dataset_cache.json");
+    let dataset_sizes = if cache_file.exists() {
+        serde_json::from_reader(File::open(&cache_file).unwrap()).unwrap_or_default()
+    } else {
+        let mut sizes = HashMap::with_capacity(DataSet::DATASETS.len());
+
+        // TODO: Realistically we should be doing all of these requests in parallel but
+        // I don't feel like adding tokio as a direct dependency at the moment (it's
+        // already a transitive dependency so it doesn't *really* matter, I'm just lazy)
+        let client = reqwest::blocking::Client::new();
+        for dataset in DataSet::DATASETS {
+            if let Ok(response) = client.head(dataset.url).send() {
+                if let Some(length) = response.headers()[CONTENT_LENGTH]
+                    .to_str()
+                    .ok()
+                    .and_then(|len| len.parse::<u64>().ok())
+                {
+                    sizes.insert(dataset.name.to_owned(), length);
+                }
+            }
+        }
+
+        fs::create_dir_all(DATA_PATH).unwrap();
+        let cache_file = BufWriter::new(File::create(&cache_file).unwrap());
+        serde_json::to_writer_pretty(cache_file, &sizes).unwrap();
+
+        sizes
+    };
+
     let mut datasets = DataSet::DATASETS.to_vec();
-    datasets.sort_by_key(|dataset| dataset.scale);
+    datasets.sort_by_key(|dataset| (dataset.scale, dataset_sizes.get(dataset.name).copied()));
 
     let longest_name = datasets
         .iter()
@@ -126,12 +157,18 @@ pub(crate) fn list_datasets() {
 
     let mut stdout = io::stdout().lock();
     for dataset in datasets {
-        writeln!(
+        write!(
             stdout,
-            "{:<longest_name$} scale: {:?}",
+            "{:<longest_name$} scale: {:?} archive size: ",
             dataset.name, dataset.scale,
         )
         .unwrap();
+
+        if let Some(&length) = dataset_sizes.get(dataset.name) {
+            writeln!(stdout, "{}", HumanBytes(length)).unwrap();
+        } else {
+            writeln!(stdout, "???").unwrap();
+        }
     }
 
     stdout.flush().unwrap();
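
The TODO in this hunk notes that the Content-Length probes could run in parallel without taking on tokio as a direct dependency. A rough sketch of one way to do that with scoped threads and the existing `reqwest::blocking` client, not part of this commit; `fetch_dataset_sizes` is a hypothetical helper, while `DataSet::DATASETS`, `dataset.url`, and `dataset.name` come from this file:

// Sketch only: parallelizes the HEAD requests with std::thread::scope instead of tokio.
use reqwest::header::CONTENT_LENGTH;
use std::collections::HashMap;

fn fetch_dataset_sizes() -> HashMap<String, u64> {
    let client = reqwest::blocking::Client::new();

    // Scoped threads may borrow `client` directly, so no `Arc` is needed.
    std::thread::scope(|scope| {
        let handles: Vec<_> = DataSet::DATASETS
            .iter()
            .map(|dataset| {
                let client = &client;
                scope.spawn(move || {
                    // Mirror the sequential loop: HEAD the archive and parse Content-Length.
                    let response = client.head(dataset.url).send().ok()?;
                    let length = response
                        .headers()
                        .get(CONTENT_LENGTH)?
                        .to_str()
                        .ok()?
                        .parse::<u64>()
                        .ok()?;
                    Some((dataset.name.to_owned(), length))
                })
            })
            .collect();

        // Datasets whose request failed or had no Content-Length are simply skipped.
        handles
            .into_iter()
            .filter_map(|handle| handle.join().ok().flatten())
            .collect()
    })
}

The result is the same `HashMap<String, u64>` that the commit writes to `dataset_cache.json`, so the caching and sorting code above would not need to change.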
@@ -340,47 +377,6 @@ impl DataSet {
 
         Ok(data_path)
     }
-
-    // Urls are hosted with faster download speeds here:
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/cit-Patents.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/com-friendster.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf10k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf3k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/dota-league.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-directed.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-undirected.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-22.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-23.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-24.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-25.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-26.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-27.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-28.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-29.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-30.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/kgs.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/twitter_mpi.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/wiki-Talk.tar.zst
 }
 
 macro_rules! datasets {
@@ -429,7 +425,7 @@ datasets! {
     DATAGEN_9_4 = "datagen-9_4-fb" @ XL,
 
     DATAGEN_SF3K = "datagen-sf3k-fb" @ XL,
-    // There's also datagen-sf10k-fb but it requires downloading 2 files
+    DATAGEN_SF10K = "datagen-sf10k-fb" @ XL,
 
     GRAPH_500_22 = "graph500-22" @ S,
     GRAPH_500_23 = "graph500-23" @ M,
@@ -439,7 +435,7 @@ datasets! {
     GRAPH_500_27 = "graph500-27" @ XL,
     GRAPH_500_28 = "graph500-28" @ XXL,
     GRAPH_500_29 = "graph500-29" @ XXL,
-    // There's also graph500-30 but it's massive and requires downloading 4 files
+    GRAPH_500_30 = "graph500-30" @ XXL,
 
     KGS = "kgs" @ XS,
     WIKI_TALK = "wiki-Talk" @ XXS,