@@ -6,8 +6,10 @@ use dbsp::{
     Circuit, OrdIndexedZSet, OrdZSet, Stream,
 };
 use indicatif::{HumanBytes, ProgressBar, ProgressState, ProgressStyle};
+use reqwest::header::CONTENT_LENGTH;
 use std::{
     cmp::Reverse,
+    collections::HashMap,
     ffi::OsStr,
     fmt::{self, Debug},
     fs::{self, File, OpenOptions},
@@ -115,8 +117,37 @@ pub(crate) fn list_downloaded_benchmarks() {
 }
 
 pub(crate) fn list_datasets() {
+    let cache_file = Path::new(DATA_PATH).join("dataset_cache.json");
+    let dataset_sizes = if cache_file.exists() {
+        serde_json::from_reader(File::open(&cache_file).unwrap()).unwrap_or_default()
+    } else {
+        let mut sizes = HashMap::with_capacity(DataSet::DATASETS.len());
+
+        // TODO: Realistically we should be doing all of these requests in parallel but
+        // I don't feel like adding tokio as a direct dependency at the moment (it's
+        // already a transitive dependency so it doesn't *really* matter, I'm just lazy)
+        let client = reqwest::blocking::Client::new();
+        for dataset in DataSet::DATASETS {
+            if let Ok(response) = client.head(dataset.url).send() {
+                if let Some(length) = response.headers()[CONTENT_LENGTH]
+                    .to_str()
+                    .ok()
+                    .and_then(|len| len.parse::<u64>().ok())
+                {
+                    sizes.insert(dataset.name.to_owned(), length);
+                }
+            }
+        }
+
+        fs::create_dir_all(DATA_PATH).unwrap();
+        let cache_file = BufWriter::new(File::create(&cache_file).unwrap());
+        serde_json::to_writer_pretty(cache_file, &sizes).unwrap();
+
+        sizes
+    };
+
     let mut datasets = DataSet::DATASETS.to_vec();
-    datasets.sort_by_key(|dataset| dataset.scale);
+    datasets.sort_by_key(|dataset| (dataset.scale, dataset_sizes.get(dataset.name).copied()));
 
     let longest_name = datasets
         .iter()
@@ -126,12 +157,18 @@ pub(crate) fn list_datasets() {
 
     let mut stdout = io::stdout().lock();
     for dataset in datasets {
-        writeln!(
+        write!(
             stdout,
-            "{:<longest_name$} scale: {:?}",
+            "{:<longest_name$} scale: {:?} archive size: ",
             dataset.name, dataset.scale,
         )
         .unwrap();
+
+        if let Some(&length) = dataset_sizes.get(dataset.name) {
+            writeln!(stdout, "{}", HumanBytes(length)).unwrap();
+        } else {
+            writeln!(stdout, "???").unwrap();
+        }
     }
 
     stdout.flush().unwrap();
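
The TODO in this hunk notes that the Content-Length probes could run in parallel without taking on tokio as a direct dependency. A rough sketch of one way to do that with scoped threads and the existing `reqwest::blocking` client, not part of this commit; `fetch_dataset_sizes` is a hypothetical helper, while `DataSet::DATASETS`, `dataset.url`, and `dataset.name` come from this file:

// Sketch only: parallelizes the HEAD requests with std::thread::scope instead of tokio.
use reqwest::header::CONTENT_LENGTH;
use std::collections::HashMap;

fn fetch_dataset_sizes() -> HashMap<String, u64> {
    let client = reqwest::blocking::Client::new();

    // Scoped threads may borrow `client` directly, so no `Arc` is needed.
    std::thread::scope(|scope| {
        let handles: Vec<_> = DataSet::DATASETS
            .iter()
            .map(|dataset| {
                let client = &client;
                scope.spawn(move || {
                    // Mirror the sequential loop: HEAD the archive and parse Content-Length.
                    let response = client.head(dataset.url).send().ok()?;
                    let length = response
                        .headers()
                        .get(CONTENT_LENGTH)?
                        .to_str()
                        .ok()?
                        .parse::<u64>()
                        .ok()?;
                    Some((dataset.name.to_owned(), length))
                })
            })
            .collect();

        // Datasets whose request failed or had no Content-Length are simply skipped.
        handles
            .into_iter()
            .filter_map(|handle| handle.join().ok().flatten())
            .collect()
    })
}

The result is the same `HashMap<String, u64>` that the commit writes to `dataset_cache.json`, so the caching and sorting code above would not need to change.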
@@ -340,47 +377,6 @@ impl DataSet {
 
         Ok(data_path)
     }
-
-    // Urls are hosted with faster download speeds here:
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/cit-Patents.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/com-friendster.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-7_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_5-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_6-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_7-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_8-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-8_9-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_0-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_1-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_2-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_3-zf.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-9_4-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf10k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/datagen-sf3k-fb.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/dota-league.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-directed.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/example-undirected.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-22.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-23.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-24.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-25.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-26.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-27.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-28.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-29.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/graph500-30.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/kgs.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/twitter_mpi.tar.zst
-    // https://r2-public-worker.ldbc.workers.dev/graphalytics/wiki-Talk.tar.zst
 }
 
 macro_rules! datasets {
@@ -429,7 +425,7 @@ datasets! {
     DATAGEN_9_4 = "datagen-9_4-fb" @ XL,
 
     DATAGEN_SF3K = "datagen-sf3k-fb" @ XL,
-    // There's also datagen-sf10k-fb but it requires downloading 2 files
+    DATAGEN_SF10K = "datagen-sf10k-fb" @ XL,
 
     GRAPH_500_22 = "graph500-22" @ S,
     GRAPH_500_23 = "graph500-23" @ M,
@@ -439,7 +435,7 @@ datasets! {
     GRAPH_500_27 = "graph500-27" @ XL,
     GRAPH_500_28 = "graph500-28" @ XXL,
     GRAPH_500_29 = "graph500-29" @ XXL,
-    // There's also graph500-30 but it's massive and requires downloading 4 files
+    GRAPH_500_30 = "graph500-30" @ XXL,
 
     KGS = "kgs" @ XS,
     WIKI_TALK = "wiki-Talk" @ XXS,