forked from arXiv/arxiv-browse
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_test_article.py
82 lines (58 loc) · 2.91 KB
/
get_test_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""For an arXiv id, gets from the production bucket all related files.
Files such as abs, src and contents of the ps_cache and saves them in the test
directories.
TODO Sanitizes them of email addresses
"""
import os
import sys
import argparse
from pathlib import Path
from arxiv.identifier import Identifier
from google.cloud import storage
from browse.services import key_patterns
def get_article_for_test(bucket, save_base_dir: str, arxiv_id: Identifier):
    """Copy every bucket object related to ``arxiv_id`` into the test data tree.

    The current abs file is fetched as a single object. After that, three key
    prefixes are built from ``key_patterns`` parent paths joined with
    ``arxiv_id.filename``, and everything matching each prefix is fetched.

    Args:
        bucket: Bucket object to read from; handed unchanged to the helpers.
        save_base_dir: Local base directory the objects are written under.
        arxiv_id: Identifier of the article to fetch.
    """

    def fetch_all_under(parent_key) -> None:
        # A prefix listing picks up every object whose key starts with
        # "<parent_key>/<filename>".
        get_objs_matching_keyprefix(
            bucket, save_base_dir, f"{parent_key}/{arxiv_id.filename}"
        )

    # The current abs file is fetched by exact key; get_object_for_test raises
    # if the bucket does not hold it.
    get_object_for_test(
        bucket, save_base_dir, key_patterns.abs_path_current(arxiv_id)
    )

    # Other objects next to the current abs file.
    fetch_all_under(key_patterns.abs_path_current_parent(arxiv_id))
    # Objects under the "orig" parent path.
    fetch_all_under(key_patterns.abs_path_orig_parent(arxiv_id))
    # Objects under the 'pdf' part of the ps_cache.
    fetch_all_under(key_patterns._ps_cache_part('pdf', arxiv_id))
def get_objs_matching_keyprefix(bucket, save_base_dir: str, key_prefix: str,
                                max_results: int = 100) -> int:
    """Copy every bucket object whose key starts with ``key_prefix``.

    Args:
        bucket: Bucket object; its ``client`` is used to list the blobs.
        save_base_dir: Local base directory the objects are written under.
        key_prefix: Key prefix passed to the bucket listing.
        max_results: Upper bound on the number of blobs listed. Defaults to
            100, the value that used to be hard-coded.

    Returns:
        The number of objects actually downloaded. Objects that already exist
        locally are listed but not counted, because ``get_object_for_test``
        returns 0 for them.

    Raises:
        Exception: propagated from ``get_object_for_test`` if a listed object
            cannot be found in the bucket.
    """
    print(f"Trying to get all objects in gs://{bucket.name}/{key_prefix}* to {save_base_dir}/")
    # Materialize the listing so both the total and the copied count can be
    # reported below.
    blobs = list(bucket.client.list_blobs(bucket, prefix=key_prefix, max_results=max_results))
    count = sum(get_object_for_test(bucket, save_base_dir, blob.name)
                for blob in blobs)
    print(f"Items in gs://{bucket.name} is {len(blobs)} copied {count}")
    # The signature promises an int; previously the function fell off the end
    # and returned None.
    return count
def get_object_for_test(bucket, save_base_dir:str, key:str) -> int :
    """Download a single bucket object to ``save_base_dir/key``.

    Args:
        bucket: Bucket object providing ``name`` and ``blob(key)``.
        save_base_dir: Local base directory; ``key`` is appended to it.
        key: Key of the object inside the bucket.

    Returns:
        1 if the object was downloaded, 0 if the local file already existed
        and the download was skipped.

    Raises:
        Exception: if the object does not exist in the bucket. This is checked
            before looking at the local file.
    """
    print(f"trying to get gs://{bucket.name}/{key} to {save_base_dir}/{key}")

    remote = bucket.blob(key)
    if not remote.exists():
        raise Exception(f"Object {key} does not exist in bucket")

    destination = Path(save_base_dir) / key
    if not destination.exists():
        # Create the directory chain implied by the key before writing.
        destination.parent.mkdir(parents=True, exist_ok=True)
        remote.download_to_filename(destination)
        print(f"Successfully got gs://{bucket.name}/{key} to {save_base_dir}/{key}")
        return 1

    # An existing local copy is never overwritten.
    print(f"{key} exists locally, skipping")
    return 0
def sanitize_abs_file(abs_file: Path) -> None:
    """Placeholder for sanitizing an abs file; not implemented yet.

    The module docstring lists removing email addresses as a TODO. For now
    the function ignores ``abs_file`` and does nothing.
    """
    return None
if __name__ == '__main__':
    # Command line usage: <id> <save_base_dir> <bucket>
    # The module docstring doubles as the --help description.
    module_doc = sys.modules[__name__].__doc__
    cli = argparse.ArgumentParser(description=module_doc)

    # (name, type, help) for each positional argument, in command-line order.
    positional_args = (
        ('id', str, "id of article to get"),
        ('save_base_dir', Path, "directory to save data in ex ./tests/data/abs_files"),
        ('bucket', str, "GS bucket name. Do not prefix with gs://"),
    )
    for arg_name, arg_type, arg_help in positional_args:
        cli.add_argument(arg_name, type=arg_type, help=arg_help)
    args = cli.parse_args()

    bucket = storage.Client().bucket(args.bucket)
    # Fail early, before any download is attempted, if the bucket is missing.
    if not bucket.exists():
        raise Exception(f"GS bucket {bucket} does not exist.")

    article_id = Identifier(args.id)
    get_article_for_test(bucket, args.save_base_dir, article_id)