Commit 3f0c671

Merge pull request #465 from PNNL-CompBio/figshare-api-fix
Package Download Function Fixed
2 parents c5d97ab + 4dd89a3 commit 3f0c671

File tree: 3 files changed, +105 -57 lines changed


coderdata/dataset.yml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
-figshare: https://api.figshare.com/v2/articles/29923646
-version: 2.2.0
+figshare: https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500
+version: 2.2.1
 datasets:
   beataml:
     description: Beat acute myeloid leukemia (BeatAML) focuses on acute myeloid leukemia
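Note: the updated figshare URL switches from the article endpoint to the files endpoint, which changes the shape of the JSON the downloader receives. A minimal sketch of the difference, using the article ID from this commit (assumes network access; the response shapes are the ones described in the downloader docstring below):

import requests

# Article endpoint (old style): a dict, with the file list under "files".
article = requests.get("https://api.figshare.com/v2/articles/29923646").json()
print(type(article))   # <class 'dict'>

# Files endpoint (new style): a JSON list of file records, served page by page.
files = requests.get(
    "https://api.figshare.com/v2/articles/29923646/files",
    params={"page": 1, "page_size": 500},
).json()
print(type(files))     # <class 'list'>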

coderdata/download/downloader.py

Lines changed: 96 additions & 53 deletions

@@ -7,8 +7,59 @@
 import os
 import requests
 import warnings
-
 import yaml
+from typing import Iterable, List, Dict, Any, Optional
+
+
+
+def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]:
+    """
+    Normalize Figshare API responses into a list of file dicts.
+
+    Supports:
+    1) Article endpoint: https://api.figshare.com/v2/articles/{id}
+       -> JSON object with key 'files' (list)
+
+    2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...]
+       -> JSON list of file objects (possibly paginated with Link headers)
+    """
+    data = resp.json()
+    if isinstance(data, dict) and "files" in data and isinstance(data["files"], list):
+        return data["files"]
+    if isinstance(data, list):
+        return data
+    raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' "
+                     "or a list of file objects.")
+
+
+def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]:
+    """
+    Iterate over all files, following 'Link: <...>; rel="next"' pagination if present.
+    Works for both the article endpoint (no pagination) and the files endpoint (may paginate).
+    """
+    sess = session or requests.Session()
+    next_url = url
+
+    while next_url:
+        resp = sess.get(next_url)
+        if resp.status_code != 200:
+            raise Exception(f"Failed to get dataset details from Figshare: {resp.text}")
+
+        for f in _gather_files_from_response(resp):
+            yield f
+
+        # RFC5988-style 'Link' header pagination
+        link = resp.headers.get("Link") or resp.headers.get("link")
+        next_url = None
+        if link:
+            parts = [p.strip() for p in link.split(",")]
+            for part in parts:
+                if 'rel="next"' in part:
+                    start = part.find("<") + 1
+                    end = part.find(">", start)
+                    if start > 0 and end > start:
+                        next_url = part[start:end]
+                    break
 
 def download(
     name: str='all',
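To see what the pagination loop above does with a multi-page listing, here is the same rel="next" extraction run against a sample Link header (the header value is invented for illustration; Figshare's actual header may differ):

# Hypothetical RFC5988-style Link header from a paginated files endpoint.
link = ('<https://api.figshare.com/v2/articles/29923646/files?page=2&page_size=500>; rel="next", '
        '<https://api.figshare.com/v2/articles/29923646/files?page=1&page_size=500>; rel="prev"')

next_url = None
for part in (p.strip() for p in link.split(",")):
    if 'rel="next"' in part:
        start = part.find("<") + 1
        end = part.find(">", start)
        if start > 0 and end > start:
            next_url = part[start:end]
        break

print(next_url)  # the page=2 URL; stays None once no rel="next" remains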
@@ -46,81 +97,73 @@ def download(
     local_path = Path(local_path)
 
     if not local_path.exists():
-        Path.mkdir(local_path)
+        local_path.mkdir(parents=True, exist_ok=True)
     # Get the dataset details
     with resources.open_text('coderdata', 'dataset.yml') as f:
         data_information = yaml.load(f, Loader=yaml.FullLoader)
     url = data_information['figshare']
-
-    response = requests.get(url)
-    if response.status_code != 200:
-        raise Exception(
-            f"Failed to get dataset details from Figshare: {response.text}"
-        )
-
-    data = response.json()
 
-    # making sure that we are case insensitive
-    name = name.casefold()
+    name = (name or "all").casefold()
+    session = requests.Session()
+    all_files = list(_iter_paginated_files(url, session=session))
 
-    # Filter files by the specified prefix
     if name != "all":
         filtered_files = [
-            file
-            for file
-            in data['files']
-            if file['name'].startswith(name) or 'genes' in file['name']
-        ]
+            f for f in all_files
+            if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold())
+        ]
     else:
-        filtered_files = data['files']
+        filtered_files = all_files
 
-    # Group files by name and select the one with the highest ID
     unique_files = {}
     for file in filtered_files:
-        file_name = local_path.joinpath(file['name'])
-        file_id = file['id']
-        if (
-            file_name not in unique_files
-            or file_id > unique_files[file_name]['id']
-        ):
-            unique_files[file_name] = {'file_info': file, 'id': file_id}
+        fname = file.get('name')
+        fid = file.get('id')
+        if fname is None or fid is None:
+            continue
+        file_name = local_path.joinpath(fname)
+        if (file_name not in unique_files) or (fid > unique_files[file_name]['id']):
+            unique_files[file_name] = {'file_info': file, 'id': fid}
 
     for file_name, file_data in unique_files.items():
         file_info = file_data['file_info']
         file_id = str(file_info['id'])
-        file_url = "https://api.figshare.com/v2/file/download/" + file_id
-        file_md5sum = file_info['supplied_md5']
+        file_url = f"https://api.figshare.com/v2/file/download/{file_id}"
+        file_md5sum = file_info.get('supplied_md5')
+
+        if file_name.exists() and not exist_ok:
+            warnings.warn(
+                f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file."
+            )
+
         retry_count = 10
-        # Download the file
         while retry_count > 0:
-            with requests.get(file_url, stream=True) as r:
+            with session.get(file_url, stream=True) as r:
                 r.raise_for_status()
-                if file_name.exists() and not exist_ok:
-                    warnings.warn(
-                        f"{file_name} already exists. Use argument 'exist_ok=True'"
-                        "to overwrite existing file."
-                    )
+                with open(file_name, 'wb') as f:
+                    for chunk in r.iter_content(chunk_size=8192):
+                        f.write(chunk)
+
+            if file_md5sum:
+                with open(file_name, 'rb') as f:
+                    check_md5sum = md5(f.read()).hexdigest()
+                if file_md5sum == check_md5sum:
+                    break
                 else:
-                    with open(file_name, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-                    with open(file_name, 'rb') as f:
-                        check_md5sum = md5(f.read()).hexdigest()
-                    if file_md5sum == check_md5sum:
+                    retry_count -= 1
+                    if retry_count > 0:
+                        warnings.warn(
+                            f"{file_name} failed MD5 verification "
+                            f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..."
+                        )
+                    else:
                         break
-                    elif retry_count > 0:
-                        warnings.warn(
-                            f"{file_name} could not be downloaded successfully. "
-                            f"(expected md5sum: {file_md5sum} - "
-                            f"calculated md5sum: {check_md5sum})... retrying..."
-                        )
-            retry_count = retry_count - 1
-        if retry_count == 0:
+
+        if retry_count == 0 and file_md5sum:
             warnings.warn(
-                f"{file_name} could not be downloaded. Try again."
-            )
+                f"{file_name} could not be downloaded with a matching MD5 after retries."
+            )
         else:
            print(f"Downloaded '{file_url}' to '{file_name}'")
 
-    return
 
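The rewritten loop keeps only the newest upload per file name (the record with the highest Figshare file id). A toy run of that rule, with invented records (real Figshare entries carry more fields):

from pathlib import Path

local_path = Path("data")
files = [
    {"name": "beataml_samples.csv", "id": 101},
    {"name": "beataml_samples.csv", "id": 205},  # later upload, same name
    {"name": "genes.csv", "id": 150},
]

unique_files = {}
for file in files:
    fname, fid = file.get("name"), file.get("id")
    if fname is None or fid is None:
        continue
    file_name = local_path.joinpath(fname)
    if (file_name not in unique_files) or (fid > unique_files[file_name]["id"]):
        unique_files[file_name] = {"file_info": file, "id": fid}

print({k.name: v["id"] for k, v in unique_files.items()})
# {'beataml_samples.csv': 205, 'genes.csv': 150}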

scripts/push_to_figshare.py

Lines changed: 7 additions & 2 deletions

@@ -197,7 +197,7 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
     # update dataset.yml
     with open("coderdata/dataset.yml", "r") as f:
         data = yaml.safe_load(f)
-    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}"
+    data["figshare"] = f"https://api.figshare.com/v2/articles/{article_id}/files?page=1&page_size=500"
     data["version"] = version
     with open('/tmp/dataset.yml', 'w') as f:
         yaml.safe_dump(data, f, sort_keys=False)

@@ -232,7 +232,12 @@ def write_figshare_details_to_yaml(article_id, project_id, title, version):
         remote_file_info = get_remote_file_info(article_id, file_name)
         if remote_file_info:
             local_md5, local_size = get_file_check_data(file_path)
-            if remote_file_info['size'] != local_size or remote_file_info['computed_md5'] != local_md5:
+            remote_md5 = (
+                remote_file_info.get('computed_md5')
+                or remote_file_info.get('md5')
+                or remote_file_info.get('supplied_md5')
+            )
+            if remote_file_info.get('size') != local_size or remote_md5 != local_md5:
                 print(f"Updating file {file_name} in Figshare...")
                 delete_existing_file(article_id, remote_file_info['id'])
                 file_info = initiate_new_upload(article_id, file_path)
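The remote_md5 fallback above takes the first MD5-like key present in the remote record, presumably because the key name varies across Figshare responses. A toy illustration (the record is invented):

remote_file_info = {"size": 1024, "supplied_md5": "d41d8cd98f00b204e9800998ecf8427e"}

remote_md5 = (
    remote_file_info.get("computed_md5")
    or remote_file_info.get("md5")
    or remote_file_info.get("supplied_md5")
)
print(remote_md5)  # no 'computed_md5' or 'md5' present, so 'supplied_md5' wins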
