7 | 7 | import os |
8 | 8 | import requests |
9 | 9 | import warnings |
10 | | - |
11 | 10 | import yaml |
| 11 | +from typing import Iterable, List, Dict, Any, Optional |
| 12 | + |
| 13 | + |
| 14 | + |
| 15 | +def _gather_files_from_response(resp: requests.Response) -> List[Dict[str, Any]]: |
| 16 | + """ |
| 17 | + Normalize Figshare API responses into a list of file dicts. |
| 18 | +
| 19 | + Supports: |
| 20 | + 1) Article endpoint: https://api.figshare.com/v2/articles/{id} |
| 21 | + -> JSON object with key 'files' (list) |
| 22 | +
| 23 | + 2) Files endpoint: https://api.figshare.com/v2/articles/{id}/files[?...] |
| 24 | + -> JSON list of file objects (possibly paginated with Link headers) |
| 25 | + """ |
| 26 | + data = resp.json() |
| 27 | + if isinstance(data, dict) and "files" in data and isinstance(data["files"], list): |
| 28 | + return data["files"] |
| 29 | + if isinstance(data, list): |
| 30 | + return data |
| 31 | + raise ValueError("Unexpected Figshare API response structure; expected dict with 'files' " |
| 32 | + "or a list of file objects.") |
| 33 | + |
| 34 | + |
| 35 | +def _iter_paginated_files(url: str, session: Optional[requests.Session] = None) -> Iterable[Dict[str, Any]]: |
| 36 | + """ |
| 37 | + Iterate over all files, following 'Link: <...>; rel=\"next\"' pagination if present. |
| 38 | + Works for both the article endpoint (no pagination) and the files endpoint (may paginate). |
| 39 | + """ |
| 40 | + sess = session or requests.Session() |
| 41 | + next_url = url |
| 42 | + |
| 43 | + while next_url: |
| 44 | + resp = sess.get(next_url) |
| 45 | + if resp.status_code != 200: |
| 46 | + raise Exception(f"Failed to get dataset details from Figshare: {resp.text}") |
| 47 | + |
| 48 | + for f in _gather_files_from_response(resp): |
| 49 | + yield f |
| 50 | + |
| 51 | + # RFC5988-style 'Link' header pagination |
| 52 | + link = resp.headers.get("Link") or resp.headers.get("link") |
| 53 | + next_url = None |
| 54 | + if link: |
| 55 | + parts = [p.strip() for p in link.split(",")] |
| 56 | + for part in parts: |
| 57 | + if 'rel="next"' in part: |
| 58 | + start = part.find("<") + 1 |
| 59 | + end = part.find(">", start) |
| 60 | + if start > 0 and end > start: |
| 61 | + next_url = part[start:end] |
| 62 | + break |
12 | 63 |
13 | 64 | def download( |
14 | 65 | name: str='all', |
@@ -46,81 +97,73 @@ def download( |
46 | 97 | local_path = Path(local_path) |
47 | 98 |
48 | 99 | if not local_path.exists(): |
49 | | - Path.mkdir(local_path) |
| 100 | + local_path.mkdir(parents=True, exist_ok=True) |
50 | 101 | # Get the dataset details |
51 | 102 | with resources.open_text('coderdata', 'dataset.yml') as f: |
52 | 103 | data_information = yaml.load(f, Loader=yaml.FullLoader) |
53 | 104 | url = data_information['figshare'] |
54 | | - |
55 | | - response = requests.get(url) |
56 | | - if response.status_code != 200: |
57 | | - raise Exception( |
58 | | - f"Failed to get dataset details from Figshare: {response.text}" |
59 | | - ) |
60 | | - |
61 | | - data = response.json() |
62 | 105 |
63 | | - # making sure that we are case insensitive |
64 | | - name = name.casefold() |
| 106 | + name = (name or "all").casefold() |
| 107 | + session = requests.Session() |
| 108 | + all_files = list(_iter_paginated_files(url, session=session)) |
65 | 109 |
|
66 | | - # Filter files by the specified prefix |
67 | 110 | if name != "all": |
68 | 111 | filtered_files = [ |
69 | | - file |
70 | | - for file |
71 | | - in data['files'] |
72 | | - if file['name'].startswith(name) or 'genes' in file['name'] |
73 | | - ] |
| 112 | + f for f in all_files |
| 113 | + if (f.get('name', '').casefold().startswith(name)) or ('genes' in f.get('name', '').casefold()) |
| 114 | + ] |
74 | 115 | else: |
75 | | - filtered_files = data['files'] |
| 116 | + filtered_files = all_files |
76 | 117 |
77 | | - # Group files by name and select the one with the highest ID |
78 | 118 | unique_files = {} |
79 | 119 | for file in filtered_files: |
80 | | - file_name = local_path.joinpath(file['name']) |
81 | | - file_id = file['id'] |
82 | | - if ( |
83 | | - file_name not in unique_files |
84 | | - or file_id > unique_files[file_name]['id'] |
85 | | - ): |
86 | | - unique_files[file_name] = {'file_info': file, 'id': file_id} |
| 120 | + fname = file.get('name') |
| 121 | + fid = file.get('id') |
| 122 | + if fname is None or fid is None: |
| 123 | + continue |
| 124 | + file_name = local_path.joinpath(fname) |
| 125 | + if (file_name not in unique_files) or (fid > unique_files[file_name]['id']): |
| 126 | + unique_files[file_name] = {'file_info': file, 'id': fid} |
87 | 127 |
88 | 128 | for file_name, file_data in unique_files.items(): |
89 | 129 | file_info = file_data['file_info'] |
90 | 130 | file_id = str(file_info['id']) |
91 | | - file_url = "https://api.figshare.com/v2/file/download/" + file_id |
92 | | - file_md5sum = file_info['supplied_md5'] |
| 131 | + file_url = f"https://api.figshare.com/v2/file/download/{file_id}" |
| 132 | + file_md5sum = file_info.get('supplied_md5') |
| 133 | + |
| 134 | + if file_name.exists() and not exist_ok: |
| 135 | + warnings.warn( |
| 136 | + f"{file_name} already exists. Use argument 'exist_ok=True' to overwrite the existing file." |
| 137 | + ) |
| 138 | + continue  # skip download; keep the existing file |
93 | 139 | retry_count = 10 |
94 | | - # Download the file |
95 | 140 | while retry_count > 0: |
96 | | - with requests.get(file_url, stream=True) as r: |
| 141 | + with session.get(file_url, stream=True) as r: |
97 | 142 | r.raise_for_status() |
98 | | - if file_name.exists() and not exist_ok: |
99 | | - warnings.warn( |
100 | | - f"{file_name} already exists. Use argument 'exist_ok=True'" |
101 | | - "to overwrite existing file." |
102 | | - ) |
| 143 | + with open(file_name, 'wb') as f: |
| 144 | + for chunk in r.iter_content(chunk_size=8192): |
| 145 | + f.write(chunk) |
| 146 | + |
| 147 | + if file_md5sum:  # verify only when Figshare supplied an MD5 |
| 148 | + with open(file_name, 'rb') as f: |
| 149 | + check_md5sum = md5(f.read()).hexdigest() |
| 150 | + if file_md5sum == check_md5sum: |
| 151 | + break |
103 | 152 | else: |
104 | | - with open(file_name, 'wb') as f: |
105 | | - for chunk in r.iter_content(chunk_size=8192): |
106 | | - f.write(chunk) |
107 | | - with open(file_name, 'rb') as f: |
108 | | - check_md5sum = md5(f.read()).hexdigest() |
109 | | - if file_md5sum == check_md5sum: |
| 153 | + retry_count -= 1  # checksum mismatch; count this attempt |
| 154 | + if retry_count > 0: |
| 155 | + warnings.warn( |
| 156 | + f"{file_name} failed MD5 verification " |
| 157 | + f"(expected: {file_md5sum}, got: {check_md5sum}). Retrying..." |
| 158 | + ) |
| 159 | + else:  # no supplied MD5 to check against; accept the download |
110 | 160 | break |
111 | | - elif retry_count > 0: |
112 | | - warnings.warn( |
113 | | - f"{file_name} could not be downloaded successfully. " |
114 | | - f"(expected md5sum: {file_md5sum} - " |
115 | | - f"calculated md5sum: {check_md5sum})... retrying..." |
116 | | - ) |
117 | | - retry_count = retry_count - 1 |
118 | | - if retry_count == 0: |
| 161 | + |
| 162 | + if retry_count == 0 and file_md5sum: |
119 | 163 | warnings.warn( |
120 | | - f"{file_name} could not be downloaded. Try again." |
121 | | - ) |
| 164 | + f"{file_name} could not be downloaded with a matching MD5 after retries." |
| 165 | + ) |
122 | 166 | else: |
123 | 167 | print(f"Downloaded '{file_url}' to '{file_name}'") |
124 | 168 |
125 | | - return |
126 | 169 |
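For reference, the article endpoint (`/v2/articles/{id}`) returns a JSON object containing a `files` list, while the files endpoint (`/v2/articles/{id}/files`) returns a bare list of file objects and may paginate via an RFC 5988 `Link` header. The sketch below exercises the same rel="next" extraction in isolation; the payloads and header value are made up for illustration, not real Figshare responses.

```python
# Illustrative only: made-up payloads and header value, not real Figshare data.

# Shape 1: article endpoint (/v2/articles/{id}) -> dict with a 'files' list.
article_json = {"id": 123, "title": "example", "files": [{"id": 1, "name": "a.csv"}]}

# Shape 2: files endpoint (/v2/articles/{id}/files) -> bare list of file objects.
files_json = [{"id": 1, "name": "a.csv"}, {"id": 2, "name": "b.csv"}]

# rel="next" extraction, mirroring the loop in _iter_paginated_files.
link = ('<https://api.figshare.com/v2/articles/123/files?page=2>; rel="next", '
        '<https://api.figshare.com/v2/articles/123/files?page=1>; rel="prev"')

next_url = None
for part in (p.strip() for p in link.split(",")):
    if 'rel="next"' in part:
        start = part.find("<") + 1
        end = part.find(">", start)
        if start > 0 and end > start:
            next_url = part[start:end]
        break

print(next_url)  # -> https://api.figshare.com/v2/articles/123/files?page=2
```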
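For context, a call to the updated function might look like the following. The top-level import path and the dataset prefix are assumptions, since neither appears in this hunk; only the `name`, `local_path`, and `exist_ok` parameters are visible above.

```python
# Hypothetical usage; the import path and the prefix below are assumptions,
# not taken from this diff.
from coderdata import download

# Download files whose names start with the prefix (plus any 'genes' files)
# into ./data, overwriting copies that are already there.
download(name="beataml", local_path="data", exist_ok=True)
```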