Skip to content

Commit

Permalink
added timeout error handling and removed Session
Browse files Browse the repository at this point in the history
  • Loading branch information
ChristopherMancuso committed Jan 30, 2025
1 parent 7336197 commit 8d709f9
Showing 1 changed file with 51 additions and 51 deletions.
102 changes: 51 additions & 51 deletions geneplexus/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,12 @@
import shutil
import tarfile
import time
from concurrent.futures import ThreadPoolExecutor
from itertools import repeat
from threading import local
import requests
from typing import List
from typing import Tuple
from typing import Union
from urllib.parse import urljoin
from zipfile import ZipFile

import requests
from requests.sessions import Session

from . import util
# from . import util
from ._config import logger
from ._config.config import ALL_SPECIES
from ._config.config import LOG_LEVEL_TYPE
Expand All @@ -29,14 +22,12 @@
from ._config.logger_util import stream_level_context
from .exception import DownloadError

thread_local = local()


def download_select_data(
data_dir: str,
species: SPECIES_SELECTION_TYPE = "All",
data_loc: str = "Zenodo",
retry: bool = True,
data_loc: str = "ZenodoAPI",
num_retries: int = MAX_RETRY,
log_level: LOG_LEVEL_TYPE = "INFO",
):
"""Select species of data to download.
Expand All @@ -46,26 +37,28 @@ def download_select_data(
species: Species of interest, accept multiple selection as a
list. Do all the species if set to "All".
data_loc: the remote system where to look for the data
retry: If set to True, then retry downloading any missing file.
num_retries: Number of times to retry downloading a file.
log_level: Level to set the logger
"""
species = _get_species_list(species)
with stream_level_context(logger, log_level):
for aspecies in species:
if not _check_all_files(data_dir, aspecies):
if data_loc == "Zenodo":
if data_loc in ["Zenodo", "ZenodoAPI"]:
logger.warning(
f"Downloading {aspecies} data from Zenodo. This should take ~2 "
"minutes per species but can vary greatly depending on download speeds. "
"If Zenodo download is hanging for > 5 minutes, it might be best to stop "
"and restart the PyGenePlexus download function.",
"If Zenodo download is hanging for > 5 minutes per attempt, it might be best "
"to stop and restart the PyGenePlexus download function.",
)
log_path = osp.join(data_dir, "download.log")
logger.info(f"Start downloading data for {aspecies} and saving to: {data_dir}")
fn_download = f"{aspecies}_data.tar.gz"
if data_loc == "ZenodoAPI":
fn_download = f"{fn_download}/content"
with file_handler_context(logger, log_path, "DEBUG"):
_download_and_extract(data_dir, aspecies, fn_download, data_loc, retry)
_download_and_extract(data_dir, aspecies, fn_download, data_loc, num_retries)
logger.info("Download completed.")
else:
logger.warning(
Expand Down Expand Up @@ -110,69 +103,76 @@ def _check_all_files(
return True


def _download_and_extract(data_dir, file_cat, fn_download, data_loc, retry):
session = requests.Session()
def _download_and_extract(data_dir, file_cat, fn_download, data_loc, num_retries):
url = urljoin(URL_DICT[data_loc], fn_download)
num_tries = 0
while num_tries <= MAX_RETRY:
while num_tries <= num_retries - 1:
num_tries += 1
with session.get(url) as r:
if r.ok:
logger.debug(f"Response ok ({r!r}): {url=}")
with tarfile.open(fileobj=io.BytesIO(r.content), mode="r:gz") as tf:
for member in tf.getmembers():
member.name = os.path.basename(member.name)
tf.extract(member, data_dir)
logger.info(f"Downloaded {member.name}")
try:
shutil.rmtree(osp.join(data_dir, f"{file_cat}_data"))
except FileNotFoundError:
pass
if _check_all_files(data_dir, file_cat):
break
logger.info(f"On attempt {num_tries} of downloading the file")
try:
with requests.get(url, timeout = 2) as r:
if r.ok:
logger.debug(f"Response ok ({r!r}): {url=}")
with tarfile.open(fileobj=io.BytesIO(r.content), mode="r:gz") as tf:
for member in tf.getmembers():
member.name = os.path.basename(member.name)
tf.extract(member, data_dir)
logger.info(f"Downloaded {member.name}")
try:
shutil.rmtree(osp.join(data_dir, f"{file_cat}_data"))
except FileNotFoundError:
pass
if _check_all_files(data_dir, file_cat):
break
else:
logger.warning(f"Not all files downloaded, trying again")
elif r.status_code == 429: # Retry later
t = r.headers["Retry-after"]
logger.warning(f"Too many requests, waiting for {t} sec")
time.sleep(int(t))
continue
else:
logger.warning(f"Not all files downloaded, trying again")
elif r.status_code == 429: # Retry later
t = r.headers["Retry-after"]
logger.warning(f"Too many requests, waiting for {t} sec")
time.sleep(int(t))
continue
else:
raise requests.exceptions.RequestException(r, url)
logger.info("An unknown error occured")
continue
except:
print("An error occured during download (probably a connection timeout)")
continue
logger.critical("Session context closed, this should never happen!")
else:
raise DownloadError(f"Failed to download from {url} ({MAX_RETRY=})")
raise DownloadError(f"Failed to download from {url} ({num_retries=})")


def download_pytest_data(
data_dir: str,
data_loc: str = "Zenodo",
retry: bool = True,
data_loc: str = "ZenodoAPI",
num_retries: int = MAX_RETRY,
log_level: LOG_LEVEL_TYPE = "INFO",
):
"""Download data for pytests.
Args:
data_dir: Location of data files.
data_loc: the remote system where to look for the data
retry: If set to True, then retry downloading any missing file.
num_retries: Number of times to retry downloading a file.
log_level: Level to set the logger
"""
with stream_level_context(logger, log_level):
if not _check_all_files(data_dir, "pytest"):
if data_loc == "Zenodo":
if data_loc in ["Zenodo", "ZenodoAPI"]:
logger.warning(
f"Downloading pytest data from Zenodo. This should take ~2 "
"minutes but can vary greatly depending on download speeds. "
"If Zenodo download is hanging for > 5 minutes, it might be best to stop "
"and restart the PyGenePlexus download function.",
"If Zenodo download is hanging for > 5 minutes per attempt, it might be best "
"to stop and restart the PyGenePlexus download function.",
)
log_path = osp.join(data_dir, "download.log")
logger.info(f"Start downloading pytest data and saving to: {data_dir}")
fn_download = "pytest_data.tar.gz"
if data_loc == "ZenodoAPI":
fn_download = f"{fn_download}/content"
with file_handler_context(logger, log_path, "DEBUG"):
_download_and_extract(data_dir, "pytest", fn_download, data_loc, retry)
_download_and_extract(data_dir, "pytest", fn_download, data_loc, num_retries)
logger.info("Download completed.")
else:
logger.warning(
Expand Down

0 comments on commit 8d709f9

Please sign in to comment.