From 7a1c4b92f6ff5f5546f3941faa36a6651981fb83 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 5 Dec 2023 12:27:42 +0100 Subject: [PATCH] stream 4cat files when importing 4cat datasets --- datasources/fourcat_import/import_4cat.py | 27 ++++++++++++++++------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index 01a05d94e..cd231b445 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -208,10 +208,8 @@ def process(self): self.halt_and_catch_fire() try: self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") - data = SearchImportFromFourcat.fetch_from_4cat(self.base, dataset_key, api_key, "data") datapath = new_dataset.get_results_path() - with datapath.open("wb") as outfile: - outfile.write(data.content) + data = SearchImportFromFourcat.fetch_from_4cat(self.base, dataset_key, api_key, "data", datapath) if not imported: # first dataset - use num rows as 'overall' @@ -293,7 +291,7 @@ def halt_and_catch_fire(self): raise ProcessorInterruptedException() @staticmethod - def fetch_from_4cat(base, dataset_key, api_key, component): + def fetch_from_4cat(base, dataset_key, api_key, component, datapath=None): """ Get dataset component from 4CAT export API @@ -304,10 +302,23 @@ def fetch_from_4cat(base, dataset_key, api_key, component): :return: HTTP response object """ try: - response = requests.get(f"{base}/api/export-packed-dataset/{dataset_key}/{component}/", timeout=5, headers={ - "User-Agent": "4cat/import", - "Authentication": api_key - }) + if component == "data" and datapath: + # Stream data + with requests.get(f"{base}/api/export-packed-dataset/{dataset_key}/{component}/", timeout=5, stream=True, + headers={ + "User-Agent": "4cat/import", + "Authentication": api_key + }) as r: + r.raise_for_status() + with datapath.open("wb") as outfile: + for chunk in r.iter_content(chunk_size=8192): + outfile.write(chunk) + return r + else: + response = requests.get(f"{base}/api/export-packed-dataset/{dataset_key}/{component}/", timeout=5, headers={ + "User-Agent": "4cat/import", + "Authentication": api_key + }) except requests.Timeout: raise FourcatImportException(f"The 4CAT server at {base} took too long to respond. Make sure it is " f"accessible to external connections and try again.")