diff --git a/backend/lib/processor.py b/backend/lib/processor.py
index 3a47e5634..3aad595c7 100644
--- a/backend/lib/processor.py
+++ b/backend/lib/processor.py
@@ -538,6 +538,7 @@ def unpack_archive_contents(self, path, staging_area=None):
         :param Path staging_area:  Where to store the files while they're
             being worked with. If omitted, a temporary folder is created and
             deleted after use
+        :param int max_number_files:  Maximum number of files to unpack. If None, all files unpacked
         :return Path:  A path to the staging area
         """

@@ -565,6 +566,32 @@ def unpack_archive_contents(self, path, staging_area=None):

         return staging_area

+    def extract_archived_file_by_name(self, filename, archive_path, staging_area=None):
+        """
+        Extract a file from an archive by name
+
+        :param str filename:  Name of file to extract
+        :param Path archive_path:  Path to zip file to read
+        :param Path staging_area:  Where to store the files while they're
+            being worked with. If omitted, a temporary folder is created
+        :return Path:  A path to the extracted file
+        """
+        if not archive_path.exists():
+            return
+
+        if not staging_area:
+            staging_area = self.dataset.get_staging_area()
+
+        if not staging_area.exists() or not staging_area.is_dir():
+            raise RuntimeError("Staging area %s is not a valid folder" % staging_area)
+
+        with zipfile.ZipFile(archive_path, "r") as archive_file:
+            if filename not in archive_file.namelist():
+                raise KeyError("File %s not found in archive %s" % (filename, archive_path))
+            else:
+                archive_file.extract(filename, staging_area)
+
+        return staging_area.joinpath(filename)
+
     def write_csv_items_and_finish(self, data):
         """
         Write data as csv to results file and finish dataset
diff --git a/common/lib/dmi_service_manager.py b/common/lib/dmi_service_manager.py
index 01c79cfe9..091bb2dd8 100644
--- a/common/lib/dmi_service_manager.py
+++ b/common/lib/dmi_service_manager.py
@@ -230,7 +230,7 @@ def request_folder_files(self, folder_name):
             except requests.exceptions.ConnectionError as e:
                 retries += 1
                 if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading files from: {folder_name}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
                 continue

         # Check if 4CAT has access to this server
@@ -353,7 +353,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir,
             except requests.exceptions.ConnectionError as e:
                 retries += 1
                 if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading file: {filename}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {filename}")
                 continue
             files_downloaded += 1
             if files_downloaded % 1000 == 0:
diff --git a/processors/machine-learning/text_from_image.py b/processors/machine-learning/text_from_image.py
index e25551e09..b1a42c8b0 100644
--- a/processors/machine-learning/text_from_image.py
+++ b/processors/machine-learning/text_from_image.py
@@ -105,13 +105,29 @@ def process(self):
         # Unpack the images into a staging_area
         self.dataset.update_status("Unzipping images")
-        staging_area = self.unpack_archive_contents(self.source_file)
-
-        # Collect filenames (skip .json metadata files)
-        image_filenames = [filename for filename in os.listdir(staging_area) if
-                           filename.split('.')[-1] not in ["json", "log"]]
         if int(self.parameters.get("amount", 100)) != 0:
-            image_filenames = image_filenames[:int(self.parameters.get("amount", 100))]
+            max_images = int(self.parameters.get("amount", 100))
+        else:
+            max_images = None
+
+        staging_area = self.dataset.get_staging_area()
+        # Collect filenames and metadata
+        image_filenames = []
+        metadata_file = None
+        for image in self.iterate_archive_contents(self.source_file, staging_area=staging_area, immediately_delete=False):
+            if self.interrupted:
+                raise ProcessorInterruptedException("Interrupted while unzipping images")
+
+            if image.name.split('.')[-1] not in ["json", "log"]:
+                image_filenames.append(image.name)
+
+            if image.name == ".metadata.json":
+                metadata_file = image.name
+
+            if max_images and len(image_filenames) >= max_images:
+                break
+
         total_image_files = len(image_filenames)

         # Make output dir
@@ -158,12 +174,21 @@ def process(self):

         # Load the metadata from the archive
         image_metadata = {}
-        with open(os.path.join(staging_area, '.metadata.json')) as file:
-            image_data = json.load(file)
-            for url, data in image_data.items():
-                if data.get('success'):
-                    data.update({"url": url})
-                    image_metadata[data['filename']] = data
+        metadata_success = metadata_file is not None
+        if not metadata_success:
+            try:
+                self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area)
+                metadata_success = True
+            except KeyError:
+                self.dataset.update_status("No metadata file found")
+
+        if metadata_success:
+            with open(os.path.join(staging_area, '.metadata.json')) as file:
+                image_data = json.load(file)
+                for url, data in image_data.items():
+                    if data.get('success'):
+                        data.update({"url": url})
+                        image_metadata[data['filename']] = data

         # Check if we need to collect data for updating the original dataset
         update_original = self.parameters.get("update_original", False)
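
For reference, the helper added to backend/lib/processor.py is a thin wrapper around single-member zip extraction. A standalone sketch of the same pattern follows; the names (extract_member, destination) are illustrative only and not part of the diff:

from pathlib import Path
import zipfile

def extract_member(archive_path: Path, member: str, destination: Path) -> Path:
    # Extract one named member from a zip archive and return its path,
    # mirroring extract_archived_file_by_name above (standalone sketch, not 4CAT code).
    with zipfile.ZipFile(archive_path, "r") as archive_file:
        if member not in archive_file.namelist():
            raise KeyError("File %s not found in archive %s" % (member, archive_path))
        archive_file.extract(member, destination)
    return destination.joinpath(member)

# e.g. fetch only the metadata file instead of unpacking the whole archive:
# metadata_path = extract_member(Path("images.zip"), ".metadata.json", Path("staging"))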