text_from_image: only extract needed files
dale-wahl committed Jan 24, 2024
1 parent 87539a4 commit 3de1836
Showing 3 changed files with 65 additions and 13 deletions.
backend/lib/processor.py (27 additions, 0 deletions)
@@ -538,6 +538,7 @@ def unpack_archive_contents(self, path, staging_area=None):
         :param Path staging_area: Where to store the files while they're
         being worked with. If omitted, a temporary folder is created and
         deleted after use
+        :param int max_number_files: Maximum number of files to unpack. If None, all files are unpacked.
         :return Path: A path to the staging area
         """

@@ -565,6 +566,32 @@ def unpack_archive_contents(self, path, staging_area=None):

        return staging_area

+    def extract_archived_file_by_name(self, filename, archive_path, staging_area=None):
+        """
+        Extract a file from an archive by name
+        :param str filename: Name of file to extract
+        :param Path archive_path: Path to zip file to read
+        :param Path staging_area: Where to store the files while they're
+        being worked with. If omitted, a temporary folder is created
+        :return Path: A path to the extracted file
+        """
+        if not archive_path.exists():
+            return
+
+        if not staging_area:
+            staging_area = self.dataset.get_staging_area()
+
+        if not staging_area.exists() or not staging_area.is_dir():
+            raise RuntimeError("Staging area %s is not a valid folder" % staging_area)
+
+        with zipfile.ZipFile(archive_path, "r") as archive_file:
+            if filename not in archive_file.namelist():
+                raise KeyError("File %s not found in archive %s" % (filename, archive_path))
+            else:
+                archive_file.extract(filename, staging_area)
+        return staging_area.joinpath(filename)

    def write_csv_items_and_finish(self, data):
        """
        Write data as csv to results file and finish dataset
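A minimal usage sketch for the new extract_archived_file_by_name helper (hypothetical caller code; assumes self.source_file points at a zip archive and json is imported):

    # Pull a single file out of a large archive without unpacking the rest
    metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file)
    with metadata_path.open() as infile:
        metadata = json.load(infile)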
common/lib/dmi_service_manager.py (2 additions, 2 deletions)
@@ -230,7 +230,7 @@ def request_folder_files(self, folder_name):
            except requests.exceptions.ConnectionError as e:
                retries += 1
                if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading files from: {folder_name}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
                continue

        # Check if 4CAT has access to this server
@@ -353,7 +353,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir,
            except requests.exceptions.ConnectionError as e:
                retries += 1
                if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading file: {filename}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {filename}")
                continue
            files_downloaded += 1
            if files_downloaded % 1000 == 0:
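Both hunks sit inside the same retry pattern; the surrounding loop is not shown in this diff, but it amounts to something like the following reconstruction (illustrative only; api_url and filename stand in for values the real method already has in scope):

    retries = 0
    while True:
        try:
            # Hypothetical download call guarded by the retry counter
            response = requests.get(api_url, timeout=30)
            break
        except requests.exceptions.ConnectionError as e:
            retries += 1
            if retries > 3:
                # Give up after three failed attempts and surface the error
                raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {filename}")
            continue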
processors/machine-learning/text_from_image.py (36 additions, 11 deletions)
@@ -105,13 +105,29 @@ def process(self):

        # Unpack the images into a staging_area
        self.dataset.update_status("Unzipping images")
-        staging_area = self.unpack_archive_contents(self.source_file)
-
-        # Collect filenames (skip .json metadata files)
-        image_filenames = [filename for filename in os.listdir(staging_area) if
-                           filename.split('.')[-1] not in ["json", "log"]]
        if int(self.parameters.get("amount", 100)) != 0:
-            image_filenames = image_filenames[:int(self.parameters.get("amount", 100))]
+            max_images = int(self.parameters.get("amount", 100))
+        else:
+            max_images = None
+
+        staging_area = self.dataset.get_staging_area()
+        # Collect filenames and metadata
+        image_filenames = []
+        metadata_file = None
+        for image in self.iterate_archive_contents(self.source_file, staging_area=staging_area, immediately_delete=False):
+            if self.interrupted:
+                raise ProcessorInterruptedException("Interrupted while unzipping images")
+
+            if image.name.split('.')[-1] not in ["json", "log"]:
+                image_filenames.append(image.name)
+
+            if image.name == ".metadata.json":
+                metadata_file = image.name
+
+            if max_images and len(image_filenames) >= max_images:
+                break

        total_image_files = len(image_filenames)

        # Make output dir
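iterate_archive_contents itself is not part of this diff; the loop above relies on it yielding each archive member as it is extracted, so breaking at max_images means later members are never written to disk. A minimal sketch of that assumed contract (not the actual 4CAT implementation):

    def iterate_archive_contents(self, path, staging_area, immediately_delete=True):
        # Extract members one at a time and yield each as a pathlib.Path,
        # letting callers stop early instead of unpacking everything up front
        with zipfile.ZipFile(path, "r") as archive_file:
            for member in archive_file.namelist():
                archive_file.extract(member, staging_area)
                extracted = staging_area.joinpath(member)
                yield extracted
                if immediately_delete:
                    # Default behaviour: remove the file once the caller is done with it
                    extracted.unlink()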
@@ -158,12 +174,21 @@ def process(self):

        # Load the metadata from the archive
        image_metadata = {}
-        with open(os.path.join(staging_area, '.metadata.json')) as file:
-            image_data = json.load(file)
-            for url, data in image_data.items():
-                if data.get('success'):
-                    data.update({"url": url})
-                    image_metadata[data['filename']] = data
+        metadata_success = metadata_file is not None
+        if metadata_file is None:
+            try:
+                self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area)
+                metadata_success = True
+            except KeyError:
+                self.dataset.update_status("No metadata file found")
+                metadata_success = False
+
+        if metadata_success:
+            with open(os.path.join(staging_area, '.metadata.json')) as file:
+                image_data = json.load(file)
+                for url, data in image_data.items():
+                    if data.get('success'):
+                        data.update({"url": url})
+                        image_metadata[data['filename']] = data

        # Check if we need to collect data for updating the original dataset
        update_original = self.parameters.get("update_original", False)
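The .metadata.json read above maps source URLs to per-file records; a hypothetical entry, inferred from the keys the loop accesses:

    image_data = {
        "https://example.com/image1.jpg": {
            "success": True,           # only successful downloads are kept
            "filename": "image1.jpg"   # becomes the key in image_metadata
        }
    }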
