text_from_image: only extract needed files
dale-wahl committed Jan 24, 2024
1 parent 87539a4 commit 3de1836
Showing 3 changed files with 65 additions and 13 deletions.
backend/lib/processor.py (27 additions, 0 deletions)
@@ -538,6 +538,7 @@ def unpack_archive_contents(self, path, staging_area=None):
         :param Path staging_area: Where to store the files while they're
         being worked with. If omitted, a temporary folder is created and
         deleted after use
+        :param int max_number_files: Maximum number of files to unpack. If None, all files are unpacked.
         :return Path: A path to the staging area
         """

@@ -565,6 +566,32 @@ def unpack_archive_contents(self, path, staging_area=None):

        return staging_area

+    def extract_archived_file_by_name(self, filename, archive_path, staging_area=None):
+        """
+        Extract a file from an archive by name
+        :param str filename: Name of file to extract
+        :param Path archive_path: Path to zip file to read
+        :param Path staging_area: Where to store the files while they're
+        being worked with. If omitted, a temporary folder is created
+        :return Path: A path to the extracted file
+        """
+        if not archive_path.exists():
+            return
+
+        if not staging_area:
+            staging_area = self.dataset.get_staging_area()
+
+        if not staging_area.exists() or not staging_area.is_dir():
+            raise RuntimeError("Staging area %s is not a valid folder" % staging_area)
+
+        with zipfile.ZipFile(archive_path, "r") as archive_file:
+            if filename not in archive_file.namelist():
+                raise KeyError("File %s not found in archive %s" % (filename, archive_path))
+            else:
+                archive_file.extract(filename, staging_area)
+        return staging_area.joinpath(filename)

    def write_csv_items_and_finish(self, data):
        """
        Write data as csv to results file and finish dataset
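A minimal usage sketch for the new extract_archived_file_by_name helper (hypothetical caller code; assumes self.source_file points at a zip archive and json is imported):

    # Pull a single file out of a large archive without unpacking the rest
    metadata_path = self.extract_archived_file_by_name(".metadata.json", self.source_file)
    with metadata_path.open() as infile:
        metadata = json.load(infile)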
common/lib/dmi_service_manager.py (2 additions, 2 deletions)
@@ -230,7 +230,7 @@ def request_folder_files(self, folder_name):
            except requests.exceptions.ConnectionError as e:
                retries += 1
                if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading files from: {folder_name}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading files from: {folder_name}")
                continue

        # Check if 4CAT has access to this server
@@ -353,7 +353,7 @@ def download_results(self, filenames_to_download, folder_name, local_output_dir,
            except requests.exceptions.ConnectionError as e:
                retries += 1
                if retries > 3:
-                    raise DmiServiceManagerException(f"Connection Error {e} while downloading file: {filename}")
+                    raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {filename}")
                continue
            files_downloaded += 1
            if files_downloaded % 1000 == 0:
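Both hunks sit inside the same retry pattern; the surrounding loop is not shown in this diff, but it amounts to something like the following reconstruction (illustrative only; api_url and filename stand in for values the real method already has in scope):

    retries = 0
    while True:
        try:
            # Hypothetical download call guarded by the retry counter
            response = requests.get(api_url, timeout=30)
            break
        except requests.exceptions.ConnectionError as e:
            retries += 1
            if retries > 3:
                # Give up after three failed attempts and surface the error
                raise DmiServiceManagerException(f"Connection Error {e} (retries {retries}) while downloading file: {filename}")
            continue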
processors/machine-learning/text_from_image.py (36 additions, 11 deletions)
@@ -105,13 +105,29 @@ def process(self):

        # Unpack the images into a staging_area
        self.dataset.update_status("Unzipping images")
-        staging_area = self.unpack_archive_contents(self.source_file)
-
-        # Collect filenames (skip .json metadata files)
-        image_filenames = [filename for filename in os.listdir(staging_area) if
-                           filename.split('.')[-1] not in ["json", "log"]]
        if int(self.parameters.get("amount", 100)) != 0:
-            image_filenames = image_filenames[:int(self.parameters.get("amount", 100))]
+            max_images = int(self.parameters.get("amount", 100))
+        else:
+            max_images = None
+
+        staging_area = self.dataset.get_staging_area()
+        # Collect filenames and metadata
+        image_filenames = []
+        metadata_file = None
+        for image in self.iterate_archive_contents(self.source_file, staging_area=staging_area, immediately_delete=False):
+            if self.interrupted:
+                raise ProcessorInterruptedException("Interrupted while unzipping images")
+
+            if image.name.split('.')[-1] not in ["json", "log"]:
+                image_filenames.append(image.name)
+
+            if image.name == ".metadata.json":
+                metadata_file = image.name
+
+            if max_images and len(image_filenames) >= max_images:
+                break

        total_image_files = len(image_filenames)

        # Make output dir
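iterate_archive_contents itself is not part of this diff; the loop above relies on it yielding each archive member as it is extracted, so breaking at max_images means later members are never written to disk. A minimal sketch of that assumed contract (not the actual 4CAT implementation):

    def iterate_archive_contents(self, path, staging_area, immediately_delete=True):
        # Extract members one at a time and yield each as a pathlib.Path,
        # letting callers stop early instead of unpacking everything up front
        with zipfile.ZipFile(path, "r") as archive_file:
            for member in archive_file.namelist():
                archive_file.extract(member, staging_area)
                extracted = staging_area.joinpath(member)
                yield extracted
                if immediately_delete:
                    # Default behaviour: remove the file once the caller is done with it
                    extracted.unlink()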
@@ -158,12 +174,21 @@ def process(self):

        # Load the metadata from the archive
        image_metadata = {}
-        with open(os.path.join(staging_area, '.metadata.json')) as file:
-            image_data = json.load(file)
-            for url, data in image_data.items():
-                if data.get('success'):
-                    data.update({"url": url})
-                    image_metadata[data['filename']] = data
+        metadata_success = metadata_file is not None
+        if metadata_file is None:
+            try:
+                self.extract_archived_file_by_name(".metadata.json", self.source_file, staging_area)
+                metadata_success = True
+            except KeyError:
+                self.dataset.update_status("No metadata file found")
+                metadata_success = False
+
+        if metadata_success:
+            with open(os.path.join(staging_area, '.metadata.json')) as file:
+                image_data = json.load(file)
+                for url, data in image_data.items():
+                    if data.get('success'):
+                        data.update({"url": url})
+                        image_metadata[data['filename']] = data

        # Check if we need to collect data for updating the original dataset
        update_original = self.parameters.get("update_original", False)
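The .metadata.json read above maps source URLs to per-file records; a hypothetical entry, inferred from the keys the loop accesses:

    image_data = {
        "https://example.com/image1.jpg": {
            "success": True,           # only successful downloads are kept
            "filename": "image1.jpg"   # becomes the key in image_metadata
        }
    }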
