From aad7d57e1b44f7c59a3eb89ba6282d37f10339fe Mon Sep 17 00:00:00 2001
From: Stijn Peeters
Date: Wed, 23 Oct 2024 15:39:27 +0200
Subject: [PATCH] New processor: deduplicate images

---
 processors/filtering/unique_images.py | 143 ++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)
 create mode 100644 processors/filtering/unique_images.py

diff --git a/processors/filtering/unique_images.py b/processors/filtering/unique_images.py
new file mode 100644
index 000000000..819e4b9d6
--- /dev/null
+++ b/processors/filtering/unique_images.py
@@ -0,0 +1,143 @@
+"""
+Filter by unique images
+"""
+import imagehash
+import hashlib
+import shutil
+import json
+
+from PIL import Image
+from backend.lib.processor import BasicProcessor
+from common.lib.exceptions import ProcessorInterruptedException
+from common.lib.helpers import UserInput
+
+__author__ = "Stijn Peeters"
+__credits__ = ["Stijn Peeters"]
+__maintainer__ = "Stijn Peeters"
+__email__ = "4cat@oilab.eu"
+
+
+class UniqueImageFilter(BasicProcessor):
+    """
+    Retain only unique images, by a user-defined metric
+    """
+    type = "image-downloader-unique"  # job type ID
+    category = "Visualisation"  # category
+    title = "Filter for unique images"  # title displayed in UI
+    description = "Only keeps one instance of each image, using a choice of detection methods."  # description displayed in UI
+    extension = "zip"
+
+    references = [
+        "[Imagehash library](https://github.com/JohannesBuchner/imagehash?tab=readme-ov-file)",
+        "Explainer: [Average hash](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)",
+        "Explainer: [Perceptual hashing](https://www.hackerfactor.com/blog/index.php?/archives/432-Looks-Like-It.html)",
+        "Explainer: [Difference hash](https://www.hackerfactor.com/blog/index.php?/archives/529-Kind-of-Like-That.html)",
+    ]
+
+    options = {
+        "hash-type": {
+            "type": UserInput.OPTION_CHOICE,
+            "help": "Comparison method",
+            "default": "file-hash",
+            "options": {
+                "file-hash": "File hash (files need to be byte-by-byte duplicates)",
+                "colorhash": "Colour hash (good at colours, worse at shapes)",
+                "phash": "Perceptual hash (decent at colours and shapes)",
+                "average_hash": "Average hash (good at crops, less tolerant of differences than perceptual hashing)",
+                "dhash": "Difference hash (similar to average hash, better at photos and art)"
+            }
+        }
+    }

+    @classmethod
+    def is_compatible_with(cls, module=None, user=None):
+        """
+        Allow processor on image archives
+
+        :param module:  Module to determine compatibility with
+        :param user:  User to determine compatibility with
+        """
+        return module.get_media_type() == "image" or module.type.startswith(
+            "image-downloader") or module.type == "video-frames"
+
+    def hash_file(self, image_file, hash_type="file-hash"):
+        """
+        Generate an image hash
+
+        :param Path image_file:  Image file to hash
+        :param str hash_type:  Hash type, one of `file-hash`, `colorhash`,
+        `phash`, `average_hash`, `dhash`
+        :return str:  Hexadecimal hash value
+        """
+        if not image_file.exists():
+            raise FileNotFoundError(f"Image file {image_file} does not exist")
+
+        if hash_type == "file-hash":
+            hasher = hashlib.sha1()
+
+            # Open the file in binary mode
+            with image_file.open("rb") as infile:
+                # Read and update hash in chunks to handle large files
+                while chunk := infile.read(1024):
+                    hasher.update(chunk)
+
+            return hasher.hexdigest()
+
+        elif hash_type in ("colorhash", "phash", "average_hash", "dhash"):
+            image = Image.open(image_file)
+
+            return str(getattr(imagehash, hash_type)(image))
+
+        else:
+            raise NotImplementedError(f"Unknown hash type '{hash_type}'")
+
+    def process(self):
+        """
+        Loop through images and only retain ones that have not been seen yet
+
+        :return:
+        """
+        seen_hashes = set()
+        hash_map = {}
+        metadata = None
+        dupes = 0
+        processed = 0
+        staging_area = self.dataset.get_staging_area()
+
+        for image_file in self.iterate_archive_contents(self.source_file):
+            if self.interrupted:
+                raise ProcessorInterruptedException("Interrupted while filtering for unique images")
+
+            self.dataset.update_progress(processed / self.source_dataset.num_rows)
+            processed += 1
+
+            # the metadata file is not an image; read it so it can be amended later
+            if image_file.name == ".metadata.json":
+                with image_file.open() as infile:
+                    metadata = json.load(infile)
+                continue
+
+            image_hash = self.hash_file(image_file, self.parameters.get("hash-type"))
+
+            # only keep the first file seen with a given hash
+            if image_hash not in seen_hashes:
+                seen_hashes.add(image_hash)
+                shutil.copy2(image_file, staging_area)
+                hash_map[image_hash] = image_file.name
+            else:
+                self.dataset.log(f"{image_file.name} is a duplicate of {hash_map[image_hash]} - skipping")
+                dupes += 1
+
+        # write an amended metadata file, if the source archive contained one,
+        # keyed by hash and covering only the images that were retained
+        if metadata:
+            new_metadata = {}
+            inverse_hashmap = {v: k for k, v in hash_map.items()}
+            for url, item in metadata.items():
+                if item["filename"] in inverse_hashmap:
+                    new_metadata[inverse_hashmap[item["filename"]]] = {
+                        **item,
+                        "hash": inverse_hashmap[item["filename"]],
+                        "hash_type": self.parameters.get("hash-type")
+                    }
+
+            with staging_area.joinpath(".metadata.json").open("w") as outfile:
+                json.dump(new_metadata, outfile)
+
+        self.dataset.update_status(f"Image archive filtered, found {dupes:,} duplicate(s)", is_final=True)
+        self.write_archive_and_finish(staging_area, len(hash_map), finish=True)
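
For context, the core of the processor is: hash every image, keep only the first file per hash value, and carry the surviving entries over into an amended metadata file. Below is a minimal standalone sketch of that same idea using the imagehash library directly, outside of 4CAT; the "images"/"unique" folder names and the choice of phash are illustrative assumptions, not anything defined by the patch above.

# Standalone sketch: keep one file per perceptual hash (illustrative, not part of the patch)
from pathlib import Path
import shutil

import imagehash
from PIL import Image

image_dir = Path("images")    # hypothetical folder with downloaded images
unique_dir = Path("unique")   # hypothetical folder to copy unique images into
unique_dir.mkdir(exist_ok=True)

seen = {}  # hash value -> name of the first file seen with that hash
for image_file in sorted(image_dir.glob("*")):
    try:
        image_hash = str(imagehash.phash(Image.open(image_file)))
    except OSError:
        continue  # not an image Pillow can open, skip it
    if image_hash in seen:
        print(f"{image_file.name} is a duplicate of {seen[image_hash]}")
    else:
        seen[image_hash] = image_file.name
        shutil.copy2(image_file, unique_dir)

ImageHash objects can also be subtracted to obtain a Hamming distance between two hashes, which would allow near-duplicate detection with a tolerance threshold; the processor in this patch instead treats two images as duplicates only when their hash strings are exactly equal.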