DataFog
diff --git a/‎.gitignore
Lines changed: 1 addition & 2 deletions b/‎.gitignore
Lines changed: 1 addition & 2 deletions
diff --git a/‎README.md
Lines changed: 6 additions & 6 deletions b/‎README.md
Lines changed: 6 additions & 6 deletions
diff --git a/‎datafog/client.py
Lines changed: 4 additions & 4 deletions b/‎datafog/client.py
Lines changed: 4 additions & 4 deletions
diff --git a/‎datafog/config.py
Lines changed: 10 additions & 8 deletions b/‎datafog/config.py
Lines changed: 10 additions & 8 deletions
diff --git a/‎datafog/main.py
Lines changed: 97 additions & 58 deletions b/‎datafog/main.py
Lines changed: 97 additions & 58 deletions
diff --git a/‎datafog/models/anonymizer.py
Lines changed: 1 addition & 1 deletion b/‎datafog/models/anonymizer.py
Lines changed: 1 addition & 1 deletion
@@ -36,5 +36,4 @@ error_log.txt
 docs/*
 !docs/*.rst
 !docs/conf.py
-test_anonymizer.py
-test_anonymizer.pyc
+scratch.py
@@ -70,19 +70,19 @@ datafog scan-text "Tim Cook is the CEO of Apple and is based out of Cupertino, C
 To extract text from images and optionally perform PII annotation:
 
 ```bash
-datafog scan-image "path/to/image.png" --operations extract_text
+datafog scan-image "path/to/image.png" --operations extract
 ```
 
 **Example:**
 
 ```bash
-datafog scan-image "nokia-statement.png" --operations extract_text
+datafog scan-image "nokia-statement.png" --operations extract
 ```
 
 To extract text and annotate PII:
 
 ```bash
-datafog scan-image "nokia-statement.png" --operations annotate_pii
+datafog scan-image "nokia-statement.png" --operations scan
 ```
 
 ### Utility Commands
@@ -127,7 +127,7 @@ datafog list-entities
 
 ## ⚠️ Important Notes
 
-- For `scan-image` and `scan-text` commands, use `--operations` to specify different operations. Default is `annotate_pii`.
+- For `scan-image` and `scan-text` commands, use `--operations` to specify different operations. Default is `scan`.
 - Process multiple images or text strings in a single command by providing multiple arguments.
 - Ensure proper permissions and configuration of the DataFog service before running commands.
 
@@ -145,10 +145,10 @@ To use DataFog, you'll need to create a DataFog client with the desired operatio
 from datafog import DataFog
 
 # For text annotation
-client = DataFog(operations="annotate_pii")
+client = DataFog(operations="scan")
 
 # For OCR (Optical Character Recognition)
-ocr_client = DataFog(operations="extract_text")
+ocr_client = DataFog(operations="extract")
 ```
 
 ### Text PII Annotation
 
@@ -27,7 +27,7 @@ def scan_image(
     image_urls: List[str] = typer.Argument(
         None, help="List of image URLs or file paths to extract text from"
     ),
-    operations: str = typer.Option("annotate_pii", help="Operation to perform"),
+    operations: str = typer.Option("scan", help="Operation to perform"),
 ):
     """
     Scan images for text and PII.
@@ -37,7 +37,7 @@ def scan_image(
 
     Args:
         image_urls: List of image URLs or file paths
-        operations: Pipeline operations to run (default: annotate_pii)
+        operations: Pipeline operations to run (default: scan)
 
     Prints results or exits with error on failure.
     """
@@ -61,7 +61,7 @@ def scan_text(
     str_list: List[str] = typer.Argument(
         None, help="List of texts to extract text from"
     ),
-    operations: str = typer.Option("annotate_pii", help="Operation to perform"),
+    operations: str = typer.Option("scan", help="Operation to perform"),
 ):
     """
     Scan texts for PII.
@@ -70,7 +70,7 @@ def scan_text(
 
     Args:
         str_list: List of texts to analyze
-        operations: Pipeline operations to run (default: annotate_pii)
+        operations: Pipeline operations to run (default: scan)
 
     Prints results or exits with error on failure.
     """
 
@@ -80,13 +80,15 @@ class OperationType(str, Enum):
     """
     Enum for supported DataFog operations.
 
-    ANNOTATE_PII: Detect and annotate PII in text
-    EXTRACT_TEXT: Extract text from images
-    REDACT_PII: Remove PII from text
-    ANONYMIZE_PII: Replace PII with fake data
+    SCAN: Detect and annotate PII in text
+    EXTRACT: Extract text from images
+    REDACT: Remove PII from text
+    REPLACE: Replace PII with fake data
+    HASH: Replace PII with a hash
     """
 
-    ANNOTATE_PII = "annotate_pii"
-    EXTRACT_TEXT = "extract_text"
-    REDACT_PII = "redact_pii"
-    ANONYMIZE_PII = "anonymize_pii"
+    SCAN = "scan"
+    EXTRACT = "extract"
+    REDACT = "redact"
+    REPLACE = "replace"
+    HASH = "hash"
@@ -6,14 +6,15 @@
 - TextPIIAnnotator: Class for annotating PII in text.
 
 These classes provide high-level interfaces for image and text processing,
-including OCR, PII detection, and annotation.
+including OCR, PII detection, annotation, and anonymization.
 """
 
 import json
 import logging
 from typing import List
 
 from .config import OperationType
+from .models.anonymizer import Anonymizer, AnonymizerType, HashType
 from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
 from .services.image_service import ImageService
 from .services.spark_service import SparkService
@@ -27,26 +28,28 @@ class DataFog:
     """
     Main class for running OCR and text processing pipelines.
 
-    Handles image and text processing operations, including OCR and PII detection.
+    Handles image and text processing operations, including OCR, PII detection, and anonymization.
 
     Attributes:
         image_service: Service for image processing and OCR.
         text_service: Service for text processing and annotation.
         spark_service: Optional Spark service for distributed processing.
         operations: List of operations to perform.
+        anonymizer: Anonymizer for PII redaction, replacement, or hashing.
     """
 
     def __init__(
         self,
         image_service=ImageService(),
         text_service=TextService(),
         spark_service=None,
-        operations: List[OperationType] = [OperationType.ANNOTATE_PII],
+        operations: List[OperationType] = [OperationType.SCAN],
     ):
         self.image_service = image_service
         self.text_service = text_service
         self.spark_service: SparkService = spark_service
         self.operations: List[OperationType] = operations
+        self.anonymizer = Anonymizer()
         self.logger = logging.getLogger(__name__)
         self.logger.info(
             "Initializing DataFog class with the following services and operations:"
@@ -64,38 +67,22 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
 
         This method performs optical character recognition (OCR) on the images specified by the URLs.
         If PII annotation is enabled, it also annotates the extracted text for personally identifiable information.
+        If redaction, replacement, or hashing is enabled, it applies the corresponding anonymization.
 
         Args:
             image_urls (List[str]): A list of URLs pointing to the images to be processed.
 
         Returns:
-            List: If PII annotation is enabled, returns a list of annotated text results.
-                  Otherwise, returns a list of extracted text from the images.
+            List: Processed text results based on the enabled operations.
 
         Raises:
-            Exception: Any error encountered during the OCR or annotation process.
-
-        Note:
-            The method logs various stages of the process, including completion of OCR extraction
-            and text annotation, as well as any errors encountered.
+            None. Any error encountered during the OCR or text processing is caught,
+            logged, and returned as a one-item list: ["Error: <message>"].
         """
         try:
             extracted_text = await self.image_service.ocr_extract(image_urls)
             self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
-            self.logger.debug(
-                f"Total length of extracted text: {sum(len(text) for text in extracted_text)}"
-            )
 
-            if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_text_async(
-                    extracted_text
-                )
-                self.logger.info(
-                    f"Text annotation completed with {len(annotated_text)} annotations."
-                )
-                return annotated_text
-            else:
-                return extracted_text
+            return await self._process_text(extracted_text)
         except Exception as e:
             logging.error(f"Error in run_ocr_pipeline: {str(e)}")
             return [f"Error: {str(e)}"]
@@ -105,75 +92,126 @@ async def run_text_pipeline(self, str_list: List[str]):
         Run the text pipeline asynchronously on a list of input text.
 
         This method processes a list of text strings, potentially annotating them for personally
-        identifiable information (PII) if the ANNOTATE_PII operation is enabled.
+        identifiable information (PII) and applying anonymization if enabled.
 
         Args:
             str_list (List[str]): A list of text strings to be processed.
 
         Returns:
-            List: If PII annotation is enabled, returns a list of annotated text results.
-                  Otherwise, returns the original list of text strings.
+            List: Processed text results based on the enabled operations.
 
         Raises:
-            Exception: Any error encountered during the text processing or annotation.
-
-        Note:
-            The method logs the start of the pipeline, the completion of text annotation if applicable,
-            and any errors encountered during processing.
+            Exception: Any error encountered during the text processing.
         """
         try:
             self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
-            if OperationType.ANNOTATE_PII in self.operations:
-                annotated_text = await self.text_service.batch_annotate_text_async(
-                    str_list
-                )
-                self.logger.info(
-                    f"Text annotation completed with {len(annotated_text)} annotations."
-                )
-                return annotated_text
-
-            self.logger.info("No annotation operation found; returning original texts.")
-            return str_list
+            return await self._process_text(str_list)
         except Exception as e:
             self.logger.error(f"Error in run_text_pipeline: {str(e)}")
             raise
 
+    async def _process_text(self, text_list: List[str]):
+        """
+        Internal method to process text based on enabled operations.
+
+        Used by both run_ocr_pipeline and run_text_pipeline. Annotation runs only
+        when OperationType.SCAN is enabled. If an anonymization operation is also
+        enabled, the first one found in the order REDACT, REPLACE, HASH is applied
+        to every text using its annotations.
+
+        Args:
+            text_list (List[str]): A list of text strings to be processed.
+
+        Returns:
+            List: Anonymized texts when SCAN and an anonymization operation are
+                  enabled, annotation results when only SCAN is enabled, and the
+                  original text_list otherwise.
+
+        Raises:
+            ValueError: When anonymizing, if the number of annotation results differs
+                        from the number of input texts (zip is called with strict=True).
+        """
+        # Priority order: only the first enabled anonymization operation is applied.
+        anonymizer_type = next(
+            (
+                kind
+                for operation, kind in (
+                    (OperationType.REDACT, AnonymizerType.REDACT),
+                    (OperationType.REPLACE, AnonymizerType.REPLACE),
+                    (OperationType.HASH, AnonymizerType.HASH),
+                )
+                if operation in self.operations
+            ),
+            None,
+        )
+
+        if OperationType.SCAN not in self.operations:
+            if anonymizer_type is None:
+                self.logger.info(
+                    "No annotation or anonymization operation found; returning original texts."
+                )
+            else:
+                # Anonymization needs the annotations produced by SCAN. Make the
+                # skipped operation visible instead of reporting that none was found.
+                self.logger.warning(
+                    "Anonymization operation requested without 'scan'; "
+                    "returning original texts unchanged."
+                )
+            return text_list
+
+        annotated_text = await self.text_service.batch_annotate_text_async(text_list)
+        self.logger.info(
+            f"Text annotation completed with {len(annotated_text)} annotations."
+        )
+        if anonymizer_type is None:
+            return annotated_text
+
+        return [
+            self.anonymizer.anonymize(text, annotations, anonymizer_type).anonymized_text
+            for text, annotations in zip(text_list, annotated_text, strict=True)
+        ]
+
     def run_text_pipeline_sync(self, str_list: List[str]):
         """
         Run the text pipeline synchronously on a list of input text.
 
         This method processes a list of text strings in a synchronous manner, potentially
-        annotating them for personally identifiable information (PII) if the ANNOTATE_PII
-        operation is enabled.
+        annotating them for personally identifiable information (PII) and applying
+        anonymization if enabled.
+
+        Anonymization is applied only when OperationType.SCAN is also enabled; the
+        first enabled operation in the order REDACT, REPLACE, HASH is used.
 
         Args:
             str_list (List[str]): A list of text strings to be processed.
 
         Returns:
-            List: If PII annotation is enabled, returns a list of annotated text results.
-                  Otherwise, returns the original list of text strings.
+            List: Processed text results based on the enabled operations.
 
         Raises:
-            Exception: Any error encountered during the text processing or annotation.
-
-        Note:
-            The method logs the start of the pipeline, the completion of text annotation if applicable,
-            and any errors encountered during processing. This synchronous version may be preferred
-            for smaller datasets or when immediate results are required.
+            Exception: Any error encountered during the text processing.
         """
         try:
             self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
-            if OperationType.ANNOTATE_PII in self.operations:
+            if OperationType.SCAN in self.operations:
                 annotated_text = self.text_service.batch_annotate_text_sync(str_list)
                 self.logger.info(
                     f"Text annotation completed with {len(annotated_text)} annotations."
                 )
-                return annotated_text
 
-            self.logger.info("No annotation operation found; returning original texts.")
+                # Only the first enabled anonymization operation is applied, checked
+                # in the order REDACT, REPLACE, HASH.
+                for operation, anonymizer_type in (
+                    (OperationType.REDACT, AnonymizerType.REDACT),
+                    (OperationType.REPLACE, AnonymizerType.REPLACE),
+                    (OperationType.HASH, AnonymizerType.HASH),
+                ):
+                    if operation in self.operations:
+                        return [
+                            self.anonymizer.anonymize(
+                                text, annotations, anonymizer_type
+                            ).anonymized_text
+                            for text, annotations in zip(
+                                str_list, annotated_text, strict=True
+                            )
+                        ]
+                return annotated_text
+
+            if any(
+                operation in self.operations
+                for operation in (
+                    OperationType.REDACT,
+                    OperationType.REPLACE,
+                    OperationType.HASH,
+                )
+            ):
+                # Anonymization needs the annotations produced by SCAN. Make the
+                # skipped operation visible instead of reporting that none was found.
+                self.logger.warning(
+                    "Anonymization operation requested without 'scan'; "
+                    "returning original texts unchanged."
+                )
+            else:
+                self.logger.info(
+                    "No annotation or anonymization operation found; returning original texts."
+                )
             return str_list
         except Exception as e:
-            self.logger.error(f"Error in run_text_pipeline: {str(e)}")
+            self.logger.error(f"Error in run_text_pipeline_sync: {str(e)}")
             raise
 
     def _add_attributes(self, attributes: dict):
@@ -194,7 +232,7 @@ def _add_attributes(self, attributes: dict):
             using this method to avoid overwriting existing attributes.
         """
         for key, value in attributes.items():
-            pass
+            setattr(self, key, value)
 
 
 class TextPIIAnnotator:
@@ -225,4 +263,5 @@ def run(self, text, output_path=None):
 
         finally:
             # Ensure Spark resources are released
-            pass
+            if self.spark_processor:
+                self.spark_processor.stop()
@@ -39,7 +39,7 @@ class AnonymizationResult(BaseModel):
 
 
 class Anonymizer(BaseModel):
-    anonymizer_type: AnonymizerType
+    anonymizer_type: AnonymizerType = AnonymizerType.REPLACE
     entities: Optional[List[EntityTypes]] = None
     hash_type: Optional[HashType] = HashType.SHA256