Commit 8ff5c5b

Sid Mohan authored and committed

all tests passed
1 parent 314578c commit 8ff5c5b

8 files changed: +167 -56 lines changed

datafog_debug.log

+4
@@ -46,3 +46,7 @@
 2024-03-06 17:02:19,425 - datafog - ERROR - Unsupported input source type
 2024-03-06 17:04:29,470 - datafog - ERROR - Unsupported input source type
 2024-03-06 17:04:39,082 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:06:17,074 - src.datafog - DEBUG - This is a debug message
+2024-03-06 17:06:29,333 - src.datafog - DEBUG - This is a debug message
+2024-03-06 17:08:26,748 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:08:35,703 - datafog - ERROR - Unsupported input source type

src/datafog/__init__.py

+22 -11
@@ -1,10 +1,11 @@
 # datafog-python/src/datafog/__init__.py
 import json
-import requests
 import logging
-from .pii_tools.PresidioEngine import presidio_batch_init, batch_scan, batch_redact
+
+import requests
 
 from .__about__ import __version__
+from .pii_tools.PresidioEngine import batch_redact, batch_scan, presidio_batch_init
 
 __all__ = [
     "__version__",
@@ -13,24 +14,29 @@
 # Create file handler which logs even debug messages
 
 # Configure basic settings for logging to console
-logging.basicConfig(level=logging.DEBUG,
-                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-                    datefmt='%Y-%m-%d %H:%M:%S')
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
 
 # Create and configure logger
 logger = logging.getLogger(__name__)
 
 # Create file handler which logs debug messages
-fh = logging.FileHandler('datafog_debug.log')
+fh = logging.FileHandler("datafog_debug.log")
 fh.setLevel(logging.DEBUG)
-fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+fh.setFormatter(
+    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+)
 
 # Add the file handler to the logger
 logger.addHandler(fh)
 
 # Now you can use logger to log messages
 logger.debug("This is a debug message")
 
+
 class DataFog:
     """DataFog class for scanning and redacting PII from text data"""
 
@@ -44,16 +50,17 @@ def __init__(self):
         logger.debug(f"Threshold: {self.threshold}")
         self.redaction_string = "REDACTED"
         logger.debug(f"Redaction string: {self.redaction_string}")
-        self.analyzer, self.batch_analyzer, self.batch_anonymizer = presidio_batch_init()
+        self.analyzer, self.batch_analyzer, self.batch_anonymizer = (
+            presidio_batch_init()
+        )
         logger.debug(f"Analyzer: {self.analyzer}")
         logger.debug(f"Batch Analyzer: {self.batch_analyzer}")
         logger.debug(f"Batch Anonymizer: {self.batch_anonymizer}")
 
         self.entities = self.analyzer.get_supported_entities()
-
+
         logger.debug(f"Supported entities: {self.entities}")
 
-
     def __call__(self, input_source, privacy_operation):
         """Scan or redact PII from input source"""
 
@@ -91,6 +98,10 @@ def __call__(self, input_source, privacy_operation):
             results = batch_scan(text_dict, self.batch_analyzer)
         elif privacy_operation == "redact":
             print("Redacting PII")
-            results = batch_redact(text_dict, batch_scan(text_dict, self.batch_analyzer), self.batch_anonymizer)
+            results = batch_redact(
+                text_dict,
+                batch_scan(text_dict, self.batch_analyzer),
+                self.batch_anonymizer,
+            )
 
         return [str(result) for result in results]
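
As context for the reformatting above: DataFog.__call__ keeps the same (input_source, privacy_operation) interface that the tests later in this commit exercise. A minimal usage sketch, assuming the package is importable and that a plain text string is an accepted input source (the sample string comes from the test fixtures):

from datafog import DataFog

# Instantiate the client; __init__ wires up the analyzer, batch analyzer,
# and batch anonymizer via presidio_batch_init(), as shown in the diff above.
datafog = DataFog()

text = "John Smith drivers license is AC432223"

# "scan" returns serialized recognizer results; "redact" returns the
# batch-anonymized output with detected PII replaced.
scan_results = datafog(text, "scan")
redacted_results = datafog(text, "redact")
print(redacted_results)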

src/datafog/pii_tools/PresidioEngine/__init__.py

@@ -1,56 +1,104 @@
 import json
-from typing import Dict, List, Iterable, Union, Any
+from typing import Any, Dict, Iterable, List, Union
+
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 
+
 def presidio_batch_init():
     analyzer = AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine())
     return analyzer, batch_analyzer, batch_anonymizer
 
+
 def batch_scan(text: Dict[str, str], batch_analyzer: BatchAnalyzerEngine) -> List[str]:
     analyzer_results = batch_analyzer.analyze_dict(text, language="en")
-    return [json.dumps({
-        "key": result.key,
-        "value": result.value,
-        "recognizer_results": serialize_recognizer_results(result.recognizer_results)
-    }) for result in analyzer_results]
+    return [
+        json.dumps(
+            {
+                "key": result.key,
+                "value": result.value,
+                "recognizer_results": serialize_recognizer_results(
+                    result.recognizer_results
+                ),
+            }
+        )
+        for result in analyzer_results
+    ]
+
+
+from typing import Dict, Iterator, List, Optional, Union
 
-from typing import Union, List, Dict, Optional, Iterator
 from presidio_analyzer import RecognizerResult
 
-def serialize_recognizer_results(recognizer_results: Union[List[RecognizerResult], List[List[RecognizerResult]], Iterator[RecognizerResult]]) -> Optional[Union[List[Dict[str, Union[str, int, float, None]]], List[List[Dict[str, Union[str, int, float, None]]]]]]:
+
+def serialize_recognizer_results(
+    recognizer_results: Union[
+        List[RecognizerResult], List[List[RecognizerResult]], Iterator[RecognizerResult]
+    ]
+) -> Optional[
+    Union[
+        List[Dict[str, Union[str, int, float, None]]],
+        List[List[Dict[str, Union[str, int, float, None]]]],
+    ]
+]:
     if isinstance(recognizer_results, list):
         if recognizer_results and isinstance(recognizer_results[0], RecognizerResult):
-            return [{
-                "entity_type": r.entity_type,
-                "start": r.start,
-                "end": r.end,
-                "score": r.score,
-                "analysis_explanation": r.analysis_explanation
-            } for r in recognizer_results]
+            return [
+                {
+                    "entity_type": r.entity_type,
+                    "start": r.start,
+                    "end": r.end,
+                    "score": r.score,
+                    "analysis_explanation": r.analysis_explanation,
+                }
+                for r in recognizer_results
+            ]
         elif recognizer_results and isinstance(recognizer_results[0], list):
             return [serialize_recognizer_results(rr) for rr in recognizer_results]
     elif isinstance(recognizer_results, Iterator):
         return [serialize_recognizer_results(rr) for rr in recognizer_results]
     else:
         return None
-
-from typing import Union, List, Dict, Iterable, Any
+
+
+from typing import Any, Dict, Iterable, List, Union
+
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import DictRecognizerResult, RecognizerResult
 
-def batch_redact(input_data: Union[Dict[str, str], List[str]], results: List[str], anonymizer: BatchAnonymizerEngine, **kwargs) -> Union[List[str], Dict[str, str]]:
+
+def batch_redact(
+    input_data: Union[Dict[str, str], List[str]],
+    results: List[str],
+    anonymizer: BatchAnonymizerEngine,
+    **kwargs
+) -> Union[List[str], Dict[str, str]]:
     if isinstance(input_data, dict):
         # Input is a dictionary, perform anonymize_dict
-        analyzer_results = [DictRecognizerResult(key=result_dict["key"], value=result_dict["value"], recognizer_results=[
-            RecognizerResult(entity_type=r['entity_type'], start=r['start'], end=r['end'], score=r['score'])
-            for recognizer_result in result_dict['recognizer_results'] if recognizer_result
-            for r in recognizer_result
-        ]) for result_dict in [json.loads(result) for result in results]]
+        analyzer_results = [
+            DictRecognizerResult(
+                key=result_dict["key"],
+                value=result_dict["value"],
+                recognizer_results=[
+                    RecognizerResult(
+                        entity_type=r["entity_type"],
+                        start=r["start"],
+                        end=r["end"],
+                        score=r["score"],
+                    )
+                    for recognizer_result in result_dict["recognizer_results"]
+                    if recognizer_result
+                    for r in recognizer_result
+                ],
+            )
+            for result_dict in [json.loads(result) for result in results]
+        ]
 
-        anonymized_data = anonymizer.anonymize_dict(analyzer_results=analyzer_results, **kwargs)
+        anonymized_data = anonymizer.anonymize_dict(
+            analyzer_results=analyzer_results, **kwargs
+        )
         return anonymized_data
 
     elif isinstance(input_data, list):
@@ -60,14 +108,22 @@ def batch_redact(input_data: Union[Dict[str, str], List[str]], results: List[str
         for result in results:
             result_dict = json.loads(result)
             recognizer_results = [
-                RecognizerResult(entity_type=r['entity_type'], start=r['start'], end=r['end'], score=r['score'])
-                for recognizer_result in result_dict['recognizer_results'] if recognizer_result
+                RecognizerResult(
+                    entity_type=r["entity_type"],
+                    start=r["start"],
+                    end=r["end"],
+                    score=r["score"],
+                )
+                for recognizer_result in result_dict["recognizer_results"]
+                if recognizer_result
                 for r in recognizer_result
             ]
             analyzer_results.append(recognizer_results)
 
-        anonymized_texts = anonymizer.anonymize_list(texts=texts, recognizer_results_list=analyzer_results, **kwargs)
+        anonymized_texts = anonymizer.anonymize_list(
+            texts=texts, recognizer_results_list=analyzer_results, **kwargs
+        )
         return anonymized_texts
 
     else:
-        raise ValueError("Invalid input type. Expected Dict[str, str] or List[str].")
+        raise ValueError("Invalid input type. Expected Dict[str, str] or List[str].")

src/datafog/pii_tools/PresidioEngine/analyzer.py

+1 -1
@@ -123,4 +123,4 @@ def __check_label(
     ) -> bool:
         return any(
             [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
-        )
+        )

src/datafog/pii_tools/PresidioEngine/process_csv_file.py

+9 -6
@@ -1,6 +1,6 @@
 import csv
 import pprint
-from typing import List, Iterable, Optional
+from typing import Iterable, List, Optional
 
 from presidio_analyzer import BatchAnalyzerEngine, DictAnalyzerResult
 from presidio_anonymizer import BatchAnonymizerEngine
@@ -32,20 +32,23 @@ def analyze_csv(
         **kwargs,
     ) -> Iterable[DictAnalyzerResult]:
 
-        with open(csv_full_path, 'r') as csv_file:
+        with open(csv_full_path, "r") as csv_file:
             csv_list = list(csv.reader(csv_file))
-            csv_dict = {header: list(map(str, values)) for header, *values in zip(*csv_list)}
+            csv_dict = {
+                header: list(map(str, values)) for header, *values in zip(*csv_list)
+            }
         analyzer_results = self.analyze_dict(csv_dict, language, keys_to_skip)
         return list(analyzer_results)
 
 
 if __name__ == "__main__":
 
     analyzer = CSVAnalyzer()
-    analyzer_results = analyzer.analyze_csv('./csv_sample_data/sample_data.csv',
-                                            language="en")
+    analyzer_results = analyzer.analyze_csv(
+        "./csv_sample_data/sample_data.csv", language="en"
+    )
     pprint.pprint(analyzer_results)
 
     anonymizer = BatchAnonymizerEngine()
     anonymized_results = anonymizer.anonymize_dict(analyzer_results)
-    pprint.pprint(anonymized_results)
+    pprint.pprint(anonymized_results)
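
The reformatted dict comprehension in analyze_csv is doing a transpose: zip(*csv_list) turns the row-oriented output of csv.reader into columns, and each column's header becomes a key whose value is the list of cell strings. A small, self-contained illustration with made-up data (the real script reads ./csv_sample_data/sample_data.csv):

import csv
import io

# Hypothetical CSV contents standing in for the sample file.
sample = "name,city\nJohn Smith,Springfield\nJane Doe,Shelbyville\n"

# Same two steps as analyze_csv: read the rows, then transpose into columns.
csv_list = list(csv.reader(io.StringIO(sample)))
csv_dict = {
    header: list(map(str, values)) for header, *values in zip(*csv_list)
}

print(csv_dict)
# {'name': ['John Smith', 'Jane Doe'], 'city': ['Springfield', 'Shelbyville']}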

tests/common/assertions.py

+1 -1
@@ -41,4 +41,4 @@ def _ordered(obj):
     if isinstance(obj, list):
         return sorted(_ordered(x) for x in obj)
     else:
-        return obj
+        return obj

tests/test_redact.py

+23 -4
@@ -1,41 +1,55 @@
+import json
 import logging
+from datetime import datetime
+
 import pytest
+
 from datafog import DataFog, __version__
-from datetime import datetime
-import json
 
 # Generate a timestamp
 current_time = datetime.now().strftime("%Y%m%d%H%M%S")
 
 # Configure logging to write to a file with the current datetime in its name
 log_filename = f"datafog_debug{current_time}.txt"
-logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(
+    filename=log_filename,
+    level=logging.DEBUG,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
 
 # Fixture for initializing DataFog
 @pytest.fixture
 def datafog():
     return DataFog()
+
+
 # Test data fixtures
 @pytest.fixture
 def text_file_url():
     return "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
 
+
 @pytest.fixture
 def csv_file_url():
     return "https://gist.githubusercontent.com/sidmohan0/4e9a86bb779bcb066bb340b9b60e078a/raw/586fd18490918445918770034e39980f20ce9180/fake_csv.csv"
 
+
 @pytest.fixture
 def text_string():
     return "John Smith drivers license is AC432223"
 
+
 @pytest.fixture
 def json_string():
     return '{"text": "John Smith drivers license is AC432223", "language": "en"}'
 
+
 # Test cases
 def test_version(datafog):
    assert datafog.version == __version__
 
+
 def test_datafog_init(datafog):
     logging.info("Testing DataFog initialization")
     assert datafog.language == "en"
@@ -46,32 +60,37 @@ def test_datafog_init(datafog):
     assert datafog.batch_anonymizer is not None
     assert datafog.entities is not None
 
+
 def test_batch_redact_string_input(datafog, text_string):
     redacted_results = datafog(text_string, "redact")
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "Junk Fee Prevention Act" not in redacted_results[0]
 
+
 def test_batch_redact_text_file(datafog, text_file_url):
     scan_results = datafog(text_file_url, "scan")
     redacted_results = datafog(text_file_url, "redact")
     assert len(redacted_results) == 1
     assert "Chuck Schumer" not in redacted_results[0]
     assert "AC432223" not in redacted_results[0]
 
+
 def test_batch_redact_csv_file(datafog, csv_file_url):
     redacted_results = datafog(csv_file_url, "redact")
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "Widget C" not in redacted_results[0]
 
+
 def test_batch_redact_json_file(datafog, json_string):
     redacted_results = datafog(json_string, "redact")
-
+
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "AC432223" not in redacted_results[0]
 
+
 def test_datafog_call_unsupported_input(datafog):
     with pytest.raises(ValueError):
         datafog(123, "redact")
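
The commit message says all tests passed; a minimal sketch of reproducing that locally, assuming pytest and the project's dependencies are installed (the exact invocation isn't recorded in this commit, and the text/CSV tests download fixture gists, so network access is needed):

import pytest

# Run only the redaction tests with verbose output; equivalent to invoking
# `pytest tests/test_redact.py -v` from the repository root.
raise SystemExit(pytest.main(["tests/test_redact.py", "-v"]))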
