DataFog
diff --git a/‎README.md
Lines changed: 7 additions & 1 deletion b/‎README.md
Lines changed: 7 additions & 1 deletion
diff --git a/‎datafog_debug.log
Lines changed: 48 additions & 0 deletions b/‎datafog_debug.log
Lines changed: 48 additions & 0 deletions
diff --git a/‎poetry.lock
Lines changed: 10 additions & 27 deletions b/‎poetry.lock
Lines changed: 10 additions & 27 deletions
diff --git a/‎pyproject.toml
Lines changed: 1 addition & 1 deletion b/‎pyproject.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/datafog/__init__.py
Lines changed: 69 additions & 39 deletions b/‎src/datafog/__init__.py
Lines changed: 69 additions & 39 deletions
@@ -24,14 +24,20 @@
 
 ## Overview
 
-DataFog works by scanning and redacting-out PII in files **before** are uploaded to a RAG system.
+DataFog works by scanning for and redacting PII in files **before** they are uploaded to a database used for retrieval/RAG systems.
 
 ## How it works
 
 <img src="https://www.datafog.ai/hero.png" alt="DataFog Overview">
 
 ## Installation
 
+Before getting started, please install [en_spacy_pii_fast](https://huggingface.co/beki/en_spacy_pii_fast) by running the following command:
+```
+pip install https://huggingface.co/beki/en_spacy_pii_fast/resolve/main/en_spacy_pii_fast-any-py3-none-any.whl
+
+```
+
 DataFog can be installed via pip:
 
 ```bash
 
@@ -0,0 +1,48 @@
+2024-03-06 14:14:54,095 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:16:48,686 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:19:38,417 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:21:36,491 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:24:44,678 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:26:21,271 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:28:46,820 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:33:42,199 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:34:06,118 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:36:25,548 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:43:53,113 - datafog - ERROR - Unsupported input source type
+2024-03-06 14:54:10,767 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:07:53,084 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:08:36,485 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:08:38,519 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:09:59,669 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:10:11,058 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:10:12,753 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:11:30,217 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:11:42,658 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:11:44,620 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:12:02,139 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:40,772 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:44,166 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:45,637 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:46,465 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:55,288 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:48:56,986 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:49:00,061 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:49:01,164 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:49:01,990 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:53:06,899 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:53:08,580 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:53:09,378 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:53:17,086 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:54:58,499 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:55:01,748 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:55:12,662 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:56:34,054 - datafog - ERROR - Unsupported input source type
+2024-03-06 15:58:53,844 - datafog - ERROR - Unsupported input source type
+2024-03-06 16:00:41,271 - datafog - ERROR - Unsupported input source type
+2024-03-06 16:22:50,353 - datafog - ERROR - Unsupported input source type
+2024-03-06 16:27:07,004 - datafog - ERROR - Unsupported input source type
+2024-03-06 16:28:58,212 - datafog - ERROR - Unsupported input source type
+2024-03-06 16:40:53,775 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:02:19,425 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:04:29,470 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:04:39,082 - datafog - ERROR - Unsupported input source type
@@ -14,9 +14,9 @@ numpy = "^1.26.4"
 pytest = "^8.0.1"
 requests-mock = "^1.11.0"
 spacy = "3.4.4"
-en-spacy-pii-fast = {url = "https://huggingface.co/beki/en_spacy_pii_fast/resolve/main/en_spacy_pii_fast-any-py3-none-any.whl"}
 pandas = "^2.2.1"
 
+
 [build-system]
 requires = ["poetry-core"]
 build-backend = "poetry.core.masonry.api"
 
@@ -1,66 +1,96 @@
 # datafog-python/src/datafog/__init__.py
 import json
-
-import pandas as pd
 import requests
-import spacy
+import logging
+from .pii_tools.PresidioEngine import presidio_batch_init, batch_scan, batch_redact
 
 from .__about__ import __version__
-from .pii_tools import PresidioEngine
 
 __all__ = [
     "__version__",
     "PresidioEngine",
 ]
+# Create file handler which logs even debug messages
+
+# Configure basic settings for logging to console
+logging.basicConfig(level=logging.DEBUG,
+                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
+                    datefmt='%Y-%m-%d %H:%M:%S')
+
+# Create and configure logger
+logger = logging.getLogger(__name__)
+
+# Create file handler which logs debug messages
+fh = logging.FileHandler('datafog_debug.log')
+fh.setLevel(logging.DEBUG)
+fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
 
+# Add the file handler to the logger
+logger.addHandler(fh)
+
+# Now you can use logger to log messages
+logger.debug("This is a debug message")
 
 class DataFog:
+    """DataFog class for scanning and redacting PII from text data"""
+
     def __init__(self):
-        self.nlp = spacy.load("en_spacy_pii_fast")
+        self.version = __version__
+        logger.debug(f"DataFog version: {self.version}")
+        logger.debug(f"Initializing Presidio Engine")
+        self.language = "en"
+        logger.debug(f"Language: {self.language}")
+        self.threshold = 0.5
+        logger.debug(f"Threshold: {self.threshold}")
+        self.redaction_string = "REDACTED"
+        logger.debug(f"Redaction string: {self.redaction_string}")
+        self.analyzer, self.batch_analyzer, self.batch_anonymizer = presidio_batch_init()
+        logger.debug(f"Analyzer: {self.analyzer}")
+        logger.debug(f"Batch Analyzer: {self.batch_analyzer}")
+        logger.debug(f"Batch Anonymizer: {self.batch_anonymizer}")
 
-    @staticmethod
-    def client():
-        return DataFog()
+        self.entities = self.analyzer.get_supported_entities()
+    
+        logger.debug(f"Supported entities: {self.entities}")
 
+        
     def __call__(self, input_source, privacy_operation):
+        """Scan or redact PII from input source"""
+
+        # Read the input source
         if isinstance(input_source, str):
             if input_source.startswith(("http://", "https://")):
-                print("Downloading file from URL")
-                response = requests.get(input_source)
-                text = response.text
-            elif input_source.endswith((".csv", ".txt")):
-                print("Reading  CSV/TXT from local path")
-                with open(input_source, "r") as file:
-                    text = file.read()
+                logger.debug("Downloading file from URL")
+                text = requests.get(input_source).text.splitlines()
+            elif input_source.endswith(".csv"):
+                logger.debug("Reading CSV from local path")
+                text = open(input_source).read().splitlines()
+            elif input_source.endswith(".txt"):
+                logger.debug("Reading TXT from local path")
+                text = open(input_source).read().splitlines()
             elif input_source.endswith(".json"):
-                print("Reading JSON from local path")
-                with open(input_source, "r") as file:
-                    data = json.load(file)
-                    text = json.dumps(data)
-            elif input_source.endswith(".parquet"):
-                print("Reading Parquet from local path")
-                df = pd.read_parquet(input_source)
-                text = df.to_csv(index=False)
+                logger.debug("Reading JSON from local path")
+                text = json.load(open(input_source))
             else:
-                text = input_source
+                logger.debug("Reading text from string")
+                text = [input_source]
         else:
+            logger.error("Unsupported input source type")
             raise ValueError("Unsupported input source type")
 
-        doc = self.nlp(text)
+        text = [str(t) for t in text]
+
+        # Process the input based on privacy operation
+        if privacy_operation not in ["scan", "redact"]:
+            raise ValueError("Unsupported privacy operation")
 
-        # Chunk the text and perform privacy operation
-        for ent in doc.ents:
-            if ent.label_ in ["PERSON", "ORG", "GPE", "PHONE", "EMAIL", "URL"]:
-                # Perform privacy operation based on the entity type
-                if privacy_operation == "redact":
-                    text = text.replace(ent.text, "[REDACTED]")
-                elif privacy_operation == "annotate":
-                    text = text.replace(ent.text, f"[{ent.label_}]")
+        text_dict = {"text": text}
 
-                else:
-                    raise ValueError(
-                        f"Unsupported privacy operation: {privacy_operation}"
-                    )
-                # Add more privacy operations as needed
+        if privacy_operation == "scan":
+            print("Scanning for PII")
+            results = batch_scan(text_dict, self.batch_analyzer)
+        elif privacy_operation == "redact":
+            print("Redacting PII")
+            results = batch_redact(text_dict, batch_scan(text_dict, self.batch_analyzer), self.batch_anonymizer)
 
-        return text
+        return [str(result) for result in results]