Skip to content

Commit 314578c

Browse files
Sid MohanSid Mohan
authored andcommitted
unit tests passed
1 parent 45eadda commit 314578c

File tree

14 files changed

+462
-272
lines changed

14 files changed

+462
-272
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,20 @@
2424

2525
## Overview
2626

27-
DataFog works by scanning and redacting-out PII in files **before** are uploaded to a RAG system.
27+
DataFog works by scanning and redacting-out PII in files **before** they are uploaded to a database used for Retrieval/RAG systems.
2828

2929
## How it works
3030

3131
<img src="https://www.datafog.ai/hero.png" alt="DataFog Overview">
3232

3333
## Installation
3434

35+
Before getting started, please install [en_spacy_pii_fast](https://huggingface.co/beki/en_spacy_pii_fast) by running the following command:
36+
```
37+
pip install https://huggingface.co/beki/en_spacy_pii_fast/resolve/main/en_spacy_pii_fast-any-py3-none-any.whl
38+
39+
```
40+
3541
DataFog can be installed via pip:
3642

3743
```bash

datafog_debug.log

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
2024-03-06 14:14:54,095 - datafog - ERROR - Unsupported input source type
2+
2024-03-06 14:16:48,686 - datafog - ERROR - Unsupported input source type
3+
2024-03-06 14:19:38,417 - datafog - ERROR - Unsupported input source type
4+
2024-03-06 14:21:36,491 - datafog - ERROR - Unsupported input source type
5+
2024-03-06 14:24:44,678 - datafog - ERROR - Unsupported input source type
6+
2024-03-06 14:26:21,271 - datafog - ERROR - Unsupported input source type
7+
2024-03-06 14:28:46,820 - datafog - ERROR - Unsupported input source type
8+
2024-03-06 14:33:42,199 - datafog - ERROR - Unsupported input source type
9+
2024-03-06 14:34:06,118 - datafog - ERROR - Unsupported input source type
10+
2024-03-06 14:36:25,548 - datafog - ERROR - Unsupported input source type
11+
2024-03-06 14:43:53,113 - datafog - ERROR - Unsupported input source type
12+
2024-03-06 14:54:10,767 - datafog - ERROR - Unsupported input source type
13+
2024-03-06 15:07:53,084 - datafog - ERROR - Unsupported input source type
14+
2024-03-06 15:08:36,485 - datafog - ERROR - Unsupported input source type
15+
2024-03-06 15:08:38,519 - datafog - ERROR - Unsupported input source type
16+
2024-03-06 15:09:59,669 - datafog - ERROR - Unsupported input source type
17+
2024-03-06 15:10:11,058 - datafog - ERROR - Unsupported input source type
18+
2024-03-06 15:10:12,753 - datafog - ERROR - Unsupported input source type
19+
2024-03-06 15:11:30,217 - datafog - ERROR - Unsupported input source type
20+
2024-03-06 15:11:42,658 - datafog - ERROR - Unsupported input source type
21+
2024-03-06 15:11:44,620 - datafog - ERROR - Unsupported input source type
22+
2024-03-06 15:12:02,139 - datafog - ERROR - Unsupported input source type
23+
2024-03-06 15:48:40,772 - datafog - ERROR - Unsupported input source type
24+
2024-03-06 15:48:44,166 - datafog - ERROR - Unsupported input source type
25+
2024-03-06 15:48:45,637 - datafog - ERROR - Unsupported input source type
26+
2024-03-06 15:48:46,465 - datafog - ERROR - Unsupported input source type
27+
2024-03-06 15:48:55,288 - datafog - ERROR - Unsupported input source type
28+
2024-03-06 15:48:56,986 - datafog - ERROR - Unsupported input source type
29+
2024-03-06 15:49:00,061 - datafog - ERROR - Unsupported input source type
30+
2024-03-06 15:49:01,164 - datafog - ERROR - Unsupported input source type
31+
2024-03-06 15:49:01,990 - datafog - ERROR - Unsupported input source type
32+
2024-03-06 15:53:06,899 - datafog - ERROR - Unsupported input source type
33+
2024-03-06 15:53:08,580 - datafog - ERROR - Unsupported input source type
34+
2024-03-06 15:53:09,378 - datafog - ERROR - Unsupported input source type
35+
2024-03-06 15:53:17,086 - datafog - ERROR - Unsupported input source type
36+
2024-03-06 15:54:58,499 - datafog - ERROR - Unsupported input source type
37+
2024-03-06 15:55:01,748 - datafog - ERROR - Unsupported input source type
38+
2024-03-06 15:55:12,662 - datafog - ERROR - Unsupported input source type
39+
2024-03-06 15:56:34,054 - datafog - ERROR - Unsupported input source type
40+
2024-03-06 15:58:53,844 - datafog - ERROR - Unsupported input source type
41+
2024-03-06 16:00:41,271 - datafog - ERROR - Unsupported input source type
42+
2024-03-06 16:22:50,353 - datafog - ERROR - Unsupported input source type
43+
2024-03-06 16:27:07,004 - datafog - ERROR - Unsupported input source type
44+
2024-03-06 16:28:58,212 - datafog - ERROR - Unsupported input source type
45+
2024-03-06 16:40:53,775 - datafog - ERROR - Unsupported input source type
46+
2024-03-06 17:02:19,425 - datafog - ERROR - Unsupported input source type
47+
2024-03-06 17:04:29,470 - datafog - ERROR - Unsupported input source type
48+
2024-03-06 17:04:39,082 - datafog - ERROR - Unsupported input source type

poetry.lock

Lines changed: 10 additions & 27 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ numpy = "^1.26.4"
1414
pytest = "^8.0.1"
1515
requests-mock = "^1.11.0"
1616
spacy = "3.4.4"
17-
en-spacy-pii-fast = {url = "https://huggingface.co/beki/en_spacy_pii_fast/resolve/main/en_spacy_pii_fast-any-py3-none-any.whl"}
1817
pandas = "^2.2.1"
1918

19+
2020
[build-system]
2121
requires = ["poetry-core"]
2222
build-backend = "poetry.core.masonry.api"

src/datafog/__init__.py

Lines changed: 69 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1,66 +1,96 @@
11
# datafog-python/src/datafog/__init__.py
22
import json
3-
4-
import pandas as pd
53
import requests
6-
import spacy
4+
import logging
5+
from .pii_tools.PresidioEngine import presidio_batch_init, batch_scan, batch_redact
76

87
from .__about__ import __version__
9-
from .pii_tools import PresidioEngine
108

119
__all__ = [
1210
"__version__",
1311
"PresidioEngine",
1412
]
13+
# Create file handler which logs even debug messages
14+
15+
# Configure basic settings for logging to console
16+
logging.basicConfig(level=logging.DEBUG,
17+
format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
18+
datefmt='%Y-%m-%d %H:%M:%S')
19+
20+
# Create and configure logger
21+
logger = logging.getLogger(__name__)
22+
23+
# Create file handler which logs debug messages
24+
fh = logging.FileHandler('datafog_debug.log')
25+
fh.setLevel(logging.DEBUG)
26+
fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
1527

28+
# Add the file handler to the logger
29+
logger.addHandler(fh)
30+
31+
# Now you can use logger to log messages
32+
logger.debug("This is a debug message")
1633

1734
class DataFog:
35+
"""DataFog class for scanning and redacting PII from text data"""
36+
1837
def __init__(self):
19-
self.nlp = spacy.load("en_spacy_pii_fast")
38+
self.version = __version__
39+
logger.debug(f"DataFog version: {self.version}")
40+
logger.debug(f"Initializing Presidio Engine")
41+
self.language = "en"
42+
logger.debug(f"Language: {self.language}")
43+
self.threshold = 0.5
44+
logger.debug(f"Threshold: {self.threshold}")
45+
self.redaction_string = "REDACTED"
46+
logger.debug(f"Redaction string: {self.redaction_string}")
47+
self.analyzer, self.batch_analyzer, self.batch_anonymizer = presidio_batch_init()
48+
logger.debug(f"Analyzer: {self.analyzer}")
49+
logger.debug(f"Batch Analyzer: {self.batch_analyzer}")
50+
logger.debug(f"Batch Anonymizer: {self.batch_anonymizer}")
2051

21-
@staticmethod
22-
def client():
23-
return DataFog()
52+
self.entities = self.analyzer.get_supported_entities()
53+
54+
logger.debug(f"Supported entities: {self.entities}")
2455

56+
2557
def __call__(self, input_source, privacy_operation):
58+
"""Scan or redact PII from input source"""
59+
60+
# Read the input source
2661
if isinstance(input_source, str):
2762
if input_source.startswith(("http://", "https://")):
28-
print("Downloading file from URL")
29-
response = requests.get(input_source)
30-
text = response.text
31-
elif input_source.endswith((".csv", ".txt")):
32-
print("Reading CSV/TXT from local path")
33-
with open(input_source, "r") as file:
34-
text = file.read()
63+
logger.debug("Downloading file from URL")
64+
text = requests.get(input_source).text.splitlines()
65+
elif input_source.endswith(".csv"):
66+
logger.debug("Reading CSV from local path")
67+
text = open(input_source).read().splitlines()
68+
elif input_source.endswith(".txt"):
69+
logger.debug("Reading TXT from local path")
70+
text = open(input_source).read().splitlines()
3571
elif input_source.endswith(".json"):
36-
print("Reading JSON from local path")
37-
with open(input_source, "r") as file:
38-
data = json.load(file)
39-
text = json.dumps(data)
40-
elif input_source.endswith(".parquet"):
41-
print("Reading Parquet from local path")
42-
df = pd.read_parquet(input_source)
43-
text = df.to_csv(index=False)
72+
logger.debug("Reading JSON from local path")
73+
text = json.load(open(input_source))
4474
else:
45-
text = input_source
75+
logger.debug("Reading text from string")
76+
text = [input_source]
4677
else:
78+
logger.error("Unsupported input source type")
4779
raise ValueError("Unsupported input source type")
4880

49-
doc = self.nlp(text)
81+
text = [str(t) for t in text]
82+
83+
# Process the input based on privacy operation
84+
if privacy_operation not in ["scan", "redact"]:
85+
raise ValueError("Unsupported privacy operation")
5086

51-
# Chunk the text and perform privacy operation
52-
for ent in doc.ents:
53-
if ent.label_ in ["PERSON", "ORG", "GPE", "PHONE", "EMAIL", "URL"]:
54-
# Perform privacy operation based on the entity type
55-
if privacy_operation == "redact":
56-
text = text.replace(ent.text, "[REDACTED]")
57-
elif privacy_operation == "annotate":
58-
text = text.replace(ent.text, f"[{ent.label_}]")
87+
text_dict = {"text": text}
5988

60-
else:
61-
raise ValueError(
62-
f"Unsupported privacy operation: {privacy_operation}"
63-
)
64-
# Add more privacy operations as needed
89+
if privacy_operation == "scan":
90+
print("Scanning for PII")
91+
results = batch_scan(text_dict, self.batch_analyzer)
92+
elif privacy_operation == "redact":
93+
print("Redacting PII")
94+
results = batch_redact(text_dict, batch_scan(text_dict, self.batch_analyzer), self.batch_anonymizer)
6595

66-
return text
96+
return [str(result) for result in results]

0 commit comments

Comments
 (0)