|
1 | 1 | # datafog-python/src/datafog/__init__.py
|
2 | 2 | import json
|
3 |
| - |
4 |
| -import pandas as pd |
5 | 3 | import requests
|
6 |
| -import spacy |
| 4 | +import logging |
| 5 | +from .pii_tools.PresidioEngine import presidio_batch_init, batch_scan, batch_redact |
7 | 6 |
|
8 | 7 | from .__about__ import __version__
|
9 |
| -from .pii_tools import PresidioEngine |
10 | 8 |
|
# Public names exported by `from datafog import *`.
__all__ = [
    "__version__",
    # NOTE(review): the diff removes `from .pii_tools import PresidioEngine`
    # in favor of importing its functions directly, so this exported name no
    # longer resolves in this module — confirm whether it should be dropped
    # or the module import restored.
    "PresidioEngine",
]
|
# Shared record layout for both the console and file handlers.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

# Console logging: configure the root handler to show everything (DEBUG+).
logging.basicConfig(
    level=logging.DEBUG,
    format=_LOG_FORMAT,
    datefmt="%Y-%m-%d %H:%M:%S",
)

# Module-level logger used throughout this package.
logger = logging.getLogger(__name__)

# Mirror debug-level records into a local log file as well.
fh = logging.FileHandler("datafog_debug.log")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter(_LOG_FORMAT))
logger.addHandler(fh)

# Smoke-test message confirming the handlers are wired up.
logger.debug("This is a debug message")
16 | 33 |
|
class DataFog:
    """Scan and redact PII from text data using the Presidio engine.

    Instances are callable: ``DataFog()(source, "scan")`` or
    ``DataFog()(source, "redact")``.
    """

    def __init__(self):
        """Initialize default settings and the Presidio analyzer/anonymizer engines."""
        self.version = __version__
        # Lazy %-style args so no string is built unless DEBUG is enabled.
        logger.debug("DataFog version: %s", self.version)
        logger.debug("Initializing Presidio Engine")
        # Default processing settings.
        self.language = "en"
        logger.debug("Language: %s", self.language)
        self.threshold = 0.5
        logger.debug("Threshold: %s", self.threshold)
        self.redaction_string = "REDACTED"
        logger.debug("Redaction string: %s", self.redaction_string)
        # presidio_batch_init() yields (analyzer, batch_analyzer, batch_anonymizer).
        self.analyzer, self.batch_analyzer, self.batch_anonymizer = presidio_batch_init()
        logger.debug("Analyzer: %s", self.analyzer)
        logger.debug("Batch Analyzer: %s", self.batch_analyzer)
        logger.debug("Batch Anonymizer: %s", self.batch_anonymizer)

        self.entities = self.analyzer.get_supported_entities()
        logger.debug("Supported entities: %s", self.entities)

    def _read_input(self, input_source):
        """Resolve *input_source* (URL, local file path, or raw text) to an iterable of lines."""
        if input_source.startswith(("http://", "https://")):
            logger.debug("Downloading file from URL")
            return requests.get(input_source).text.splitlines()
        if input_source.endswith((".csv", ".txt")):
            # The .csv and .txt branches were identical; merged. Context manager
            # ensures the file handle is closed (the originals leaked it).
            logger.debug("Reading CSV/TXT from local path")
            with open(input_source, "r") as file:
                return file.read().splitlines()
        if input_source.endswith(".json"):
            logger.debug("Reading JSON from local path")
            with open(input_source, "r") as file:
                data = json.load(file)
            # NOTE(review): if the JSON document is an object, the caller's
            # per-item str() coercion yields only its keys — confirm the
            # expected JSON shape is a list of records.
            return data
        # Anything else is treated as a literal text payload.
        logger.debug("Reading text from string")
        return [input_source]

    def __call__(self, input_source, privacy_operation):
        """Scan or redact PII found in *input_source*.

        Args:
            input_source: A URL, a path to a local ``.csv``/``.txt``/``.json``
                file, or a raw text string.
            privacy_operation: Either ``"scan"`` or ``"redact"``.

        Returns:
            list[str]: Stringified per-item results from the batch engine.

        Raises:
            ValueError: If *input_source* is not a string, or
                *privacy_operation* is not ``"scan"`` or ``"redact"``.
        """
        # Guard clause replaces the original nested if/else.
        if not isinstance(input_source, str):
            logger.error("Unsupported input source type")
            raise ValueError("Unsupported input source type")

        # Coerce every item to str so the batch analyzer gets uniform input.
        text = [str(t) for t in self._read_input(input_source)]

        if privacy_operation not in ["scan", "redact"]:
            raise ValueError("Unsupported privacy operation")

        text_dict = {"text": text}

        # Originals used print() here; switched to logger for consistency
        # with the rest of the module.
        if privacy_operation == "scan":
            logger.debug("Scanning for PII")
            results = batch_scan(text_dict, self.batch_analyzer)
        else:  # "redact" — validated above
            logger.debug("Redacting PII")
            results = batch_redact(
                text_dict,
                batch_scan(text_dict, self.batch_analyzer),
                self.batch_anonymizer,
            )

        return [str(result) for result in results]
0 commit comments