Commit 8ff5c5b

Sid Mohan authored and committed

all tests passed
1 parent 314578c commit 8ff5c5b

8 files changed: +167 -56 lines changed

datafog_debug.log

+4
@@ -46,3 +46,7 @@
 2024-03-06 17:02:19,425 - datafog - ERROR - Unsupported input source type
 2024-03-06 17:04:29,470 - datafog - ERROR - Unsupported input source type
 2024-03-06 17:04:39,082 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:06:17,074 - src.datafog - DEBUG - This is a debug message
+2024-03-06 17:06:29,333 - src.datafog - DEBUG - This is a debug message
+2024-03-06 17:08:26,748 - datafog - ERROR - Unsupported input source type
+2024-03-06 17:08:35,703 - datafog - ERROR - Unsupported input source type

src/datafog/__init__.py

+22 -11
@@ -1,10 +1,11 @@
 # datafog-python/src/datafog/__init__.py
 import json
-import requests
 import logging
-from .pii_tools.PresidioEngine import presidio_batch_init, batch_scan, batch_redact
+
+import requests
 
 from .__about__ import __version__
+from .pii_tools.PresidioEngine import batch_redact, batch_scan, presidio_batch_init
 
 __all__ = [
     "__version__",
@@ -13,24 +14,29 @@
 # Create file handler which logs even debug messages
 
 # Configure basic settings for logging to console
-logging.basicConfig(level=logging.DEBUG,
-                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-                    datefmt='%Y-%m-%d %H:%M:%S')
+logging.basicConfig(
+    level=logging.DEBUG,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    datefmt="%Y-%m-%d %H:%M:%S",
+)
 
 # Create and configure logger
 logger = logging.getLogger(__name__)
 
 # Create file handler which logs debug messages
-fh = logging.FileHandler('datafog_debug.log')
+fh = logging.FileHandler("datafog_debug.log")
 fh.setLevel(logging.DEBUG)
-fh.setFormatter(logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
+fh.setFormatter(
+    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+)
 
 # Add the file handler to the logger
 logger.addHandler(fh)
 
 # Now you can use logger to log messages
 logger.debug("This is a debug message")
 
+
 class DataFog:
     """DataFog class for scanning and redacting PII from text data"""
 
@@ -44,16 +50,17 @@ def __init__(self):
         logger.debug(f"Threshold: {self.threshold}")
         self.redaction_string = "REDACTED"
         logger.debug(f"Redaction string: {self.redaction_string}")
-        self.analyzer, self.batch_analyzer, self.batch_anonymizer = presidio_batch_init()
+        self.analyzer, self.batch_analyzer, self.batch_anonymizer = (
+            presidio_batch_init()
+        )
         logger.debug(f"Analyzer: {self.analyzer}")
         logger.debug(f"Batch Analyzer: {self.batch_analyzer}")
         logger.debug(f"Batch Anonymizer: {self.batch_anonymizer}")
 
         self.entities = self.analyzer.get_supported_entities()
-
+
         logger.debug(f"Supported entities: {self.entities}")
 
-
     def __call__(self, input_source, privacy_operation):
         """Scan or redact PII from input source"""
 
@@ -91,6 +98,10 @@ def __call__(self, input_source, privacy_operation):
             results = batch_scan(text_dict, self.batch_analyzer)
         elif privacy_operation == "redact":
             print("Redacting PII")
-            results = batch_redact(text_dict, batch_scan(text_dict, self.batch_analyzer), self.batch_anonymizer)
+            results = batch_redact(
+                text_dict,
+                batch_scan(text_dict, self.batch_analyzer),
+                self.batch_anonymizer,
+            )
 
         return [str(result) for result in results]
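
As context for the reformatting above: DataFog.__call__ keeps the same (input_source, privacy_operation) interface that the tests later in this commit exercise. A minimal usage sketch, assuming the package is importable and that a plain text string is an accepted input source (the sample string comes from the test fixtures):

from datafog import DataFog

# Instantiate the client; __init__ wires up the analyzer, batch analyzer,
# and batch anonymizer via presidio_batch_init(), as shown in the diff above.
datafog = DataFog()

text = "John Smith drivers license is AC432223"

# "scan" returns serialized recognizer results; "redact" returns the
# batch-anonymized output with detected PII replaced.
scan_results = datafog(text, "scan")
redacted_results = datafog(text, "redact")
print(redacted_results)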

src/datafog/pii_tools/PresidioEngine/__init__.py

@@ -1,56 +1,104 @@
 import json
-from typing import Dict, List, Iterable, Union, Any
+from typing import Any, Dict, Iterable, List, Union
+
 from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerResult
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 
+
 def presidio_batch_init():
     analyzer = AnalyzerEngine()
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
     batch_anonymizer = BatchAnonymizerEngine(anonymizer_engine=AnonymizerEngine())
     return analyzer, batch_analyzer, batch_anonymizer
 
+
 def batch_scan(text: Dict[str, str], batch_analyzer: BatchAnalyzerEngine) -> List[str]:
     analyzer_results = batch_analyzer.analyze_dict(text, language="en")
-    return [json.dumps({
-        "key": result.key,
-        "value": result.value,
-        "recognizer_results": serialize_recognizer_results(result.recognizer_results)
-    }) for result in analyzer_results]
+    return [
+        json.dumps(
+            {
+                "key": result.key,
+                "value": result.value,
+                "recognizer_results": serialize_recognizer_results(
+                    result.recognizer_results
+                ),
+            }
+        )
+        for result in analyzer_results
+    ]
+
+
+from typing import Dict, Iterator, List, Optional, Union
 
-from typing import Union, List, Dict, Optional, Iterator
 from presidio_analyzer import RecognizerResult
 
-def serialize_recognizer_results(recognizer_results: Union[List[RecognizerResult], List[List[RecognizerResult]], Iterator[RecognizerResult]]) -> Optional[Union[List[Dict[str, Union[str, int, float, None]]], List[List[Dict[str, Union[str, int, float, None]]]]]]:
+
+def serialize_recognizer_results(
+    recognizer_results: Union[
+        List[RecognizerResult], List[List[RecognizerResult]], Iterator[RecognizerResult]
+    ]
+) -> Optional[
+    Union[
+        List[Dict[str, Union[str, int, float, None]]],
+        List[List[Dict[str, Union[str, int, float, None]]]],
+    ]
+]:
     if isinstance(recognizer_results, list):
         if recognizer_results and isinstance(recognizer_results[0], RecognizerResult):
-            return [{
-                "entity_type": r.entity_type,
-                "start": r.start,
-                "end": r.end,
-                "score": r.score,
-                "analysis_explanation": r.analysis_explanation
-            } for r in recognizer_results]
+            return [
+                {
+                    "entity_type": r.entity_type,
+                    "start": r.start,
+                    "end": r.end,
+                    "score": r.score,
+                    "analysis_explanation": r.analysis_explanation,
+                }
+                for r in recognizer_results
+            ]
         elif recognizer_results and isinstance(recognizer_results[0], list):
             return [serialize_recognizer_results(rr) for rr in recognizer_results]
     elif isinstance(recognizer_results, Iterator):
         return [serialize_recognizer_results(rr) for rr in recognizer_results]
     else:
         return None
-
-from typing import Union, List, Dict, Iterable, Any
+
+
+from typing import Any, Dict, Iterable, List, Union
+
 from presidio_anonymizer import AnonymizerEngine, BatchAnonymizerEngine
 from presidio_anonymizer.entities import DictRecognizerResult, RecognizerResult
 
-def batch_redact(input_data: Union[Dict[str, str], List[str]], results: List[str], anonymizer: BatchAnonymizerEngine, **kwargs) -> Union[List[str], Dict[str, str]]:
+
+def batch_redact(
+    input_data: Union[Dict[str, str], List[str]],
+    results: List[str],
+    anonymizer: BatchAnonymizerEngine,
+    **kwargs
+) -> Union[List[str], Dict[str, str]]:
     if isinstance(input_data, dict):
         # Input is a dictionary, perform anonymize_dict
-        analyzer_results = [DictRecognizerResult(key=result_dict["key"], value=result_dict["value"], recognizer_results=[
-            RecognizerResult(entity_type=r['entity_type'], start=r['start'], end=r['end'], score=r['score'])
-            for recognizer_result in result_dict['recognizer_results'] if recognizer_result
-            for r in recognizer_result
-        ]) for result_dict in [json.loads(result) for result in results]]
+        analyzer_results = [
+            DictRecognizerResult(
+                key=result_dict["key"],
+                value=result_dict["value"],
+                recognizer_results=[
+                    RecognizerResult(
+                        entity_type=r["entity_type"],
+                        start=r["start"],
+                        end=r["end"],
+                        score=r["score"],
+                    )
+                    for recognizer_result in result_dict["recognizer_results"]
+                    if recognizer_result
+                    for r in recognizer_result
+                ],
+            )
+            for result_dict in [json.loads(result) for result in results]
+        ]
 
-        anonymized_data = anonymizer.anonymize_dict(analyzer_results=analyzer_results, **kwargs)
+        anonymized_data = anonymizer.anonymize_dict(
+            analyzer_results=analyzer_results, **kwargs
+        )
         return anonymized_data
 
     elif isinstance(input_data, list):
@@ -60,14 +108,22 @@ def batch_redact(input_data: Union[Dict[str, str], List[str]], results: List[str
         for result in results:
             result_dict = json.loads(result)
             recognizer_results = [
-                RecognizerResult(entity_type=r['entity_type'], start=r['start'], end=r['end'], score=r['score'])
-                for recognizer_result in result_dict['recognizer_results'] if recognizer_result
+                RecognizerResult(
+                    entity_type=r["entity_type"],
+                    start=r["start"],
+                    end=r["end"],
+                    score=r["score"],
+                )
+                for recognizer_result in result_dict["recognizer_results"]
+                if recognizer_result
                 for r in recognizer_result
             ]
             analyzer_results.append(recognizer_results)
 
-        anonymized_texts = anonymizer.anonymize_list(texts=texts, recognizer_results_list=analyzer_results, **kwargs)
+        anonymized_texts = anonymizer.anonymize_list(
+            texts=texts, recognizer_results_list=analyzer_results, **kwargs
+        )
         return anonymized_texts
 
     else:
-        raise ValueError("Invalid input type. Expected Dict[str, str] or List[str].")
+        raise ValueError("Invalid input type. Expected Dict[str, str] or List[str].")

src/datafog/pii_tools/PresidioEngine/analyzer.py

+1 -1
@@ -123,4 +123,4 @@ def __check_label(
     ) -> bool:
         return any(
             [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
-        )
+        )

src/datafog/pii_tools/PresidioEngine/process_csv_file.py

+9 -6
@@ -1,6 +1,6 @@
 import csv
 import pprint
-from typing import List, Iterable, Optional
+from typing import Iterable, List, Optional
 
 from presidio_analyzer import BatchAnalyzerEngine, DictAnalyzerResult
 from presidio_anonymizer import BatchAnonymizerEngine
@@ -32,20 +32,23 @@ def analyze_csv(
         **kwargs,
     ) -> Iterable[DictAnalyzerResult]:
 
-        with open(csv_full_path, 'r') as csv_file:
+        with open(csv_full_path, "r") as csv_file:
             csv_list = list(csv.reader(csv_file))
-            csv_dict = {header: list(map(str, values)) for header, *values in zip(*csv_list)}
+            csv_dict = {
+                header: list(map(str, values)) for header, *values in zip(*csv_list)
+            }
         analyzer_results = self.analyze_dict(csv_dict, language, keys_to_skip)
         return list(analyzer_results)
 
 
 if __name__ == "__main__":
 
     analyzer = CSVAnalyzer()
-    analyzer_results = analyzer.analyze_csv('./csv_sample_data/sample_data.csv',
-                                            language="en")
+    analyzer_results = analyzer.analyze_csv(
+        "./csv_sample_data/sample_data.csv", language="en"
+    )
     pprint.pprint(analyzer_results)
 
     anonymizer = BatchAnonymizerEngine()
     anonymized_results = anonymizer.anonymize_dict(analyzer_results)
-    pprint.pprint(anonymized_results)
+    pprint.pprint(anonymized_results)
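
The reformatted dict comprehension in analyze_csv is doing a transpose: zip(*csv_list) turns the row-oriented output of csv.reader into columns, and each column's header becomes a key whose value is the list of cell strings. A small, self-contained illustration with made-up data (the real script reads ./csv_sample_data/sample_data.csv):

import csv
import io

# Hypothetical CSV contents standing in for the sample file.
sample = "name,city\nJohn Smith,Springfield\nJane Doe,Shelbyville\n"

# Same two steps as analyze_csv: read the rows, then transpose into columns.
csv_list = list(csv.reader(io.StringIO(sample)))
csv_dict = {
    header: list(map(str, values)) for header, *values in zip(*csv_list)
}

print(csv_dict)
# {'name': ['John Smith', 'Jane Doe'], 'city': ['Springfield', 'Shelbyville']}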

tests/common/assertions.py

+1 -1
@@ -41,4 +41,4 @@ def _ordered(obj):
     if isinstance(obj, list):
         return sorted(_ordered(x) for x in obj)
     else:
-        return obj
+        return obj

tests/test_redact.py

+23 -4
@@ -1,41 +1,55 @@
+import json
 import logging
+from datetime import datetime
+
 import pytest
+
 from datafog import DataFog, __version__
-from datetime import datetime
-import json
 
 # Generate a timestamp
 current_time = datetime.now().strftime("%Y%m%d%H%M%S")
 
 # Configure logging to write to a file with the current datetime in its name
 log_filename = f"datafog_debug{current_time}.txt"
-logging.basicConfig(filename=log_filename, level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
+logging.basicConfig(
+    filename=log_filename,
+    level=logging.DEBUG,
+    format="%(asctime)s - %(levelname)s - %(message)s",
+)
+
 
 # Fixture for initializing DataFog
 @pytest.fixture
 def datafog():
     return DataFog()
+
+
 # Test data fixtures
 @pytest.fixture
 def text_file_url():
     return "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
 
+
 @pytest.fixture
 def csv_file_url():
     return "https://gist.githubusercontent.com/sidmohan0/4e9a86bb779bcb066bb340b9b60e078a/raw/586fd18490918445918770034e39980f20ce9180/fake_csv.csv"
 
+
 @pytest.fixture
 def text_string():
     return "John Smith drivers license is AC432223"
 
+
 @pytest.fixture
 def json_string():
     return '{"text": "John Smith drivers license is AC432223", "language": "en"}'
 
+
 # Test cases
 def test_version(datafog):
    assert datafog.version == __version__
 
+
 def test_datafog_init(datafog):
     logging.info("Testing DataFog initialization")
     assert datafog.language == "en"
@@ -46,32 +60,37 @@ def test_datafog_init(datafog):
     assert datafog.batch_anonymizer is not None
     assert datafog.entities is not None
 
+
 def test_batch_redact_string_input(datafog, text_string):
     redacted_results = datafog(text_string, "redact")
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "Junk Fee Prevention Act" not in redacted_results[0]
 
+
 def test_batch_redact_text_file(datafog, text_file_url):
     scan_results = datafog(text_file_url, "scan")
     redacted_results = datafog(text_file_url, "redact")
     assert len(redacted_results) == 1
     assert "Chuck Schumer" not in redacted_results[0]
     assert "AC432223" not in redacted_results[0]
 
+
 def test_batch_redact_csv_file(datafog, csv_file_url):
     redacted_results = datafog(csv_file_url, "redact")
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "Widget C" not in redacted_results[0]
 
+
 def test_batch_redact_json_file(datafog, json_string):
     redacted_results = datafog(json_string, "redact")
-
+
     assert len(redacted_results) == 1
     assert "John Smith" not in redacted_results[0]
     assert "AC432223" not in redacted_results[0]
 
+
 def test_datafog_call_unsupported_input(datafog):
     with pytest.raises(ValueError):
         datafog(123, "redact")
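
The commit message says all tests passed; a minimal sketch of reproducing that locally, assuming pytest and the project's dependencies are installed (the exact invocation isn't recorded in this commit, and the text/CSV tests download fixture gists, so network access is needed):

import pytest

# Run only the redaction tests with verbose output; equivalent to invoking
# `pytest tests/test_redact.py -v` from the repository root.
raise SystemExit(pytest.main(["tests/test_redact.py", "-v"]))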
