Skip to content

Commit 39eecf0

Browse files
Sid MohanSid Mohan
authored and committed
redact, hash, replace
1 parent 8d3b52e commit 39eecf0

File tree

7 files changed

+191
-6
lines changed

7 files changed

+191
-6
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,5 @@ error_log.txt
3636
docs/*
3737
!docs/*.rst
3838
!docs/conf.py
39+
test_anonymizer.py
40+
test_anonymizer.pyc

CONTRIBUTING.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,7 @@ project [license](LICENSE) and affirming that you either own copyright
2323
(automatic for most individuals) or are authorized to distribute under
2424
the project license (e.g., in case your employer retains copyright on
2525
your work).
26+
27+
### Legal Notice
28+
29+
When contributing to this project, you must agree that you have authored 100% of the content, that you have the necessary rights to the content and that the content you contribute may be provided under the project license.

datafog/__init__.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@
88
AnnotationResultWithAnaysisExplanation,
99
AnnotatorRequest,
1010
)
11+
from .models.anonymizer import (
12+
AnonymizationResult,
13+
Anonymizer,
14+
AnonymizerRequest,
15+
AnonymizerType,
16+
)
1117
from .models.common import AnnotatorMetadata, EntityTypes, Pattern, PatternRecognizer
1218
from .models.spacy_nlp import SpacyAnnotator
1319
from .processing.image_processing.donut_processor import DonutProcessor
@@ -41,4 +47,8 @@
4147
"PatternRecognizer",
4248
"get_config",
4349
"SpacyAnnotator",
50+
"AnonymizerType",
51+
"AnonymizerRequest",
52+
"AnonymizationResult",
53+
"Anonymizer",
4454
]

datafog/models/anonymizer.py

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
"""
2+
Data models for anonymization requests and results.
3+
"""
4+
5+
import hashlib
6+
import secrets
7+
from enum import Enum
8+
from typing import List, Optional
9+
10+
from pydantic import BaseModel, Field
11+
12+
from .annotator import AnnotationResult
13+
from .common import EntityTypes
14+
15+
16+
class AnonymizerType(str, Enum):
    """Anonymization strategies; each member's value is its lowercase name."""

    # Members also behave as plain strings because of the ``str`` mixin.
    REDACT = "redact"
    REPLACE = "replace"
    HASH = "hash"
20+
21+
22+
class HashType(str, Enum):
    """Names of the digest algorithms that can be selected for hashing."""

    # Each value matches the name of the corresponding ``hashlib`` constructor.
    MD5 = "md5"
    SHA256 = "sha256"
    SHA3_256 = "sha3_256"
26+
27+
28+
class AnonymizerRequest(BaseModel):
    """Request model bundling a text, its annotations and anonymization options."""

    # Required fields.
    text: str
    annotator_results: List[AnnotationResult]
    anonymizer_type: AnonymizerType

    # Optional fields with their defaults.
    entities: Optional[List[EntityTypes]] = None
    hash_type: Optional[HashType] = HashType.SHA256
34+
35+
36+
class AnonymizationResult(BaseModel):
    """Result model: the anonymized text plus a list of replacement records."""

    anonymized_text: str
    # A fresh empty list is created per instance when no value is supplied.
    replaced_entities: List[dict] = Field(default_factory=list)
39+
40+
41+
class Anonymizer(BaseModel):
    """Anonymize annotated PII spans in a text by redacting, replacing or hashing.

    Fields:
        anonymizer_type: Strategy that :meth:`anonymize` dispatches on.
        entities: When set and non-empty, only annotations whose
            ``entity_type`` is in this list are anonymized; otherwise every
            annotation is.
        hash_type: Digest algorithm used by :meth:`hash_pii`.

    All strategies rewrite the text from the last span to the first so that
    the ``start``/``end`` offsets of the remaining annotations stay valid.
    Annotations are assumed not to overlap.
    """

    anonymizer_type: AnonymizerType
    entities: Optional[List[EntityTypes]] = None
    hash_type: Optional[HashType] = HashType.SHA256

    def anonymize(
        self, text: str, annotations: List[AnnotationResult]
    ) -> AnonymizationResult:
        """Anonymize PII in ``text`` using the configured ``anonymizer_type``.

        Args:
            text: The text the annotation offsets refer to.
            annotations: Spans to anonymize.

        Returns:
            The anonymized text together with one record per replaced span.

        Raises:
            ValueError: If ``anonymizer_type`` is not a supported strategy.
        """
        if self.anonymizer_type == AnonymizerType.REDACT:
            return self.redact_pii(text, annotations)
        if self.anonymizer_type == AnonymizerType.REPLACE:
            return self.replace_pii(text, annotations)
        if self.anonymizer_type == AnonymizerType.HASH:
            return self.hash_pii(text, annotations)
        raise ValueError(f"Unsupported anonymizer type: {self.anonymizer_type}")

    @staticmethod
    def _entity_label(entity_type) -> str:
        """Return the text used for ``entity_type`` inside a placeholder."""
        # Formatting a ``(str, Enum)`` member in an f-string yields its value
        # before Python 3.12 but ``Class.MEMBER`` from 3.12 on; reading
        # ``.value`` keeps placeholders identical across versions. Plain
        # strings have no ``.value`` and are used as they are.
        return str(getattr(entity_type, "value", entity_type))

    def _selected(self, annotations: List[AnnotationResult]) -> list:
        """Return the annotations to anonymize, ordered last-in-text first."""
        wanted = [
            annotation
            for annotation in annotations
            if not self.entities or annotation.entity_type in self.entities
        ]
        return sorted(wanted, key=lambda a: a.start, reverse=True)

    def _substitute(
        self, text: str, annotations: List[AnnotationResult], make_replacement
    ) -> AnonymizationResult:
        """Replace every selected span in ``text`` and record each change.

        ``make_replacement`` is called as
        ``make_replacement(original, annotation, index)`` where ``original``
        is the span's text and ``index`` is the number of spans already
        replaced; it must return the replacement string.
        """
        records = []
        # Right-to-left: an edit never moves the offsets of spans before it.
        for annotation in self._selected(annotations):
            start, end = annotation.start, annotation.end
            original = text[start:end]
            replacement = make_replacement(original, annotation, len(records))
            text = text[:start] + replacement + text[end:]
            records.append(
                {
                    "original": original,
                    "replacement": replacement,
                    "entity_type": annotation.entity_type,
                }
            )
        return AnonymizationResult(anonymized_text=text, replaced_entities=records)

    def replace_pii(
        self, text: str, annotations: List[AnnotationResult]
    ) -> AnonymizationResult:
        """Replace PII in text with ``[<entity label>_<n>]`` placeholders.

        ``n`` is the number of spans already replaced; because spans are
        processed from the end of the text backwards, the last span in the
        text receives index 0. Spans are located by their annotation
        offsets, so repeated values and placeholders of a different length
        than the original are handled correctly.
        """

        def placeholder(_original, annotation, index):
            return f"[{self._entity_label(annotation.entity_type)}_{index}]"

        return self._substitute(text, annotations, placeholder)

    def _generate_replacement(self, original: str, entity_type: EntityTypes) -> str:
        """Generate a replacement token for the given entity.

        Dates map to a fixed token; every other type gets the entity label
        followed by eight random uppercase hex digits. ``original`` is not
        used.
        """
        if entity_type == EntityTypes.PERSON:
            return f"[PERSON_{secrets.token_hex(4).upper()}]"
        elif entity_type == EntityTypes.ORGANIZATION:
            return f"[ORGANIZATION_{secrets.token_hex(4).upper()}]"
        elif entity_type == EntityTypes.LOCATION:
            return f"[LOCATION_{secrets.token_hex(4).upper()}]"
        elif entity_type == EntityTypes.DATE:
            return "[REDACTED_DATE]"
        else:
            label = self._entity_label(entity_type)
            return f"[{label}_{secrets.token_hex(4).upper()}]"

    def hash_pii(
        self, text: str, annotations: List[AnnotationResult]
    ) -> AnonymizationResult:
        """Replace PII in text with a hex digest of the original value.

        The digest is truncated to the length of the original span, so the
        replacement is never longer than the text it replaces.
        """

        def digest(original, _annotation, _index):
            return self._hash_text(original)[: len(original)]

        return self._substitute(text, annotations, digest)

    def _hash_text(self, text: str) -> str:
        """Return the hex digest of ``text`` using the configured ``hash_type``.

        Raises:
            ValueError: If ``hash_type`` is not one of the supported values.
        """
        # NOTE(review): digests are unsalted, and MD5 is offered; confirm this
        # is acceptable for the intended threat model.
        if self.hash_type == HashType.MD5:
            return hashlib.md5(text.encode()).hexdigest()
        elif self.hash_type == HashType.SHA256:
            return hashlib.sha256(text.encode()).hexdigest()
        elif self.hash_type == HashType.SHA3_256:
            return hashlib.sha3_256(text.encode()).hexdigest()
        else:
            raise ValueError(f"Unsupported hash type: {self.hash_type}")

    def redact_pii(
        self, text: str, annotations: List[AnnotationResult]
    ) -> AnonymizationResult:
        """Replace every selected PII span in text with ``[REDACTED]``."""

        def redacted(_original, _annotation, _index):
            return "[REDACTED]"

        return self._substitute(text, annotations, redacted)

datafog/models/common.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
class EntityTypes(str, Enum):
1414
"""PII entity types recognized by DataFog."""
1515

16-
PERSON = "Names similar to John Doe, Joe Biden, Donald Trump, Kamala Harris"
17-
LOCATION = "Full or partial name of a location"
18-
ORGANIZATION = "Full or partial name of an organization"
16+
PERSON = "PERSON"
17+
LOCATION = "LOCATION"
18+
ORGANIZATION = "ORGANIZATION"
1919
EMAIL = "email address (containing @)"
2020
PHONE_NUMBER = (
2121
"phone number (containing numbers and possibly dashes or parentheses)"
2222
)
2323
DATE = "date (in any format)"
2424
NUMBER = "number (in any format)"
2525
CREDIT_CARD = "credit card number (in any format)"
26-
UNKNOWN = "Unknown entity type"
26+
UNKNOWN = "UNKNOWN"
2727

2828

2929
class Pattern(BaseModel):
@@ -48,4 +48,4 @@ class PatternRecognizer(BaseModel):
4848
class AnnotatorMetadata(BaseModel):
4949
"""Metadata for annotation results."""
5050

51-
recognizer_name: str
51+
recognizer_name: Optional[str] = None

requirements.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@ asyncio
1414
setuptools
1515
pydantic-settings==2.3.4
1616
typer==0.12.3
17-
sphinx
17+
sphinx
18+
cryptography

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
"pydantic-settings==2.3.4",
4343
"typer==0.12.3",
4444
"sphinx",
45+
"cryptography",
4546
],
4647
python_requires=">=3.10,<3.13",
4748
entry_points={

0 commit comments

Comments
 (0)