Skip to content

Commit 929aa3c

Browse files
Sid MohanSid Mohan
Sid Mohan
authored and
Sid Mohan
committed
all tests passed with anonymizer funcs added to datafog class
1 parent 39eecf0 commit 929aa3c

12 files changed

+255
-88
lines changed

.gitignore

+1-2
Original file line numberDiff line numberDiff line change
@@ -36,5 +36,4 @@ error_log.txt
3636
docs/*
3737
!docs/*.rst
3838
!docs/conf.py
39-
test_anonymizer.py
40-
test_anonymizer.pyc
39+
scratch.py

README.md

+6-6
Original file line numberDiff line numberDiff line change
@@ -70,19 +70,19 @@ datafog scan-text "Tim Cook is the CEO of Apple and is based out of Cupertino, C
7070
To extract text from images and optionally perform PII annotation:
7171

7272
```bash
73-
datafog scan-image "path/to/image.png" --operations extract_text
73+
datafog scan-image "path/to/image.png" --operations extract
7474
```
7575

7676
**Example:**
7777

7878
```bash
79-
datafog scan-image "nokia-statement.png" --operations extract_text
79+
datafog scan-image "nokia-statement.png" --operations extract
8080
```
8181

8282
To extract text and annotate PII:
8383

8484
```bash
85-
datafog scan-image "nokia-statement.png" --operations annotate_pii
85+
datafog scan-image "nokia-statement.png" --operations scan
8686
```
8787

8888
### Utility Commands
@@ -127,7 +127,7 @@ datafog list-entities
127127

128128
## ⚠️ Important Notes
129129

130-
- For `scan-image` and `scan-text` commands, use `--operations` to specify different operations. Default is `annotate_pii`.
130+
- For `scan-image` and `scan-text` commands, use `--operations` to specify different operations. Default is `scan`.
131131
- Process multiple images or text strings in a single command by providing multiple arguments.
132132
- Ensure proper permissions and configuration of the DataFog service before running commands.
133133

@@ -145,10 +145,10 @@ To use DataFog, you'll need to create a DataFog client with the desired operatio
145145
from datafog import DataFog
146146

147147
# For text annotation
148-
client = DataFog(operations="annotate_pii")
148+
client = DataFog(operations="scan")
149149

150150
# For OCR (Optical Character Recognition)
151-
ocr_client = DataFog(operations="extract_text")
151+
ocr_client = DataFog(operations="extract")
152152
```
153153

154154
### Text PII Annotation

datafog/client.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def scan_image(
2727
image_urls: List[str] = typer.Argument(
2828
None, help="List of image URLs or file paths to extract text from"
2929
),
30-
operations: str = typer.Option("annotate_pii", help="Operation to perform"),
30+
operations: str = typer.Option("scan", help="Operation to perform"),
3131
):
3232
"""
3333
Scan images for text and PII.
@@ -37,7 +37,7 @@ def scan_image(
3737
3838
Args:
3939
image_urls: List of image URLs or file paths
40-
operations: Pipeline operations to run (default: annotate_pii)
40+
operations: Pipeline operations to run (default: scan)
4141
4242
Prints results or exits with error on failure.
4343
"""
@@ -61,7 +61,7 @@ def scan_text(
6161
str_list: List[str] = typer.Argument(
6262
None, help="List of texts to extract text from"
6363
),
64-
operations: str = typer.Option("annotate_pii", help="Operation to perform"),
64+
operations: str = typer.Option("scan", help="Operation to perform"),
6565
):
6666
"""
6767
Scan texts for PII.
@@ -70,7 +70,7 @@ def scan_text(
7070
7171
Args:
7272
str_list: List of texts to analyze
73-
operations: Pipeline operations to run (default: annotate_pii)
73+
operations: Pipeline operations to run (default: scan)
7474
7575
Prints results or exits with error on failure.
7676
"""

datafog/config.py

+10-8
Original file line numberDiff line numberDiff line change
@@ -80,13 +80,15 @@ class OperationType(str, Enum):
8080
"""
8181
Enum for supported DataFog operations.
8282
83-
ANNOTATE_PII: Detect and annotate PII in text
84-
EXTRACT_TEXT: Extract text from images
85-
REDACT_PII: Remove PII from text
86-
ANONYMIZE_PII: Replace PII with fake data
83+
SCAN: Detect and annotate PII in text
84+
EXTRACT: Extract text from images
85+
REDACT: Remove PII from text
86+
REPLACE: Replace PII with fake data
87+
HASH: Replace PII with a hash
8788
"""
8889

89-
ANNOTATE_PII = "annotate_pii"
90-
EXTRACT_TEXT = "extract_text"
91-
REDACT_PII = "redact_pii"
92-
ANONYMIZE_PII = "anonymize_pii"
90+
SCAN = "scan"
91+
EXTRACT = "extract"
92+
REDACT = "redact"
93+
REPLACE = "replace"
94+
HASH = "hash"

datafog/main.py

+97-58
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,15 @@
66
- TextPIIAnnotator: Class for annotating PII in text.
77
88
These classes provide high-level interfaces for image and text processing,
9-
including OCR, PII detection, and annotation.
9+
including OCR, PII detection, annotation, and anonymization.
1010
"""
1111

1212
import json
1313
import logging
1414
from typing import List
1515

1616
from .config import OperationType
17+
from .models.anonymizer import Anonymizer, AnonymizerType, HashType
1718
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
1819
from .services.image_service import ImageService
1920
from .services.spark_service import SparkService
@@ -27,26 +28,28 @@ class DataFog:
2728
"""
2829
Main class for running OCR and text processing pipelines.
2930
30-
Handles image and text processing operations, including OCR and PII detection.
31+
Handles image and text processing operations, including OCR, PII detection, and anonymization.
3132
3233
Attributes:
3334
image_service: Service for image processing and OCR.
3435
text_service: Service for text processing and annotation.
3536
spark_service: Optional Spark service for distributed processing.
3637
operations: List of operations to perform.
38+
anonymizer: Anonymizer for PII redaction, replacement, or hashing.
3739
"""
3840

3941
def __init__(
4042
self,
4143
image_service=ImageService(),
4244
text_service=TextService(),
4345
spark_service=None,
44-
operations: List[OperationType] = [OperationType.ANNOTATE_PII],
46+
operations: List[OperationType] = [OperationType.SCAN],
4547
):
4648
self.image_service = image_service
4749
self.text_service = text_service
4850
self.spark_service: SparkService = spark_service
4951
self.operations: List[OperationType] = operations
52+
self.anonymizer = Anonymizer()
5053
self.logger = logging.getLogger(__name__)
5154
self.logger.info(
5255
"Initializing DataFog class with the following services and operations:"
@@ -64,38 +67,22 @@ async def run_ocr_pipeline(self, image_urls: List[str]):
6467
6568
This method performs optical character recognition (OCR) on the images specified by the URLs.
6669
If PII annotation is enabled, it also annotates the extracted text for personally identifiable information.
70+
If redaction, replacement, or hashing is enabled, it applies the corresponding anonymization.
6771
6872
Args:
6973
image_urls (List[str]): A list of URLs pointing to the images to be processed.
7074
7175
Returns:
72-
List: If PII annotation is enabled, returns a list of annotated text results.
73-
Otherwise, returns a list of extracted text from the images.
76+
List: Processed text results based on the enabled operations.
7477
7578
Raises:
76-
Exception: Any error encountered during the OCR or annotation process.
77-
78-
Note:
79-
The method logs various stages of the process, including completion of OCR extraction
80-
and text annotation, as well as any errors encountered.
79+
Exception: Any error encountered during the OCR or text processing.
8180
"""
8281
try:
8382
extracted_text = await self.image_service.ocr_extract(image_urls)
8483
self.logger.info(f"OCR extraction completed for {len(image_urls)} images.")
85-
self.logger.debug(
86-
f"Total length of extracted text: {sum(len(text) for text in extracted_text)}"
87-
)
8884

89-
if OperationType.ANNOTATE_PII in self.operations:
90-
annotated_text = await self.text_service.batch_annotate_text_async(
91-
extracted_text
92-
)
93-
self.logger.info(
94-
f"Text annotation completed with {len(annotated_text)} annotations."
95-
)
96-
return annotated_text
97-
else:
98-
return extracted_text
85+
return await self._process_text(extracted_text)
9986
except Exception as e:
10087
logging.error(f"Error in run_ocr_pipeline: {str(e)}")
10188
return [f"Error: {str(e)}"]
@@ -105,75 +92,126 @@ async def run_text_pipeline(self, str_list: List[str]):
10592
Run the text pipeline asynchronously on a list of input text.
10693
10794
This method processes a list of text strings, potentially annotating them for personally
108-
identifiable information (PII) if the ANNOTATE_PII operation is enabled.
95+
identifiable information (PII) and applying anonymization if enabled.
10996
11097
Args:
11198
str_list (List[str]): A list of text strings to be processed.
11299
113100
Returns:
114-
List: If PII annotation is enabled, returns a list of annotated text results.
115-
Otherwise, returns the original list of text strings.
101+
List: Processed text results based on the enabled operations.
116102
117103
Raises:
118-
Exception: Any error encountered during the text processing or annotation.
119-
120-
Note:
121-
The method logs the start of the pipeline, the completion of text annotation if applicable,
122-
and any errors encountered during processing.
104+
Exception: Any error encountered during the text processing.
123105
"""
124106
try:
125107
self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
126-
if OperationType.ANNOTATE_PII in self.operations:
127-
annotated_text = await self.text_service.batch_annotate_text_async(
128-
str_list
129-
)
130-
self.logger.info(
131-
f"Text annotation completed with {len(annotated_text)} annotations."
132-
)
133-
return annotated_text
134-
135-
self.logger.info("No annotation operation found; returning original texts.")
136-
return str_list
108+
return await self._process_text(str_list)
137109
except Exception as e:
138110
self.logger.error(f"Error in run_text_pipeline: {str(e)}")
139111
raise
140112

113+
async def _process_text(self, text_list: List[str]):
114+
"""
115+
Internal method to process text based on enabled operations.
116+
"""
117+
if OperationType.SCAN in self.operations:
118+
annotated_text = await self.text_service.batch_annotate_text_async(
119+
text_list
120+
)
121+
self.logger.info(
122+
f"Text annotation completed with {len(annotated_text)} annotations."
123+
)
124+
125+
if OperationType.REDACT in self.operations:
126+
return [
127+
self.anonymizer.anonymize(
128+
text, annotations, AnonymizerType.REDACT
129+
).anonymized_text
130+
for text, annotations in zip(text_list, annotated_text, strict=True)
131+
]
132+
elif OperationType.REPLACE in self.operations:
133+
return [
134+
self.anonymizer.anonymize(
135+
text, annotations, AnonymizerType.REPLACE
136+
).anonymized_text
137+
for text, annotations in zip(text_list, annotated_text, strict=True)
138+
]
139+
elif OperationType.HASH in self.operations:
140+
return [
141+
self.anonymizer.anonymize(
142+
text, annotations, AnonymizerType.HASH
143+
).anonymized_text
144+
for text, annotations in zip(text_list, annotated_text, strict=True)
145+
]
146+
else:
147+
return annotated_text
148+
149+
self.logger.info(
150+
"No annotation or anonymization operation found; returning original texts."
151+
)
152+
return text_list
153+
141154
def run_text_pipeline_sync(self, str_list: List[str]):
142155
"""
143156
Run the text pipeline synchronously on a list of input text.
144157
145158
This method processes a list of text strings in a synchronous manner, potentially
146-
annotating them for personally identifiable information (PII) if the ANNOTATE_PII
147-
operation is enabled.
159+
annotating them for personally identifiable information (PII) and applying
160+
anonymization if enabled.
148161
149162
Args:
150163
str_list (List[str]): A list of text strings to be processed.
151164
152165
Returns:
153-
List: If PII annotation is enabled, returns a list of annotated text results.
154-
Otherwise, returns the original list of text strings.
166+
List: Processed text results based on the enabled operations.
155167
156168
Raises:
157-
Exception: Any error encountered during the text processing or annotation.
158-
159-
Note:
160-
The method logs the start of the pipeline, the completion of text annotation if applicable,
161-
and any errors encountered during processing. This synchronous version may be preferred
162-
for smaller datasets or when immediate results are required.
169+
Exception: Any error encountered during the text processing.
163170
"""
164171
try:
165172
self.logger.info(f"Starting text pipeline with {len(str_list)} texts.")
166-
if OperationType.ANNOTATE_PII in self.operations:
173+
if OperationType.SCAN in self.operations:
167174
annotated_text = self.text_service.batch_annotate_text_sync(str_list)
168175
self.logger.info(
169176
f"Text annotation completed with {len(annotated_text)} annotations."
170177
)
171-
return annotated_text
172178

173-
self.logger.info("No annotation operation found; returning original texts.")
179+
if OperationType.REDACT in self.operations:
180+
return [
181+
self.anonymizer.anonymize(
182+
text, annotations, AnonymizerType.REDACT
183+
).anonymized_text
184+
for text, annotations in zip(
185+
str_list, annotated_text, strict=True
186+
)
187+
]
188+
elif OperationType.REPLACE in self.operations:
189+
return [
190+
self.anonymizer.anonymize(
191+
text, annotations, AnonymizerType.REPLACE
192+
).anonymized_text
193+
for text, annotations in zip(
194+
str_list, annotated_text, strict=True
195+
)
196+
]
197+
elif OperationType.HASH in self.operations:
198+
return [
199+
self.anonymizer.anonymize(
200+
text, annotations, AnonymizerType.HASH
201+
).anonymized_text
202+
for text, annotations in zip(
203+
str_list, annotated_text, strict=True
204+
)
205+
]
206+
else:
207+
return annotated_text
208+
209+
self.logger.info(
210+
"No annotation or anonymization operation found; returning original texts."
211+
)
174212
return str_list
175213
except Exception as e:
176-
self.logger.error(f"Error in run_text_pipeline: {str(e)}")
214+
self.logger.error(f"Error in run_text_pipeline_sync: {str(e)}")
177215
raise
178216

179217
def _add_attributes(self, attributes: dict):
@@ -194,7 +232,7 @@ def _add_attributes(self, attributes: dict):
194232
using this method to avoid overwriting existing attributes.
195233
"""
196234
for key, value in attributes.items():
197-
pass
235+
setattr(self, key, value)
198236

199237

200238
class TextPIIAnnotator:
@@ -225,4 +263,5 @@ def run(self, text, output_path=None):
225263

226264
finally:
227265
# Ensure Spark resources are released
228-
pass
266+
if self.spark_processor:
267+
self.spark_processor.stop()

datafog/models/anonymizer.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ class AnonymizationResult(BaseModel):
3939

4040

4141
class Anonymizer(BaseModel):
42-
anonymizer_type: AnonymizerType
42+
anonymizer_type: AnonymizerType = AnonymizerType.REPLACE
4343
entities: Optional[List[EntityTypes]] = None
4444
hash_type: Optional[HashType] = HashType.SHA256
4545

0 commit comments

Comments
 (0)