Skip to content

Commit a122c66

Browse files
Sid MohanSid Mohan
authored andcommitted
v3.2.0 release
1 parent bbbeba6 commit a122c66

21 files changed

+567
-42
lines changed

README.md

Lines changed: 76 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,10 @@ DataFog is an open-source DevSecOps platform that lets you scan and redact Perso
2727

2828
![image](https://github.com/DataFog/datafog-python/assets/61345237/57fba4e5-21cc-458f-ac6a-6fbbb70a8de1)
2929

30-
3130
### How it works
3231

3332
![image](https://github.com/DataFog/datafog-python/assets/61345237/91f4634a-8a9f-4621-81bc-09930feda78a)
3433

35-
3634
## Installation
3735

3836
DataFog can be installed via pip:
@@ -41,7 +39,82 @@ DataFog can be installed via pip:
4139
pip install datafog
4240
```
4341

44-
## Examples - Updated for v3.1
42+
## Examples
43+
44+
### v3.2.0 NEW
45+
46+
The "Getting Started" section below walks through the text and image processing features available in v3.2.0.
47+
48+
## Getting Started
49+
50+
The DataFog library provides functionality for text and image processing, including PII (Personally Identifiable Information) annotation and OCR (Optical Character Recognition) capabilities.
51+
52+
### Installation
53+
54+
To install the DataFog library, use the following command:
55+
56+
```
57+
pip install datafog
58+
```
59+
60+
### Usage
61+
62+
Here are some examples of how to use the DataFog library:
63+
64+
#### Text PII Annotation
65+
66+
To annotate PII in a given text, let's start with a set of clinical notes:
67+
68+
```
69+
!git clone https://gist.github.com/b43b72693226422bac5f083c941ecfdb.git
70+
```
71+
72+
```python
73+
from datafog import TextPIIAnnotator
74+
75+
text = "John Doe lives at 1234 Elm St, Springfield."
76+
text_annotator = TextPIIAnnotator()
77+
annotated_text = text_annotator.run(text)
78+
print(annotated_text)
79+
```
80+
81+
This will output the annotated text with PII labeled, such as `{"LOC": ["Springfield"]}`.
82+
83+
#### Image Text Extraction and Annotation
84+
85+
To extract text from an image and perform PII annotation, you can use the `DataFog` class:
86+
87+
```python
88+
from datafog import DataFog
89+
90+
image_url = "https://example.com/image.png"
91+
datafog = DataFog()
92+
annotated_text = await datafog.run_ocr_pipeline([image_url])
93+
print(annotated_text)
94+
```
95+
96+
This will download the image, extract the text using OCR, and annotate any PII found in the extracted text.
97+
98+
#### Text Processing
99+
100+
To process and annotate text using the DataFog pipeline, you can use the `DataFog` class:
101+
102+
```python
103+
from datafog import DataFog
104+
105+
text = ["Tokyo is the capital of Japan"]
106+
datafog = DataFog()
107+
annotated_text = await datafog.run_text_pipeline(text)
108+
print(annotated_text)
109+
```
110+
111+
This will process the given text and annotate entities such as person names and locations.
112+
113+
For more detailed usage and examples, please refer to the API documentation.
114+
115+
Note: `run_ocr_pipeline` and `run_text_pipeline` are asynchronous. Call them with `await` from inside an `async` function (or a notebook cell that supports top-level `await`); from synchronous code, wrap the call with `asyncio.run(...)`.
116+
117+
### v3.1.0
45118

46119
### Base case: PII annotation of text-files
47120

datafog/__about__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "3.2.0b5"
1+
__version__ = "3.2.0"

datafog/__init__.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,14 @@
1-
from .main import (DataFog, OCRPIIAnnotator, TextPIIAnnotator
2-
)
1+
from .config import OperationType
2+
from .main import DataFog, OCRPIIAnnotator, TextPIIAnnotator
33
from .processing.image_processing.donut_processor import DonutProcessor
44
from .processing.image_processing.image_downloader import ImageDownloader
55
from .processing.image_processing.pytesseract_processor import PytesseractProcessor
66
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
7-
8-
97
from .services.image_service import ImageService
108
from .services.spark_service import SparkService
119
from .services.text_service import TextService
12-
from .config import OperationType
10+
11+
# from .__about__ import __version__
1312

1413
__all__ = [
1514
"DonutProcessor",
@@ -23,4 +22,5 @@
2322
"SpacyPIIAnnotator",
2423
"ImageDownloader",
2524
"PytesseractProcessor",
26-
]
25+
# "__version__",
26+
]

datafog/main.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,28 @@
11
import asyncio
2-
import json
3-
from typing import List
42
import importlib
5-
import aiohttp
3+
import json
64
import subprocess
75
import sys
6+
from typing import List
7+
8+
import aiohttp
9+
810
from .config import OperationType
11+
from .processing.image_processing.donut_processor import DonutProcessor
12+
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
913
from .services.image_service import ImageService
1014
from .services.spark_service import SparkService
1115
from .services.text_service import TextService
1216

13-
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
14-
from .processing.image_processing.donut_processor import DonutProcessor
15-
1617

1718
class DataFog:
18-
def __init__(self, image_service = ImageService(), text_service = TextService(), spark_service = None, operations: List[OperationType] = [OperationType.ANNOTATE_PII]):
19+
def __init__(
20+
self,
21+
image_service=ImageService(),
22+
text_service=TextService(),
23+
spark_service=None,
24+
operations: List[OperationType] = [OperationType.ANNOTATE_PII],
25+
):
1926
self.image_service = image_service
2027
self.text_service = text_service
2128
self.spark_service: SparkService = spark_service
@@ -45,7 +52,6 @@ def __init__(self):
4552
self.text_annotator = SpacyPIIAnnotator.create()
4653
self.spark_service: SparkService = None
4754

48-
4955
async def run(self, image_urls: List[str], output_path=None):
5056
try:
5157
# Download and process the image to extract text

datafog/processing/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from .image_processing.donut_processor import DonutProcessor
22
from .image_processing.image_downloader import ImageDownloader
33
from .image_processing.pytesseract_processor import PytesseractProcessor
4-
from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator
54
from .spark_processing.pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
5+
from .text_processing.spacy_pii_annotator import SpacyPIIAnnotator

datafog/processing/image_processing/donut_processor.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
import importlib
12
import json
23
import re
3-
from io import BytesIO
4-
import importlib
54
import subprocess
65
import sys
6+
from io import BytesIO
7+
78
import requests
89
from PIL import Image
910

@@ -13,12 +14,12 @@
1314
class DonutProcessor:
1415
def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):
1516

17+
self.ensure_installed("torch")
18+
self.ensure_installed("transformers")
1619

17-
self.ensure_installed('torch')
18-
self.ensure_installed('transformers')
19-
20-
from transformers import VisionEncoderDecoderModel, DonutProcessor as TransformersDonutProcessor
2120
import torch
21+
from transformers import DonutProcessor as TransformersDonutProcessor
22+
from transformers import VisionEncoderDecoderModel
2223

2324
self.processor = TransformersDonutProcessor.from_pretrained(model_path)
2425
self.model = VisionEncoderDecoderModel.from_pretrained(model_path)
@@ -31,7 +32,9 @@ def ensure_installed(self, package_name):
3132
try:
3233
importlib.import_module(package_name)
3334
except ImportError:
34-
subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
35+
subprocess.check_call(
36+
[sys.executable, "-m", "pip", "install", package_name]
37+
)
3538

3639
async def parse_image(self, image: Image) -> str:
3740
"""Process the image using DonutProcessor and VisionEncoderDecoderModel and extract text."""
@@ -72,4 +75,4 @@ def download_image(self, url: str) -> Image:
7275
"""Download an image from URL."""
7376
response = requests.get(url)
7477
image = Image.open(BytesIO(response.content))
75-
return image
78+
return image

datafog/processing/image_processing/pytesseract_processor.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,3 @@ def __init__(self):
1010
async def extract_text_from_image(image: Image) -> str:
1111
"""Extract text from an image using pytesseract."""
1212
return pytesseract.image_to_string(image)
13-
Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1 @@
11
from .pyspark_udfs import broadcast_pii_annotator_udf, pii_annotator
2-

datafog/services/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
11
from .image_service import ImageService
22
from .spark_service import SparkService
33
from .text_service import TextService
4-

datafog/services/image_service.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,28 +29,25 @@ async def ocr_extract(
2929
self,
3030
image_urls: List[str],
3131
image_files: List[Image.Image] = None,
32-
3332
) -> List[str]:
3433
if image_files is None:
3534
image_files = await self.download_images(image_urls)
3635

3736
if self.use_donut and self.use_tesseract:
3837
raise ValueError("Both OCR processors cannot be selected simultaneously")
39-
38+
4039
if not self.use_donut and not self.use_tesseract:
4140
raise ValueError("No OCR processor selected")
42-
41+
4342
if self.use_donut:
4443
return await asyncio.gather(
4544
*[self.donut_processor.parse_image(image) for image in image_files]
4645
)
47-
46+
4847
if self.use_tesseract:
4948
return await asyncio.gather(
5049
*[
5150
self.tesseract_processor.extract_text_from_image(image)
5251
for image in image_files
5352
]
5453
)
55-
56-

0 commit comments

Comments
 (0)