Skip to content

Commit 492ab5c

Browse files
authored
Merge pull request #51 from DataFog/feature/py312-support
python 3.10, 3.11, 3.12 support | model
2 parents 1e5f14b + 924b47b commit 492ab5c

File tree

14 files changed

+333
-298
lines changed

14 files changed

+333
-298
lines changed

.github/workflows/dev-cicd.yml

+20-2
Original file line number | Diff line number | Diff line change
@@ -27,10 +27,24 @@ jobs:
2727
runs-on: ubuntu-latest
2828
strategy:
2929
matrix:
30-
python-version: ["3.10"]
30+
python-version: ["3.10", "3.11", "3.12"]
3131
steps:
3232
- name: Check out repo
3333
uses: actions/checkout@v4
34+
- name: Free Disk Space (Ubuntu)
35+
uses: jlumbroso/free-disk-space@main
36+
with:
37+
# this might remove tools that are actually needed,
38+
# if set to "true" but frees about 6 GB
39+
tool-cache: false
40+
# all of these default to true, but feel free to set to
41+
# "false" if necessary for your workflow
42+
android: true
43+
dotnet: true
44+
haskell: true
45+
large-packages: true
46+
docker-images: true
47+
swap-storage: true
3448
- name: Set up Python
3549
uses: actions/setup-python@v4
3650
with:
@@ -54,11 +68,15 @@ jobs:
5468
pip install -e .
5569
pip install tox just pre-commit
5670
- name: Run Tests with tox
57-
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
71+
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
5872
- name: Submit to Codecov
5973
uses: codecov/codecov-action@v3
6074
with:
6175
token: ${{ secrets.CODECOV_TOKEN }}
6276
files: ./coverage.xml
6377
flags: unittests
6478
name: codecov-umbrella
79+
- name: Clean up pip cache
80+
run: |
81+
pip cache purge
82+
rm -rf ~/.cache/pip

.github/workflows/feature-cicd.yml

+20-3
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,20 @@ jobs:
3131
steps:
3232
- name: Check out repo
3333
uses: actions/checkout@v4
34+
- name: Free Disk Space (Ubuntu)
35+
uses: jlumbroso/free-disk-space@main
36+
with:
37+
# this might remove tools that are actually needed,
38+
# if set to "true" but frees about 6 GB
39+
tool-cache: false
40+
# all of these default to true, but feel free to set to
41+
# "false" if necessary for your workflow
42+
android: true
43+
dotnet: true
44+
haskell: true
45+
large-packages: true
46+
docker-images: true
47+
swap-storage: true
3448
- name: Set up Python
3549
uses: actions/setup-python@v4
3650
with:
@@ -51,10 +65,13 @@ jobs:
5165
- name: Install Dependencies
5266
run: |
5367
pip install -U pip
54-
pip install -e .
55-
pip install tox just pre-commit
68+
pip install --no-cache-dir -e .
69+
pip install --no-cache-dir tox just pre-commit
70+
- name: Free up disk space
71+
run: |
72+
sudo apt-get clean
5673
- name: Run Tests with tox
57-
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
74+
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
5875
- name: Submit to Codecov
5976
uses: codecov/codecov-action@v3
6077
with:

.github/workflows/main-cicd.yml

+1-1
Original file line number | Diff line number | Diff line change
@@ -54,7 +54,7 @@ jobs:
5454
pip install -e .
5555
pip install tox just pre-commit
5656
- name: Run Tests with tox
57-
run: tox -- --cov datafog --cov-report xml --cov-report term --codeblocks
57+
run: tox -- --cov datafog --cov-report xml --cov-report term -v -s --cov-report=term-missing
5858
- name: Submit to Codecov
5959
uses: codecov/codecov-action@v3
6060
with:

README.md

+1-1
Original file line number | Diff line number | Diff line change
@@ -116,7 +116,7 @@ For local development:
116116
```
117117
5. Install the package in editable mode:
118118
```
119-
pip install -e .
119+
pip install -r requirements-dev.txt
120120
```
121121
6. Set up the project:
122122
```

datafog/processing/image_processing/donut_processor.py

+22-4
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
import sys
66
from io import BytesIO
77

8+
import numpy as np
89
import requests
910
from PIL import Image
1011

@@ -13,7 +14,6 @@
1314

1415
class DonutProcessor:
1516
def __init__(self, model_path="naver-clova-ix/donut-base-finetuned-cord-v2"):
16-
1717
self.ensure_installed("torch")
1818
self.ensure_installed("transformers")
1919

@@ -36,13 +36,31 @@ def ensure_installed(self, package_name):
3636
[sys.executable, "-m", "pip", "install", package_name]
3737
)
3838

39-
async def parse_image(self, image: Image) -> str:
39+
def preprocess_image(self, image: Image.Image) -> np.ndarray:
40+
# Convert to RGB if the image is not already in RGB mode
41+
if image.mode != "RGB":
42+
image = image.convert("RGB")
43+
44+
# Convert to numpy array
45+
image_np = np.array(image)
46+
47+
# Ensure the image is 3D (height, width, channels)
48+
if image_np.ndim == 2:
49+
image_np = np.expand_dims(image_np, axis=-1)
50+
image_np = np.repeat(image_np, 3, axis=-1)
51+
52+
return image_np
53+
54+
async def parse_image(self, image: Image.Image) -> str:
4055
"""Process w/ DonutProcessor and VisionEncoderDecoderModel"""
56+
# Preprocess the image
57+
image_np = self.preprocess_image(image)
58+
4159
task_prompt = "<s_cord-v2>"
4260
decoder_input_ids = self.processor.tokenizer(
4361
task_prompt, add_special_tokens=False, return_tensors="pt"
4462
).input_ids
45-
pixel_values = self.processor(image, return_tensors="pt").pixel_values
63+
pixel_values = self.processor(images=image_np, return_tensors="pt").pixel_values
4664

4765
outputs = self.model.generate(
4866
pixel_values.to(self.device),
@@ -71,7 +89,7 @@ def process_url(self, url: str) -> str:
7189
image = self.downloader.download_image(url)
7290
return self.parse_image(image)
7391

74-
def download_image(self, url: str) -> Image:
92+
def download_image(self, url: str) -> Image.Image:
7593
"""Download an image from URL."""
7694
response = requests.get(url)
7795
image = Image.open(BytesIO(response.content))

datafog/processing/spark_processing/pyspark_udfs.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77

88

99
def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
10-
"""Extract features using en_spacy_pii_fast model.
10+
"""Extract features using en_core_web_lg model.
1111
1212
Returns:
1313
list[list[str]]: Values as arrays in order defined in the PII_ANNOTATION_LABELS.
@@ -40,7 +40,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
4040

4141

4242
def broadcast_pii_annotator_udf(
43-
spark_session=None, spacy_model: str = "en_spacy_pii_fast"
43+
spark_session=None, spacy_model: str = "en_core_web_lg"
4444
):
4545
"""Broadcast PII annotator across Spark cluster and create UDF"""
4646
ensure_installed("pyspark")

datafog/processing/text_processing/spacy_pii_annotator.py

+41-14
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,26 @@
33

44
from pydantic import BaseModel
55

6-
PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
6+
PII_ANNOTATION_LABELS = [
7+
"CARDINAL",
8+
"DATE",
9+
"EVENT",
10+
"FAC",
11+
"GPE",
12+
"LANGUAGE",
13+
"LAW",
14+
"LOC",
15+
"MONEY",
16+
"NORP",
17+
"ORDINAL",
18+
"ORG",
19+
"PERCENT",
20+
"PERSON",
21+
"PRODUCT",
22+
"QUANTITY",
23+
"TIME",
24+
"WORK_OF_ART",
25+
]
726
MAXIMAL_STRING_SIZE = 1000000
827

928

@@ -12,21 +31,29 @@ class SpacyPIIAnnotator(BaseModel):
1231

1332
@classmethod
1433
def create(cls) -> "SpacyPIIAnnotator":
15-
try:
16-
# Try loading as a spaCy model first
17-
import spacy
34+
import spacy
1835

19-
nlp = spacy.load("en_spacy_pii_fast")
36+
try:
37+
nlp = spacy.load("en_core_web_lg")
2038
except OSError:
21-
# If that fails, try importing as a module
22-
try:
23-
import en_spacy_pii_fast
24-
25-
nlp = en_spacy_pii_fast.load()
26-
except ImportError:
27-
raise ImportError(
28-
"Failed to load en_spacy_pii_fast. Make sure it's installed correctly."
29-
)
39+
import subprocess
40+
import sys
41+
42+
interpreter_location = sys.executable
43+
subprocess.run(
44+
[
45+
interpreter_location,
46+
"-m",
47+
"pip",
48+
"install",
49+
"--no-deps",
50+
"--no-cache-dir",
51+
"https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
52+
],
53+
check=True,
54+
)
55+
nlp = spacy.load("en_core_web_lg")
56+
3057
return cls(nlp=nlp)
3158

3259
def annotate(self, text: str) -> Dict[str, List[str]]:

datafog/services/image_service.py

+25-2
Original file line number | Diff line number | Diff line change
@@ -1,15 +1,34 @@
11
import asyncio
2+
import io
3+
import ssl
24
from typing import List
35

6+
import aiohttp
7+
import certifi
48
from PIL import Image
59

610
from datafog.processing.image_processing.donut_processor import DonutProcessor
7-
from datafog.processing.image_processing.image_downloader import ImageDownloader
811
from datafog.processing.image_processing.pytesseract_processor import (
912
PytesseractProcessor,
1013
)
1114

1215

16+
class ImageDownloader:
17+
async def download_image(self, url: str) -> Image.Image:
18+
ssl_context = ssl.create_default_context(cafile=certifi.where())
19+
async with aiohttp.ClientSession(
20+
connector=aiohttp.TCPConnector(ssl=ssl_context)
21+
) as session:
22+
async with session.get(url) as response:
23+
if response.status == 200:
24+
image_data = await response.read()
25+
return Image.open(io.BytesIO(image_data))
26+
else:
27+
raise Exception(
28+
f"Failed to download image. Status code: {response.status}"
29+
)
30+
31+
1332
class ImageService:
1433
def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
1534
self.downloader = ImageDownloader()
@@ -21,7 +40,11 @@ def __init__(self, use_donut: bool = False, use_tesseract: bool = True):
2140
)
2241

2342
async def download_images(self, urls: List[str]) -> List[Image.Image]:
24-
return await self.downloader.download_images(urls)
43+
async def download_image(url: str) -> Image.Image:
44+
return await self.downloader.download_image(url)
45+
46+
tasks = [asyncio.create_task(download_image(url)) for url in urls]
47+
return await asyncio.gather(*tasks, return_exceptions=True)
2548

2649
async def ocr_extract(
2750
self,

requirements-dev.txt

+3-1
Original file line number | Diff line number | Diff line change
@@ -6,10 +6,12 @@ just
66
isort
77
black
88
blacken-docs
9+
certifi
910
flake8
1011
prettier
1112
tox
12-
pytest
13+
pytest==7.4.0
14+
pytest-asyncio==0.21.0
1315
pytest-cov
1416
mypy
1517
autoflake

0 commit comments

Comments
 (0)