Skip to content

Commit 565c692

Browse files
Sid MohanSid Mohan
Sid Mohan
authored and
Sid Mohan
committed
upload_files tests passed
1 parent ae68c09 commit 565c692

File tree

7 files changed

+161
-110
lines changed

7 files changed

+161
-110
lines changed

examples/uploading-file-types.ipynb

+20-16
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
"outputs": [],
3232
"source": [
3333
"# Initialize\n",
34-
"%pip install datafog==2.4.0b1\n",
34+
"%pip install datafog==2.4.0b3\n",
3535
"import json\n",
3636
"\n",
3737
"import requests\n",
@@ -122,27 +122,31 @@
122122
"metadata": {},
123123
"outputs": [],
124124
"source": [
125-
"%pip list"
125+
"# Email confirmation for a event meetup\n",
126+
"# input_file = \"/Users/sidmohan/Desktop/datafog-v2.4.0/datafog-python/tests/files/input_files/agi-builder-meetup.pdf\"\n",
127+
"\n",
128+
"# readthedocs for PyPDF\n",
129+
"input_file = \"/Users/sidmohan/Desktop/datafog-v2.4.0/datafog-python/tests/files/input_files/pypdf-readthedocs-io-en-stable.pdf\"\n",
130+
"\n",
131+
"\n",
132+
"# input_file = \"/Users/sidmohan/Desktop/datafog-v2.4.0/datafog-python/tests/files/input_files/pypdf-readthedocs-io-en-stable.pdf\"\n",
133+
"output = datafog.DataFog.upload_file(uploaded_file_path=input_file)\n",
134+
"print(output)"
126135
]
127136
},
128137
{
129-
"cell_type": "code",
130-
"execution_count": 3,
138+
"cell_type": "markdown",
131139
"metadata": {},
132-
"outputs": [
133-
{
134-
"name": "stdout",
135-
"output_type": "stream",
136-
"text": [
137-
"{'agi-builder-meetup.pdf': \"2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\nContact the HostReport Event29\\nEvent FullIf youʼd like, you can join the waitlist.Please click on the button below to join the waitlist.You will be notified if additional spots becomeavailable.\\nSubscribe\\nHosted ByEric LIU\\n5\\x0030pm - 6\\x0000pm: Doors open and check-in.\\nRegistration\\nFEBThursday, February 295\\x0030 PM - 8\\x0000 PM\\nOpen stage for AI builders, researchersand enthusiasts to share, inspire andtransform.\\nJoin Waitlist\\x00\\x00\\x00\\x00 PM PST\\nPresented byAGI Builders M…\\nAbout Event👋 We're thrilled to invite you to the first AGI Buildersmeetup on the leap day of 2024, February 29th.❤ It's a gathering where AI builders, researchers andenthusiasts share ideas, inspire peers and transform thefuture.💡 Participants can expect engaging tech talks coveringthe latest challenges and advancements in AI.🍕 Light refreshments will be available.Agenda:\\nAGI Builders Meetup SF\\nCloudflareSan Francisco, California\\nFeatured in Generative AI San Fra…\\nSign In\\nhttps://lu.ma/32549yyf\\n1/3\\n2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\n7\\x0040pm - 8\\x0000pm: NetworkingAbout the hosts:Cloudflare helps organizations make employees,applications and networks faster & more secure.BentoML empowers developers to run any AI models inthe cloud and scale with confidence.Note:\\nThis event will be held in person, and due to limitedcapacity, registration is required for entry.Registration will close 2 days before the event.\\n7\\x0010pm - 7\\x0040pm: Enterprise Retrieval - AugmentedGeneration with LlamaIndex\\nWe host monthly meetups in San Francisco, havean idea you'd like to present at future events?Please apply here.\\nLocationCloudflare101 Townsend St, San Francisco, CA 94107, USA\\nby Chaoyu Yang, Founder & CEO, BentoML\\n6\\x0000pm - 6\\x0010pm: Opening\\nby Laurie Voss, VP Developer Relations,LlamaIndex\\n6\\x0040pm - 7\\x0010pm: RAG as a service with 
BentoML\\n6\\x0010pm - 6\\x0040pm: Phoney AI\\nby Craig Dennis, Developer Educator AI,Cloudflare\\nhttps://lu.ma/32549yyf\\n2/3\\n2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\nWhat's NewExplorePricingHelp\\nMap data ©2024 Google\\nView larger map\\nhttps://lu.ma/32549yyf\\n3/3\\n\"}\n"
138-
]
139-
}
140-
],
141140
"source": [
142-
"input_file = \"/Users/sidmohan/Desktop/datafog-v2.4.0/datafog-python/tests/files/input_files/agi-builder-meetup.pdf\"\n",
143-
"output = datafog.DataFog.upload_file(uploaded_file_path=input_file)\n",
144-
"print(output)"
141+
"### PPTX \n"
145142
]
143+
},
144+
{
145+
"cell_type": "code",
146+
"execution_count": null,
147+
"metadata": {},
148+
"outputs": [],
149+
"source": []
146150
}
147151
],
148152
"metadata": {

requirements.txt

+1
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,6 @@ yarl==1.8.1
99
frozenlist==1.3.1
1010
en_spacy_pii_fast
1111
unstructured[pdf]
12+
unstructured[pptx]
1213

1314

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77

88
def __version__():
9-
return "2.4.0b1"
9+
return "2.4.0b4"
1010

1111

1212
project_urls = {

src/datafog/__init__.py

+20
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import tempfile
55
from pathlib import Path
6+
from typing import List
67

78
import pandas as pd
89
import requests
@@ -32,6 +33,7 @@ class DataFog:
3233
nlp (spacy.lang): Spacy language model for PII detection.
3334
"""
3435

36+
# Maintaining support
3537
def __init__(self):
3638
"""
3739
Initialize the DataFog instance.
@@ -59,6 +61,7 @@ def upload_file(uploaded_file_path):
5961
if not uploaded_file_path.exists():
6062
return "File not found."
6163
else:
64+
6265
temp_file = tempfile.NamedTemporaryFile(
6366
delete=True, suffix=uploaded_file_path.suffix
6467
)
@@ -71,6 +74,23 @@ def upload_file(uploaded_file_path):
7174

7275
return texts
7376

77+
@staticmethod
def upload_files(uploaded_files: List[str]):
    """
    Process several uploaded files and merge their extracted text.

    Each path is passed to ``DataFog.upload_file`` and the per-file results
    are combined into a single dictionary.

    Args:
        uploaded_files (List[str]): A list of file paths uploaded by the user.

    Returns:
        Dict[str, str]: A dictionary containing the processed text for each
        uploaded file. When ``upload_file`` returns a message instead of a
        dictionary (it returns "File not found." for a missing path), that
        message is stored under the path's file name.
    """
    texts = {}
    for uploaded_file in uploaded_files:
        result = DataFog.upload_file(uploaded_file)
        if isinstance(result, dict):
            texts.update(result)
        else:
            # upload_file reports a missing file by returning a plain string;
            # dict.update() on a string raises ValueError, so record the
            # message under the file's name instead of aborting the batch.
            texts[Path(uploaded_file).name] = result
    return texts
93+
7494
def __call__(self, input_source, privacy_operation):
7595
"""
7696
Process the input data and apply the specified privacy operation.
Binary file not shown.

tests/test_datafog.py

+55-29
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# test_datafog.py
22
import pytest
3+
import requests
34

45
from datafog import DataFog
56

@@ -15,7 +16,7 @@ def test_upload_file(datafog):
1516
file_text = result[
1617
uploaded_file.split("/")[-1]
1718
] # Extract the text using the file name as key
18-
assert "Cloudflare" in file_text # confirms that OCR is not on
19+
assert "Cloudflare" in file_text
1920
assert "SF" in file_text
2021
assert "Laurie" in file_text
2122
assert "BentoML" in file_text
@@ -24,43 +25,68 @@ def test_upload_file(datafog):
2425
assert "LangChain" not in file_text
2526

2627

27-
# def test_call_with_redact(datafog):
28-
# url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
29-
# privacy_operation = "redact"
28+
def test_upload_files(datafog):
    """Upload two sample PDFs in one call and check each file's extracted text."""
    meetup_pdf = "agi-builder-meetup.pdf"
    pypdf_pdf = "pypdf-readthedocs-io-en-stable.pdf"

    result = datafog.upload_files(
        uploaded_files=[
            "tests/files/input_files/agi-builder-meetup.pdf",
            "tests/files/input_files/pypdf-readthedocs-io-en-stable.pdf",
        ]
    )

    # Both uploads must appear in the result, keyed by bare file name.
    for file_name in (meetup_pdf, pypdf_pdf):
        assert file_name in result

    # (file name, phrase, whether the phrase must be present in the text)
    expectations = [
        (meetup_pdf, "Cloudflare", True),
        (meetup_pdf, "SF", True),
        (meetup_pdf, "Laurie", True),
        (meetup_pdf, "BentoML", True),
        (meetup_pdf, "Llama-Index", False),
        (meetup_pdf, "LlamaIndex", True),
        (meetup_pdf, "LangChain", False),
        (pypdf_pdf, "John Doe", False),
        (pypdf_pdf, "Emily Davis", False),
        (pypdf_pdf, "546 Birch St", False),
        (pypdf_pdf, "Newville", False),
        (pypdf_pdf, "PyPDF", True),
        (pypdf_pdf, "CLI", True),
        (pypdf_pdf, "Python", True),
        (pypdf_pdf, "PyPDF2", True),
    ]
    for file_name, phrase, should_be_present in expectations:
        if should_be_present:
            assert phrase in result[file_name]
        else:
            assert phrase not in result[file_name]
3051

31-
# result = datafog(url, privacy_operation)
3252

33-
# assert "[REDACTED]" in result
34-
# assert "Joe Biden" not in result
53+
def test_call_with_redact(datafog):
54+
url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
55+
privacy_operation = "redact"
3556

57+
result = datafog(url, privacy_operation)
3658

37-
# def test_call_with_annotate(datafog):
38-
# file_path = "sotu_2023.txt"
39-
# url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
40-
# file_content = requests.get(url).text
41-
# with open(file_path, "w") as file:
42-
# file.write(file_content)
43-
# privacy_operation = "annotate"
59+
assert "[REDACTED]" in result
60+
assert "Joe Biden" not in result
4461

45-
# result = datafog(str(file_path), privacy_operation)
4662

47-
# assert "[ORG]" in result
48-
# assert "Joe Biden" not in result
63+
def test_call_with_annotate(datafog):
64+
file_path = "sotu_2023.txt"
65+
url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
66+
file_content = requests.get(url).text
67+
with open(file_path, "w") as file:
68+
file.write(file_content)
69+
privacy_operation = "annotate"
4970

71+
result = datafog(str(file_path), privacy_operation)
5072

51-
# def test_call_with_unsupported_input_type(datafog):
52-
# input_source = 123 # Invalid input type
53-
# privacy_operation = "redact"
73+
assert "[ORG]" in result
74+
assert "Joe Biden" not in result
5475

55-
# with pytest.raises(ValueError, match="Unsupported input source type"):
56-
# datafog(input_source, privacy_operation)
5776

77+
def test_call_with_unsupported_input_type(datafog):
78+
input_source = 123 # Invalid input type
79+
privacy_operation = "redact"
5880

59-
# def test_call_with_unsupported_privacy_operation(datafog):
60-
# url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
61-
# privacy_operation = "invalid_operation"
81+
with pytest.raises(ValueError, match="Unsupported input source type"):
82+
datafog(input_source, privacy_operation)
6283

63-
# with pytest.raises(
64-
# ValueError, match=f"Unsupported privacy operation: {privacy_operation}"
65-
# ):
66-
# datafog(url, privacy_operation)
84+
85+
def test_call_with_unsupported_privacy_operation(datafog):
86+
url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
87+
privacy_operation = "invalid_operation"
88+
89+
with pytest.raises(
90+
ValueError, match=f"Unsupported privacy operation: {privacy_operation}"
91+
):
92+
datafog(url, privacy_operation)

tests/test_presidio.py

+64-64
Original file line numberDiff line numberDiff line change
@@ -1,85 +1,85 @@
1-
# import os
2-
# import sys
1+
import os
2+
import sys
33

4-
# import requests
4+
import requests
55

6-
# from datafog import PresidioEngine as presidio
6+
from datafog import PresidioEngine as presidio
77

8-
# sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
8+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
99

1010

11-
# def test_piiscan_json_detection():
12-
# # Assuming sample_file already contains the data you want to test
13-
# sample_file = "tests/files/input_files/sample.json"
11+
def test_piiscan_json_detection():
12+
# Assuming sample_file already contains the data you want to test
13+
sample_file = "tests/files/input_files/sample.json"
1414

15-
# # Read the contents of sample_file into original_value
16-
# with open(sample_file, "r") as f:
17-
# original_value = f.read()
15+
# Read the contents of sample_file into original_value
16+
with open(sample_file, "r") as f:
17+
original_value = f.read()
1818

19-
# pii_detected = presidio.scan(original_value)
19+
pii_detected = presidio.scan(original_value)
2020

21-
# # write pii_detected to a JSON file called output.json in the same directory as sample_file
22-
# with open(
23-
# "tests/files/output_files/output.json",
24-
# "w",
25-
# ) as f:
26-
# for entity in pii_detected:
27-
# f.write(entity.entity_type)
28-
# f.write("\n")
21+
# write each detected entity type on its own line to tests/files/output_files/output.json
22+
with open(
23+
"tests/files/output_files/output.json",
24+
"w",
25+
) as f:
26+
for entity in pii_detected:
27+
f.write(entity.entity_type)
28+
f.write("\n")
2929

3030

31-
# def test_piiscan_csv_detection():
32-
# # Assuming sample_file already contains the data you want to test
33-
# sample_file = "tests/files/input_files/sample.csv"
31+
def test_piiscan_csv_detection():
32+
# Assuming sample_file already contains the data you want to test
33+
sample_file = "tests/files/input_files/sample.csv"
3434

35-
# # Read the contents of sample_file into original_value
36-
# with open(sample_file, "r") as f:
37-
# original_value = f.read()
35+
# Read the contents of sample_file into original_value
36+
with open(sample_file, "r") as f:
37+
original_value = f.read()
3838

39-
# pii_detected = presidio.scan(original_value)
39+
pii_detected = presidio.scan(original_value)
4040

41-
# # write pii_detected to a JSON file called output.json in the same directory as sample_file
42-
# with open(
43-
# "tests/files/output_files/output.csv",
44-
# "w",
45-
# ) as f:
46-
# for entity in pii_detected:
47-
# f.write(entity.entity_type)
48-
# f.write("\n")
41+
# write each detected entity type on its own line to tests/files/output_files/output.csv
42+
with open(
43+
"tests/files/output_files/output.csv",
44+
"w",
45+
) as f:
46+
for entity in pii_detected:
47+
f.write(entity.entity_type)
48+
f.write("\n")
4949

5050

51-
# def test_piiscan_txt_detection():
52-
# # Assuming sample_file already contains the data you want to test
53-
# sample_file = "tests/files/input_files/sample.txt"
54-
# # Read the contents of sample_file into original_value
55-
# with open(sample_file, "r") as f:
56-
# original_value = f.read()
51+
def test_piiscan_txt_detection():
52+
# Assuming sample_file already contains the data you want to test
53+
sample_file = "tests/files/input_files/sample.txt"
54+
# Read the contents of sample_file into original_value
55+
with open(sample_file, "r") as f:
56+
original_value = f.read()
5757

58-
# pii_detected = presidio.scan(original_value)
58+
pii_detected = presidio.scan(original_value)
5959

60-
# # write pii_detected to a JSON file called output.json in the same directory as sample_file
61-
# with open(
62-
# "tests/files/output_files/output.txt",
63-
# "w",
64-
# ) as f:
65-
# for entity in pii_detected:
66-
# f.write(entity.entity_type)
67-
# f.write("\n")
60+
# write each detected entity type on its own line to tests/files/output_files/output.txt
61+
with open(
62+
"tests/files/output_files/output.txt",
63+
"w",
64+
) as f:
65+
for entity in pii_detected:
66+
f.write(entity.entity_type)
67+
f.write("\n")
6868

6969

70-
# def test_piiscan_url_detection():
71-
# # Assuming sample_file already contains the data you want to test
72-
# sample_url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
70+
def test_piiscan_url_detection():
71+
# Assuming sample_file already contains the data you want to test
72+
sample_url = "https://gist.githubusercontent.com/sidmohan0/1aa3ec38b4e6594d3c34b113f2e0962d/raw/42e57146197be0f85a5901cd1dcdd9ad15b31bab/sotu_2023.txt"
7373

74-
# response = requests.get(sample_url)
75-
# original_value = response.text
76-
# pii_detected = presidio.scan(original_value)
74+
response = requests.get(sample_url)
75+
original_value = response.text
76+
pii_detected = presidio.scan(original_value)
7777

78-
# # write pii_detected to a output.md in the same directory as sample_url
79-
# with open(
80-
# "tests/files/output_files/output.md",
81-
# "w",
82-
# ) as f:
83-
# for entity in pii_detected:
84-
# f.write(entity.entity_type)
85-
# f.write("\n")
78+
# write each detected entity type on its own line to tests/files/output_files/output.md
79+
with open(
80+
"tests/files/output_files/output.md",
81+
"w",
82+
) as f:
83+
for entity in pii_detected:
84+
f.write(entity.entity_type)
85+
f.write("\n")

0 commit comments

Comments
 (0)