|
26 | 26 | },
|
27 | 27 | {
|
28 | 28 | "cell_type": "code",
|
29 |
| - "execution_count": 5, |
| 29 | + "execution_count": null, |
30 | 30 | "metadata": {},
|
31 |
| - "outputs": [ |
32 |
| - { |
33 |
| - "name": "stdout", |
34 |
| - "output_type": "stream", |
35 |
| - "text": [ |
36 |
| - "Collecting datafog==2.3.2b10\n", |
37 |
| - " Downloading datafog-2.3.2b10.tar.gz (13 kB)\n", |
38 |
| - " Installing build dependencies ... \u001b[?25ldone\n", |
39 |
| - "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", |
40 |
| - "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", |
41 |
| - "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", |
42 |
| - "\u001b[?25hRequirement already satisfied: pandas==2.2.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.2.1)\n", |
43 |
| - "Requirement already satisfied: presidio-analyzer==2.2.353 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.2.353)\n", |
44 |
| - "Requirement already satisfied: pytest==8.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (8.0.2)\n", |
45 |
| - "Requirement already satisfied: Requests==2.31.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.31.0)\n", |
46 |
| - "Requirement already satisfied: spacy==3.4.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (3.4.4)\n", |
47 |
| - "Requirement already satisfied: en-spacy-pii-fast in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (0.0.0)\n", |
48 |
| - "Requirement already satisfied: numpy<2,>=1.23.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (1.26.4)\n", |
49 |
| - "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2.9.0.post0)\n", |
50 |
| - "Requirement already satisfied: pytz>=2020.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2024.1)\n", |
51 |
| - "Requirement already satisfied: tzdata>=2022.7 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2024.1)\n", |
52 |
| - "Requirement already satisfied: regex in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (2023.12.25)\n", |
53 |
| - "Requirement already satisfied: tldextract in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (5.1.2)\n", |
54 |
| - "Requirement already satisfied: pyyaml in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (6.0.1)\n", |
55 |
| - "Requirement already satisfied: phonenumbers<9.0.0,>=8.12 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (8.13.32)\n", |
56 |
| - "Requirement already satisfied: iniconfig in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (2.0.0)\n", |
57 |
| - "Requirement already satisfied: packaging in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (24.0)\n", |
58 |
| - "Requirement already satisfied: pluggy<2.0,>=1.3.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (1.4.0)\n", |
59 |
| - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2.1.1)\n", |
60 |
| - "Requirement already satisfied: idna<4,>=2.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (3.6)\n", |
61 |
| - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2.2.1)\n", |
62 |
| - "Requirement already satisfied: certifi>=2017.4.17 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2024.2.2)\n", |
63 |
| - "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.10 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.0.12)\n", |
64 |
| - "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.0.5)\n", |
65 |
| - "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.0.10)\n", |
66 |
| - "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.0.8)\n", |
67 |
| - "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.0.9)\n", |
68 |
| - "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (8.1.12)\n", |
69 |
| - "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.10.1)\n", |
70 |
| - "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.4.8)\n", |
71 |
| - "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.0.10)\n", |
72 |
| - "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.7.0)\n", |
73 |
| - "Requirement already satisfied: pathy>=0.3.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.11.0)\n", |
74 |
| - "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (6.4.0)\n", |
75 |
| - "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (4.66.2)\n", |
76 |
| - "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.10.14)\n", |
77 |
| - "Requirement already satisfied: jinja2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.1.3)\n", |
78 |
| - "Requirement already satisfied: setuptools in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (65.5.0)\n", |
79 |
| - "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.3.0)\n", |
80 |
| - "Requirement already satisfied: pathlib-abc==0.1.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pathy>=0.3.5->spacy==3.4.4->datafog==2.3.2b10) (0.1.1)\n", |
81 |
| - "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy==3.4.4->datafog==2.3.2b10) (4.10.0)\n", |
82 |
| - "Requirement already satisfied: six>=1.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas==2.2.1->datafog==2.3.2b10) (1.16.0)\n", |
83 |
| - "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.0->spacy==3.4.4->datafog==2.3.2b10) (0.7.11)\n", |
84 |
| - "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.0->spacy==3.4.4->datafog==2.3.2b10) (0.1.4)\n", |
85 |
| - "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from typer<0.8.0,>=0.3.0->spacy==3.4.4->datafog==2.3.2b10) (8.1.7)\n", |
86 |
| - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from jinja2->spacy==3.4.4->datafog==2.3.2b10) (2.1.5)\n", |
87 |
| - "Requirement already satisfied: requests-file>=1.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from tldextract->presidio-analyzer==2.2.353->datafog==2.3.2b10) (2.0.0)\n", |
88 |
| - "Requirement already satisfied: filelock>=3.0.8 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from tldextract->presidio-analyzer==2.2.353->datafog==2.3.2b10) (3.13.1)\n", |
89 |
| - "Building wheels for collected packages: datafog\n", |
90 |
| - " Building wheel for datafog (pyproject.toml) ... \u001b[?25ldone\n", |
91 |
| - "\u001b[?25h Created wheel for datafog: filename=datafog-2.3.2b10-py3-none-any.whl size=10839 sha256=98c6651a54b1e3b5d878d59fa534c7c8c22e1e6d4a49f04b43d4e447b9bd7e90\n", |
92 |
| - " Stored in directory: /Users/sidmohan/Library/Caches/pip/wheels/a2/87/a5/513ca3a2ad3d826f945f1277a85346ae1bfd4d6261bb202b2d\n", |
93 |
| - "Successfully built datafog\n", |
94 |
| - "Installing collected packages: datafog\n", |
95 |
| - " Attempting uninstall: datafog\n", |
96 |
| - " Found existing installation: datafog 2.3.2b9\n", |
97 |
| - " Uninstalling datafog-2.3.2b9:\n", |
98 |
| - " Successfully uninstalled datafog-2.3.2b9\n", |
99 |
| - "Successfully installed datafog-2.3.2b10\n", |
100 |
| - "Note: you may need to restart the kernel to use updated packages.\n" |
101 |
| - ] |
102 |
| - } |
103 |
| - ], |
| 31 | + "outputs": [], |
104 | 32 | "source": [
|
105 | 33 | "# Initialize\n",
|
106 |
| - "%pip install datafog==2.3.2b10\n", |
| 34 | + "%pip install datafog==2.4.0b1\n", |
107 | 35 | "import json\n",
|
108 | 36 | "\n",
|
109 | 37 | "import requests\n",
|
| 38 | + "import datafog\n", |
110 | 39 | "from datafog import PresidioEngine as presidio\n",
|
111 | 40 | "import pandas as pd"
|
112 | 41 | ]
|
|
120 | 49 | },
|
121 | 50 | {
|
122 | 51 | "cell_type": "code",
|
123 |
| - "execution_count": 6, |
| 52 | + "execution_count": null, |
124 | 53 | "metadata": {},
|
125 |
| - "outputs": [ |
126 |
| - { |
127 |
| - "name": "stdout", |
128 |
| - "output_type": "stream", |
129 |
| - "text": [ |
130 |
| - " uuid \\\n", |
131 |
| - "0 a1b2c3d4-e5f6-7g8h-9i0j-k1l2m3n4o5p6 \n", |
132 |
| - "1 q9w8e7r6-t5y4-u3i2-o1p0-a9s8d7f6g5h4 \n", |
133 |
| - "2 z1x2c3v4-b5n6-m7q8-w9e0-r1t2y3u4i5o6 \n", |
134 |
| - "3 p1o2i3u4-y5t6-r7e8-w9q0-a1s2d3f4g5h6 \n", |
135 |
| - "4 l1k2j3h4-g5f6-d7s8-a9q0-w1e2r3t4y5u6 \n", |
136 |
| - "\n", |
137 |
| - " text_chunk \\\n", |
138 |
| - "0 Cisco to Acquire Splunk, to Help Make Organiza... \n", |
139 |
| - "1 Cisco intends to acquire Splunk for $157 per s... \n", |
140 |
| - "2 Our combined capabilities will drive the next ... \n", |
141 |
| - "3 Tidal Partners LLC is acting as financial advi... \n", |
142 |
| - "4 Cisco will host a conference call for Thursday... \n", |
143 |
| - "\n", |
144 |
| - " doc_source \n", |
145 |
| - "0 CEO_Google_Drive_Press_Release_Draft.docx \n", |
146 |
| - "1 CEO_Google_Drive_Press_Release_Draft.docx \n", |
147 |
| - "2 CEO_Google_Drive_Press_Release_Draft.docx \n", |
148 |
| - "3 CEO_Google_Drive_Press_Release_Draft.docx \n", |
149 |
| - "4 CEO_Google_Drive_Press_Release_Draft.docx \n" |
150 |
| - ] |
151 |
| - } |
152 |
| - ], |
| 54 | + "outputs": [], |
153 | 55 | "source": [
|
154 | 56 | "# Load the JSON data from the URL\n",
|
155 | 57 | "url = \"https://gist.githubusercontent.com/sidmohan0/757185e0b9ff63fe00096baa0ce3fa45/raw/cb30da88e985d171bef281c927434cac52c239ea/sample.json\"\n",
|
|
165 | 67 | },
|
166 | 68 | {
|
167 | 69 | "cell_type": "code",
|
168 |
| - "execution_count": 7, |
| 70 | + "execution_count": null, |
169 | 71 | "metadata": {},
|
170 | 72 | "outputs": [],
|
171 | 73 | "source": [
|
|
200 | 102 | },
|
201 | 103 | {
|
202 | 104 | "cell_type": "code",
|
203 |
| - "execution_count": 8, |
| 105 | + "execution_count": null, |
| 106 | + "metadata": {}, |
| 107 | + "outputs": [], |
| 108 | + "source": [ |
| 109 | + "print(df[\"scan_results\"])" |
| 110 | + ] |
| 111 | + }, |
| 112 | + { |
| 113 | + "cell_type": "markdown", |
| 114 | + "metadata": {}, |
| 115 | + "source": [ |
| 116 | + "### PDF" |
| 117 | + ] |
| 118 | + }, |
| 119 | + { |
| 120 | + "cell_type": "code", |
| 121 | + "execution_count": null, |
| 122 | + "metadata": {}, |
| 123 | + "outputs": [], |
| 124 | + "source": [ |
| 125 | + "%pip list\n" |
| 126 | + ] |
| 127 | + }, |
| 128 | + { |
| 129 | + "cell_type": "code", |
| 130 | + "execution_count": 3, |
204 | 131 | "metadata": {},
|
205 | 132 | "outputs": [
|
206 | 133 | {
|
207 | 134 | "name": "stdout",
|
208 | 135 | "output_type": "stream",
|
209 | 136 | "text": [
|
210 |
| - "0 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
211 |
| - "1 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
212 |
| - "2 [type: CUSTOM_PII, start: 41, end: 56, score: ...\n", |
213 |
| - "3 [type: CUSTOM_PII, start: 0, end: 18, score: 1...\n", |
214 |
| - "4 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
215 |
| - "Name: scan_results, dtype: object\n" |
| 137 | + "{'agi-builder-meetup.pdf': \"2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\nContact the HostReport Event29\\nEvent FullIf youʼd like, you can join the waitlist.Please click on the button below to join the waitlist.You will be notified if additional spots becomeavailable.\\nSubscribe\\nHosted ByEric LIU\\n5\\x0030pm - 6\\x0000pm: Doors open and check-in.\\nRegistration\\nFEBThursday, February 295\\x0030 PM - 8\\x0000 PM\\nOpen stage for AI builders, researchersand enthusiasts to share, inspire andtransform.\\nJoin Waitlist\\x00\\x00\\x00\\x00 PM PST\\nPresented byAGI Builders M…\\nAbout Event👋 We're thrilled to invite you to the first AGI Buildersmeetup on the leap day of 2024, February 29th.❤ It's a gathering where AI builders, researchers andenthusiasts share ideas, inspire peers and transform thefuture.💡 Participants can expect engaging tech talks coveringthe latest challenges and advancements in AI.🍕 Light refreshments will be available.Agenda:\\nAGI Builders Meetup SF\\nCloudflareSan Francisco, California\\nFeatured in Generative AI San Fra…\\nSign In\\nhttps://lu.ma/32549yyf\\n1/3\\n2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\n7\\x0040pm - 8\\x0000pm: NetworkingAbout the hosts:Cloudflare helps organizations make employees,applications and networks faster & more secure.BentoML empowers developers to run any AI models inthe cloud and scale with confidence.Note:\\nThis event will be held in person, and due to limitedcapacity, registration is required for entry.Registration will close 2 days before the event.\\n7\\x0010pm - 7\\x0040pm: Enterprise Retrieval - AugmentedGeneration with LlamaIndex\\nWe host monthly meetups in San Francisco, havean idea you'd like to present at future events?Please apply here.\\nLocationCloudflare101 Townsend St, San Francisco, CA 94107, USA\\nby Chaoyu Yang, Founder & CEO, BentoML\\n6\\x0000pm - 6\\x0010pm: Opening\\nby Laurie Voss, VP Developer Relations,LlamaIndex\\n6\\x0040pm - 7\\x0010pm: RAG as a service with BentoML\\n6\\x0010pm - 6\\x0040pm: Phoney AI\\nby Craig Dennis, Developer Educator AI,Cloudflare\\nhttps://lu.ma/32549yyf\\n2/3\\n2/26/24, 2:16 PM\\nAGI Builders Meetup SF · Luma\\nWhat's NewExplorePricingHelp\\nMap data ©2024 Google\\nView larger map\\nhttps://lu.ma/32549yyf\\n3/3\\n\"}\n" |
216 | 138 | ]
|
217 | 139 | }
|
218 | 140 | ],
|
219 | 141 | "source": [
|
220 |
| - "print(df[\"scan_results\"])" |
| 142 | + "input_file = \"/Users/sidmohan/Desktop/datafog-v2.4.0/datafog-python/tests/files/input_files/agi-builder-meetup.pdf\"\n", |
| 143 | + "output = datafog.DataFog.upload_file(uploaded_file_path=input_file)\n", |
| 144 | + "print(output)\n", |
| 145 | + "\n" |
221 | 146 | ]
|
222 | 147 | }
|
223 | 148 | ],
|
|
237 | 162 | "name": "python",
|
238 | 163 | "nbconvert_exporter": "python",
|
239 | 164 | "pygments_lexer": "ipython3",
|
240 |
| - "version": "3.11.7" |
| 165 | + "version": "3.10.1" |
241 | 166 | }
|
242 | 167 | },
|
243 | 168 | "nbformat": 4,
|
|
0 commit comments