|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "<img src=\"../public/colorlogo.png\" width=\"50%\"/>\n", |
| 8 | + "\n", |
| 9 | + "\n", |
| 10 | + "[Homepage](https://www.datafog.ai) | \n", |
| 11 | + "[Discord](https://discord.gg/bzDth394R4) | \n", |
| 12 | + "[Github](https://github.com/datafog/datafog-python) | \n", |
| 13 | + "[Contact](mailto:[email protected])\n", |
| 14 | + "\n" |
| 15 | + ] |
| 16 | + }, |
| 17 | + { |
| 18 | + "cell_type": "markdown", |
| 19 | + "metadata": {}, |
| 20 | + "source": [ |
| 21 | + "## What we're covering\n", |
| 22 | + "\n", |
| 23 | + "This notebook shows how to use DataFog to scan data loaded from different file types, starting with JSON.\n", |
| 24 | + "\n" |
| 25 | + ] |
| 26 | + }, |
| 27 | + { |
| 28 | + "cell_type": "code", |
| 29 | + "execution_count": 5, |
| 30 | + "metadata": {}, |
| 31 | + "outputs": [ |
| 32 | + { |
| 33 | + "name": "stdout", |
| 34 | + "output_type": "stream", |
| 35 | + "text": [ |
| 36 | + "Collecting datafog==2.3.2b10\n", |
| 37 | + " Downloading datafog-2.3.2b10.tar.gz (13 kB)\n", |
| 38 | + " Installing build dependencies ... \u001b[?25ldone\n", |
| 39 | + "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", |
| 40 | + "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", |
| 41 | + "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", |
| 42 | + "\u001b[?25hRequirement already satisfied: pandas==2.2.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.2.1)\n", |
| 43 | + "Requirement already satisfied: presidio-analyzer==2.2.353 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.2.353)\n", |
| 44 | + "Requirement already satisfied: pytest==8.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (8.0.2)\n", |
| 45 | + "Requirement already satisfied: Requests==2.31.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (2.31.0)\n", |
| 46 | + "Requirement already satisfied: spacy==3.4.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (3.4.4)\n", |
| 47 | + "Requirement already satisfied: en-spacy-pii-fast in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from datafog==2.3.2b10) (0.0.0)\n", |
| 48 | + "Requirement already satisfied: numpy<2,>=1.23.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (1.26.4)\n", |
| 49 | + "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2.9.0.post0)\n", |
| 50 | + "Requirement already satisfied: pytz>=2020.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2024.1)\n", |
| 51 | + "Requirement already satisfied: tzdata>=2022.7 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pandas==2.2.1->datafog==2.3.2b10) (2024.1)\n", |
| 52 | + "Requirement already satisfied: regex in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (2023.12.25)\n", |
| 53 | + "Requirement already satisfied: tldextract in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (5.1.2)\n", |
| 54 | + "Requirement already satisfied: pyyaml in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (6.0.1)\n", |
| 55 | + "Requirement already satisfied: phonenumbers<9.0.0,>=8.12 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from presidio-analyzer==2.2.353->datafog==2.3.2b10) (8.13.32)\n", |
| 56 | + "Requirement already satisfied: iniconfig in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (2.0.0)\n", |
| 57 | + "Requirement already satisfied: packaging in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (24.0)\n", |
| 58 | + "Requirement already satisfied: pluggy<2.0,>=1.3.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pytest==8.0.2->datafog==2.3.2b10) (1.4.0)\n", |
| 59 | + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2.1.1)\n", |
| 60 | + "Requirement already satisfied: idna<4,>=2.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (3.6)\n", |
| 61 | + "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2.2.1)\n", |
| 62 | + "Requirement already satisfied: certifi>=2017.4.17 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from Requests==2.31.0->datafog==2.3.2b10) (2024.2.2)\n", |
| 63 | + "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.10 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.0.12)\n", |
| 64 | + "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.0.5)\n", |
| 65 | + "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.0.10)\n", |
| 66 | + "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.0.8)\n", |
| 67 | + "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.0.9)\n", |
| 68 | + "Requirement already satisfied: thinc<8.2.0,>=8.1.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (8.1.12)\n", |
| 69 | + "Requirement already satisfied: wasabi<1.1.0,>=0.9.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.10.1)\n", |
| 70 | + "Requirement already satisfied: srsly<3.0.0,>=2.4.3 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.4.8)\n", |
| 71 | + "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (2.0.10)\n", |
| 72 | + "Requirement already satisfied: typer<0.8.0,>=0.3.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.7.0)\n", |
| 73 | + "Requirement already satisfied: pathy>=0.3.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (0.11.0)\n", |
| 74 | + "Requirement already satisfied: smart-open<7.0.0,>=5.2.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (6.4.0)\n", |
| 75 | + "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (4.66.2)\n", |
| 76 | + "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (1.10.14)\n", |
| 77 | + "Requirement already satisfied: jinja2 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.1.3)\n", |
| 78 | + "Requirement already satisfied: setuptools in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (65.5.0)\n", |
| 79 | + "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from spacy==3.4.4->datafog==2.3.2b10) (3.3.0)\n", |
| 80 | + "Requirement already satisfied: pathlib-abc==0.1.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pathy>=0.3.5->spacy==3.4.4->datafog==2.3.2b10) (0.1.1)\n", |
| 81 | + "Requirement already satisfied: typing-extensions>=4.2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from pydantic!=1.8,!=1.8.1,<1.11.0,>=1.7.4->spacy==3.4.4->datafog==2.3.2b10) (4.10.0)\n", |
| 82 | + "Requirement already satisfied: six>=1.5 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from python-dateutil>=2.8.2->pandas==2.2.1->datafog==2.3.2b10) (1.16.0)\n", |
| 83 | + "Requirement already satisfied: blis<0.8.0,>=0.7.8 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.0->spacy==3.4.4->datafog==2.3.2b10) (0.7.11)\n", |
| 84 | + "Requirement already satisfied: confection<1.0.0,>=0.0.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from thinc<8.2.0,>=8.1.0->spacy==3.4.4->datafog==2.3.2b10) (0.1.4)\n", |
| 85 | + "Requirement already satisfied: click<9.0.0,>=7.1.1 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from typer<0.8.0,>=0.3.0->spacy==3.4.4->datafog==2.3.2b10) (8.1.7)\n", |
| 86 | + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from jinja2->spacy==3.4.4->datafog==2.3.2b10) (2.1.5)\n", |
| 87 | + "Requirement already satisfied: requests-file>=1.4 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from tldextract->presidio-analyzer==2.2.353->datafog==2.3.2b10) (2.0.0)\n", |
| 88 | + "Requirement already satisfied: filelock>=3.0.8 in /Users/sidmohan/Desktop/datafog-pypi-v2.3.2/.venv/lib/python3.11/site-packages (from tldextract->presidio-analyzer==2.2.353->datafog==2.3.2b10) (3.13.1)\n", |
| 89 | + "Building wheels for collected packages: datafog\n", |
| 90 | + " Building wheel for datafog (pyproject.toml) ... \u001b[?25ldone\n", |
| 91 | + "\u001b[?25h Created wheel for datafog: filename=datafog-2.3.2b10-py3-none-any.whl size=10839 sha256=98c6651a54b1e3b5d878d59fa534c7c8c22e1e6d4a49f04b43d4e447b9bd7e90\n", |
| 92 | + " Stored in directory: /Users/sidmohan/Library/Caches/pip/wheels/a2/87/a5/513ca3a2ad3d826f945f1277a85346ae1bfd4d6261bb202b2d\n", |
| 93 | + "Successfully built datafog\n", |
| 94 | + "Installing collected packages: datafog\n", |
| 95 | + " Attempting uninstall: datafog\n", |
| 96 | + " Found existing installation: datafog 2.3.2b9\n", |
| 97 | + " Uninstalling datafog-2.3.2b9:\n", |
| 98 | + " Successfully uninstalled datafog-2.3.2b9\n", |
| 99 | + "Successfully installed datafog-2.3.2b10\n", |
| 100 | + "Note: you may need to restart the kernel to use updated packages.\n" |
| 101 | + ] |
| 102 | + } |
| 103 | + ], |
| 104 | + "source": [ |
| 105 | + "# Initialize\n", |
| 106 | + "%pip install datafog==2.3.2b10\n", |
| 107 | + "import json\n", |
| 108 | + "\n", |
| 109 | + "import requests\n", |
| 110 | + "from datafog import PresidioEngine as presidio\n", |
| 111 | + "import pandas as pd" |
| 112 | + ] |
| 113 | + }, |
| 114 | + { |
| 115 | + "cell_type": "markdown", |
| 116 | + "metadata": {}, |
| 117 | + "source": [ |
| 118 | + "### JSON" |
| 119 | + ] |
| 120 | + }, |
| 121 | + { |
| 122 | + "cell_type": "code", |
| 123 | + "execution_count": 6, |
| 124 | + "metadata": {}, |
| 125 | + "outputs": [ |
| 126 | + { |
| 127 | + "name": "stdout", |
| 128 | + "output_type": "stream", |
| 129 | + "text": [ |
| 130 | + " uuid \\\n", |
| 131 | + "0 a1b2c3d4-e5f6-7g8h-9i0j-k1l2m3n4o5p6 \n", |
| 132 | + "1 q9w8e7r6-t5y4-u3i2-o1p0-a9s8d7f6g5h4 \n", |
| 133 | + "2 z1x2c3v4-b5n6-m7q8-w9e0-r1t2y3u4i5o6 \n", |
| 134 | + "3 p1o2i3u4-y5t6-r7e8-w9q0-a1s2d3f4g5h6 \n", |
| 135 | + "4 l1k2j3h4-g5f6-d7s8-a9q0-w1e2r3t4y5u6 \n", |
| 136 | + "\n", |
| 137 | + " text_chunk \\\n", |
| 138 | + "0 Cisco to Acquire Splunk, to Help Make Organiza... \n", |
| 139 | + "1 Cisco intends to acquire Splunk for $157 per s... \n", |
| 140 | + "2 Our combined capabilities will drive the next ... \n", |
| 141 | + "3 Tidal Partners LLC is acting as financial advi... \n", |
| 142 | + "4 Cisco will host a conference call for Thursday... \n", |
| 143 | + "\n", |
| 144 | + " doc_source \n", |
| 145 | + "0 CEO_Google_Drive_Press_Release_Draft.docx \n", |
| 146 | + "1 CEO_Google_Drive_Press_Release_Draft.docx \n", |
| 147 | + "2 CEO_Google_Drive_Press_Release_Draft.docx \n", |
| 148 | + "3 CEO_Google_Drive_Press_Release_Draft.docx \n", |
| 149 | + "4 CEO_Google_Drive_Press_Release_Draft.docx \n" |
| 150 | + ] |
| 151 | + } |
| 152 | + ], |
| 153 | + "source": [ |
| 154 | + "# Fetch the sample records from a gist pinned to a fixed revision\n", |
| 155 | + "url = \"https://gist.githubusercontent.com/sidmohan0/757185e0b9ff63fe00096baa0ce3fa45/raw/cb30da88e985d171bef281c927434cac52c239ea/sample.json\"\n", |
| 156 | + "response = requests.get(url, timeout=30)\n", |
| 157 | + "# Stop on an HTTP error instead of passing an error page to the JSON parser\n", |
| 158 | + "response.raise_for_status()\n", |
| 159 | + "# Parse the JSON payload\n", |
| 160 | + "data = json.loads(response.text)\n", |
| 161 | + "# Create a DataFrame from the parsed records\n", |
| 162 | + "df = pd.DataFrame(data)\n", |
| 163 | + "print(df.head(10))" |
| 164 | + ] |
| 165 | + }, |
| 166 | + { |
| 167 | + "cell_type": "code", |
| 168 | + "execution_count": 7, |
| 169 | + "metadata": {}, |
| 170 | + "outputs": [], |
| 171 | + "source": [ |
| 172 | + "ban_list = [\n", |
| 173 | + "    \"Cisco\",\n", |
| 174 | + "    \"Splunk\",\n", |
| 175 | + "    \"Tidal Partners LLC\",\n", |
| 176 | + "    \"$157 per share\",\n", |
| 177 | + "    \"$28 billion\",\n", |
| 178 | + "    \"equity value\",\n", |
| 179 | + "    \"Chuck Robbins\",\n", |
| 180 | + "    \"acquire\",\n", |
| 181 | + "    \"acquisition\",\n", |
| 182 | + "    \"September 21, 2023\",\n", |
| 183 | + "    \"5:15 am (PT)\",\n", |
| 184 | + "    \"8:15 am (ET)\",\n", |
| 185 | + "    \"conference call\",\n", |
| 186 | + "    \"AI-enabled security\",\n", |
| 187 | + "    \"observability\",\n", |
| 188 | + "    \"next generation\",\n", |
| 189 | + "    \"financial advisor\",\n", |
| 190 | + "    \"CEO_Google_Drive_Press_Release_Draft.docx\",\n", |
| 191 | + "]\n", |
| 192 | + "\n", |
| 193 | + "\n", |
| 194 | + "def scan_text(text):\n", |
| 195 | + "    \"\"\"Scan one text chunk with PresidioEngine, passing ban_list as the deny list.\"\"\"\n", |
| 196 | + "    return presidio.scan(text, deny_list=ban_list)\n", |
| 197 | + "\n", |
| 198 | + "df[\"scan_results\"] = df[\"text_chunk\"].apply(scan_text)" |
| 199 | + ] |
| 200 | + }, |
| 201 | + { |
| 202 | + "cell_type": "code", |
| 203 | + "execution_count": 8, |
| 204 | + "metadata": {}, |
| 205 | + "outputs": [ |
| 206 | + { |
| 207 | + "name": "stdout", |
| 208 | + "output_type": "stream", |
| 209 | + "text": [ |
| 210 | + "0 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
| 211 | + "1 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
| 212 | + "2 [type: CUSTOM_PII, start: 41, end: 56, score: ...\n", |
| 213 | + "3 [type: CUSTOM_PII, start: 0, end: 18, score: 1...\n", |
| 214 | + "4 [type: CUSTOM_PII, start: 0, end: 5, score: 1....\n", |
| 215 | + "Name: scan_results, dtype: object\n" |
| 216 | + ] |
| 217 | + } |
| 218 | + ], |
| 219 | + "source": [ |
| 220 | + "print(df.loc[:, \"scan_results\"])" |
| 221 | + ] |
| 222 | + } |
| 223 | + ], |
| 224 | + "metadata": { |
| 225 | + "kernelspec": { |
| 226 | + "display_name": ".venv", |
| 227 | + "language": "python", |
| 228 | + "name": "python3" |
| 229 | + }, |
| 230 | + "language_info": { |
| 231 | + "codemirror_mode": { |
| 232 | + "name": "ipython", |
| 233 | + "version": 3 |
| 234 | + }, |
| 235 | + "file_extension": ".py", |
| 236 | + "mimetype": "text/x-python", |
| 237 | + "name": "python", |
| 238 | + "nbconvert_exporter": "python", |
| 239 | + "pygments_lexer": "ipython3", |
| 240 | + "version": "3.11.7" |
| 241 | + } |
| 242 | + }, |
| 243 | + "nbformat": 4, |
| 244 | + "nbformat_minor": 2 |
| 245 | +} |
0 commit comments