Updated UpdateTextPosition notebook

mykolamelnykml · mykolamelnykml · commit 16210316172d · 2020-08-11T13:38:10.000+03:00
diff --git a/jupyter/SparkOcrUpdateTextPosition.ipynb b/jupyter/SparkOcrUpdateTextPosition.ipynb
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,9 +65,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
+      "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
+      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "# install from PYPI using secret\n",
     "%pip install spark-nlp==2.5.5\n",
@@ -93,14 +104,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "SparkConf Configured, Starting to listen on port: 59744\n",
+      "SparkConf Configured, Starting to listen on port: 53378\n",
       "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
      ]
     },
@@ -114,11 +125,11 @@
        "        <div>\n",
        "            <p><b>SparkContext</b></p>\n",
        "\n",
-       "            <p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
+       "            <p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
        "\n",
        "            <dl>\n",
        "              <dt>Version</dt>\n",
-       "                <dd><code>v2.4.4</code></dd>\n",
+       "                <dd><code>v2.3.2</code></dd>\n",
        "              <dt>Master</dt>\n",
        "                <dd><code>local[*]</code></dd>\n",
        "              <dt>AppName</dt>\n",
@@ -130,10 +141,10 @@
        "        "
       ],
       "text/plain": [
-       "<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
+       "<pyspark.sql.session.SparkSession at 0x1195bb510>"
       ]
      },
-     "execution_count": 2,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -150,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,7 +204,7 @@
     "          .setOutputCol(\"spell\")\n",
     "    \n",
     "    tokenAssem = TokenAssembler() \\\n",
-    "          .setInputCols(\"spell\") \\\n",
+    "          .setInputCols([\"spell\", \"document\"]) \\\n",
     "          .setOutputCol(\"newDocs\")\n",
     "\n",
     "    updatedText = UpdateTextPosition() \\\n",
@@ -248,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -266,9 +277,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 20,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "spellcheck_norvig download started this may take some time.\n",
+      "Approximate size to download 4.2 MB\n",
+      "[OK!]\n"
+     ]
+    }
+   ],
    "source": [
     "ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
     "updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
@@ -288,7 +309,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 21,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -298,10 +319,10 @@
     {
      "data": {
       "text/plain": [
-       "72914"
+       "1671"
       ]
      },
-     "execution_count": 9,
+     "execution_count": 21,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -344,4 +365,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}