Skip to content

Commit 1621031

Browse files
Updated UpdateTextPosition notebook
1 parent 104a542 commit 1621031

File tree

1 file changed

+42
-21
lines changed

1 file changed

+42
-21
lines changed

jupyter/SparkOcrUpdateTextPosition.ipynb

Lines changed: 42 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
},
2525
{
2626
"cell_type": "code",
27-
"execution_count": 1,
27+
"execution_count": 10,
2828
"metadata": {},
2929
"outputs": [],
3030
"source": [
@@ -36,7 +36,7 @@
3636
},
3737
{
3838
"cell_type": "code",
39-
"execution_count": null,
39+
"execution_count": 11,
4040
"metadata": {},
4141
"outputs": [],
4242
"source": [
@@ -51,7 +51,7 @@
5151
},
5252
{
5353
"cell_type": "code",
54-
"execution_count": null,
54+
"execution_count": 12,
5555
"metadata": {},
5656
"outputs": [],
5757
"source": [
@@ -65,9 +65,20 @@
6565
},
6666
{
6767
"cell_type": "code",
68-
"execution_count": null,
68+
"execution_count": 2,
6969
"metadata": {},
70-
"outputs": [],
70+
"outputs": [
71+
{
72+
"name": "stdout",
73+
"output_type": "stream",
74+
"text": [
75+
"Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
76+
"\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
77+
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
78+
"Note: you may need to restart the kernel to use updated packages.\n"
79+
]
80+
}
81+
],
7182
"source": [
7283
"# install from PYPI using secret\n",
7384
"%pip install spark-nlp==2.5.5\n",
@@ -93,14 +104,14 @@
93104
},
94105
{
95106
"cell_type": "code",
96-
"execution_count": 2,
107+
"execution_count": 13,
97108
"metadata": {},
98109
"outputs": [
99110
{
100111
"name": "stdout",
101112
"output_type": "stream",
102113
"text": [
103-
"SparkConf Configured, Starting to listen on port: 59744\n",
114+
"SparkConf Configured, Starting to listen on port: 53378\n",
104115
"JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
105116
]
106117
},
@@ -114,11 +125,11 @@
114125
" <div>\n",
115126
" <p><b>SparkContext</b></p>\n",
116127
"\n",
117-
" <p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
128+
" <p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
118129
"\n",
119130
" <dl>\n",
120131
" <dt>Version</dt>\n",
121-
" <dd><code>v2.4.4</code></dd>\n",
132+
" <dd><code>v2.3.2</code></dd>\n",
122133
" <dt>Master</dt>\n",
123134
" <dd><code>local[*]</code></dd>\n",
124135
" <dt>AppName</dt>\n",
@@ -130,10 +141,10 @@
130141
" "
131142
],
132143
"text/plain": [
133-
"<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
144+
"<pyspark.sql.session.SparkSession at 0x1195bb510>"
134145
]
135146
},
136-
"execution_count": 2,
147+
"execution_count": 13,
137148
"metadata": {},
138149
"output_type": "execute_result"
139150
}
@@ -150,7 +161,7 @@
150161
},
151162
{
152163
"cell_type": "code",
153-
"execution_count": 4,
164+
"execution_count": 14,
154165
"metadata": {},
155166
"outputs": [],
156167
"source": [
@@ -170,7 +181,7 @@
170181
},
171182
{
172183
"cell_type": "code",
173-
"execution_count": 5,
184+
"execution_count": 18,
174185
"metadata": {},
175186
"outputs": [],
176187
"source": [
@@ -193,7 +204,7 @@
193204
" .setOutputCol(\"spell\")\n",
194205
" \n",
195206
" tokenAssem = TokenAssembler() \\\n",
196-
" .setInputCols(\"spell\") \\\n",
207+
" .setInputCols([\"spell\", \"document\"]) \\\n",
197208
" .setOutputCol(\"newDocs\")\n",
198209
"\n",
199210
" updatedText = UpdateTextPosition() \\\n",
@@ -248,7 +259,7 @@
248259
},
249260
{
250261
"cell_type": "code",
251-
"execution_count": 6,
262+
"execution_count": 19,
252263
"metadata": {},
253264
"outputs": [],
254265
"source": [
@@ -266,9 +277,19 @@
266277
},
267278
{
268279
"cell_type": "code",
269-
"execution_count": 7,
280+
"execution_count": 20,
270281
"metadata": {},
271-
"outputs": [],
282+
"outputs": [
283+
{
284+
"name": "stdout",
285+
"output_type": "stream",
286+
"text": [
287+
"spellcheck_norvig download started this may take some time.\n",
288+
"Approximate size to download 4.2 MB\n",
289+
"[OK!]\n"
290+
]
291+
}
292+
],
272293
"source": [
273294
"ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
274295
"updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
@@ -288,7 +309,7 @@
288309
},
289310
{
290311
"cell_type": "code",
291-
"execution_count": 9,
312+
"execution_count": 21,
292313
"metadata": {
293314
"pycharm": {
294315
"name": "#%%\n"
@@ -298,10 +319,10 @@
298319
{
299320
"data": {
300321
"text/plain": [
301-
"72914"
322+
"1671"
302323
]
303324
},
304-
"execution_count": 9,
325+
"execution_count": 21,
305326
"metadata": {},
306327
"output_type": "execute_result"
307328
}
@@ -344,4 +365,4 @@
344365
},
345366
"nbformat": 4,
346367
"nbformat_minor": 2
347-
}
368+
}

0 commit comments

Comments
 (0)