@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -65,9 +65,20 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
+      "\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
+      "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
+      "Note: you may need to restart the kernel to use updated packages.\n"
+     ]
+    }
+   ],
    "source": [
     "# install from PYPI using secret\n",
     "%pip install spark-nlp==2.5.5\n",
@@ -93,14 +104,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "SparkConf Configured, Starting to listen on port: 59744\n",
+      "SparkConf Configured, Starting to listen on port: 53378\n",
       "JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
      ]
     },
@@ -114,11 +125,11 @@
       "<div>\n",
       "<p><b>SparkContext</b></p>\n",
       "\n",
-      "<p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
+      "<p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
       "\n",
       "<dl>\n",
       "<dt>Version</dt>\n",
-      "<dd><code>v2.4.4</code></dd>\n",
+      "<dd><code>v2.3.2</code></dd>\n",
       "<dt>Master</dt>\n",
       "<dd><code>local[*]</code></dd>\n",
       "<dt>AppName</dt>\n",
@@ -130,10 +141,10 @@
       " "
      ],
      "text/plain": [
-      "<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
+      "<pyspark.sql.session.SparkSession at 0x1195bb510>"
      ]
     },
-    "execution_count": 2,
+    "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@@ -150,7 +161,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -170,7 +181,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -193,7 +204,7 @@
     "    .setOutputCol(\"spell\")\n",
     "\n",
     "tokenAssem = TokenAssembler() \\\n",
-    "    .setInputCols(\"spell\") \\\n",
+    "    .setInputCols([\"spell\", \"document\"]) \\\n",
     "    .setOutputCol(\"newDocs\")\n",
     "\n",
     "updatedText = UpdateTextPosition() \\\n",
@@ -248,7 +259,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -266,9 +277,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 20,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "spellcheck_norvig download started this may take some time.\n",
+      "Approximate size to download 4.2 MB\n",
+      "[OK!]\n"
+     ]
+    }
+   ],
    "source": [
     "ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
     "updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
@@ -288,7 +309,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 21,
    "metadata": {
     "pycharm": {
      "name": "#%%\n"
@@ -298,10 +319,10 @@
    {
     "data": {
      "text/plain": [
-      "72914"
+      "1671"
     ]
    },
-    "execution_count": 9,
+    "execution_count": 21,
    "metadata": {},
    "output_type": "execute_result"
   }
@@ -344,4 +365,4 @@
  },
  "nbformat": 4,
  "nbformat_minor": 2
-}
+}
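
The substantive source change in this diff is the TokenAssembler fix in the hunk at lines 204-210: .setInputCols(["spell", "document"]) instead of a single "spell" column; the remaining changes are re-executed cell counts and refreshed outputs. Below is a minimal sketch, not taken from the diff, of how that corrected wiring fits into a Spark NLP 2.5.5 pipeline. The DocumentAssembler and Tokenizer stages and the "text" input column are illustrative assumptions; the notebook's OCR stages and its UpdateTextPosition step are omitted.

# Minimal sketch (assumptions noted above), Spark NLP 2.5.5
from pyspark.ml import Pipeline
from sparknlp.base import DocumentAssembler, TokenAssembler
from sparknlp.annotator import Tokenizer, NorvigSweetingModel

# Assumed input: a DataFrame with a "text" column.
document = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document")

tokenizer = Tokenizer() \
    .setInputCols(["document"]) \
    .setOutputCol("token")

# Pretrained spell checker; the diff's new output cell shows the
# "spellcheck_norvig" model download.
spell = NorvigSweetingModel.pretrained("spellcheck_norvig") \
    .setInputCols(["token"]) \
    .setOutputCol("spell")

# The fix: TokenAssembler rebuilds a document from the corrected tokens,
# so it is given the original document column alongside "spell",
# not "spell" alone.
tokenAssem = TokenAssembler() \
    .setInputCols(["spell", "document"]) \
    .setOutputCol("newDocs")

pipeline = Pipeline(stages=[document, tokenizer, spell, tokenAssem])

Fitting and transforming a DataFrame with a text column then yields a "newDocs" column holding the spell-corrected documents, which the notebook goes on to feed into its UpdateTextPosition stage.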