Skip to content

Commit 28ca8de

Browse files
Updated notebooks
1 parent c2ed3d3 commit 28ca8de

File tree

4 files changed

+59
-69
lines changed

4 files changed

+59
-69
lines changed

jupyter/SparkOCRWriteImageToS3.ipynb

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -217,8 +217,8 @@
217217
"binary_to_image.setInputCol(\"content\")\n",
218218
"binary_to_image.setOutputCol(\"image\")\n",
219219
"\n",
220-
"# Run tesseract OCR for each region\n",
221-
"ocr = TesseractOcr()\n",
220+
"# Run OCR for each region\n",
221+
"ocr = ImageToText()\n",
222222
"ocr.setInputCol(\"image\")\n",
223223
"ocr.setOutputCol(\"text\")\n",
224224
"ocr.setIgnoreResolution(False)\n",
@@ -880,8 +880,13 @@
880880
}
881881
],
882882
"source": [
883-
"results.write.format(\"binaryFormat\").option(\"type\", \"text\").option(\"field\", \"text\")\\\n",
884-
" .option(\"extension\", \"txt\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
883+
"results.write \\\n",
884+
" .format(\"binaryFormat\") \\\n",
885+
" .option(\"type\", \"text\") \\\n",
886+
" .option(\"field\", \"text\") \\\n",
887+
" .option(\"extension\", \"txt\") \\\n",
888+
" .mode(\"overwrite\") \\\n",
889+
" .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/texts/\")"
885890
]
886891
}
887892
],
@@ -906,13 +911,13 @@
906911
"pycharm": {
907912
"stem_cell": {
908913
"cell_type": "raw",
909-
"source": [],
910914
"metadata": {
911915
"collapsed": false
912-
}
916+
},
917+
"source": []
913918
}
914919
}
915920
},
916921
"nbformat": 4,
917922
"nbformat_minor": 4
918-
}
923+
}

jupyter/SparkOCRWritePdfToS3.ipynb

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -173,12 +173,10 @@
173173
},
174174
{
175175
"cell_type": "markdown",
176+
"metadata": {},
176177
"source": [
177178
"## Read pdf objects"
178-
],
179-
"metadata": {
180-
"collapsed": false
181-
}
179+
]
182180
},
183181
{
184182
"cell_type": "code",
@@ -225,8 +223,8 @@
225223
" .setInputCol(\"image_raw\") \\\n",
226224
" .setOutputCol(\"image\") \\\n",
227225
" .setThreshold(130)\n",
228-
"# Run tesseract OCR for each region\n",
229-
"ocr = TesseractOcr() \\\n",
226+
"# Run OCR for each region\n",
227+
"ocr = ImageToText() \\\n",
230228
" .setInputCol(\"image\") \\\n",
231229
" .setOutputCol(\"text\") \\\n",
232230
" .setIgnoreResolution(False) \\\n",
@@ -899,8 +897,13 @@
899897
}
900898
],
901899
"source": [
902-
"results.write.format(\"binaryFormat\").option(\"type\", \"pdf\").option(\"field\", \"pdf\")\\\n",
903-
" .option(\"extension\", \"pdf\").mode(\"overwrite\").save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
900+
"results.write \\\n",
901+
" .format(\"binaryFormat\") \\\n",
902+
" .option(\"type\", \"pdf\") \\\n",
903+
" .option(\"field\", \"pdf\") \\\n",
904+
" .option(\"extension\", \"pdf\") \\\n",
905+
" .mode(\"overwrite\") \\\n",
906+
" .save(\"s3a://dev.johnsnowlabs.com/ocr/datasets/output/pdfs/\")"
904907
]
905908
}
906909
],
@@ -925,13 +928,13 @@
925928
"pycharm": {
926929
"stem_cell": {
927930
"cell_type": "raw",
928-
"source": [],
929931
"metadata": {
930932
"collapsed": false
931-
}
933+
},
934+
"source": []
932935
}
933936
}
934937
},
935938
"nbformat": 4,
936939
"nbformat_minor": 4
937-
}
940+
}

jupyter/SparkOcrSavedLoadedPipeline.ipynb

Lines changed: 31 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,25 @@
44
"cell_type": "markdown",
55
"metadata": {},
66
"source": [
7-
"# Save Images Objects to S3 using Spark OCR\n",
7+
"# Save/Load Spark OCR pipeline\n",
88
"## Initialize spark session"
99
]
1010
},
1111
{
1212
"cell_type": "code",
13+
"execution_count": null,
1314
"metadata": {
1415
"pycharm": {
1516
"name": "#%%\n"
1617
}
1718
},
19+
"outputs": [],
1820
"source": [
1921
"secret = \"\"\n",
2022
"license = \"\"\n",
2123
"version = secret.split(\"-\")[0]\n",
2224
"spark_ocr_jar_path = \"../../target/scala-2.11\""
23-
],
24-
"execution_count": null,
25-
"outputs": []
25+
]
2626
},
2727
{
2828
"cell_type": "code",
@@ -80,23 +80,14 @@
8080
"metadata": {},
8181
"outputs": [],
8282
"source": [
83-
"\n",
8483
"from pyspark import SparkConf\n",
8584
"from sparkocr import start\n",
8685
"\n",
8786
"if license:\n",
8887
" os.environ['JSL_OCR_LICENSE'] = license\n",
8988
" \n",
90-
"# you can set AWS API Keys to env variables \n",
91-
"# os.environ['AWS_ACCESS_KEY_ID'] = \"your key\"\n",
92-
"# os.environ['AWS_SECRET_ACCESS_KEY'] = \"your secret\"\n",
93-
"\n",
94-
"# set additinal dependensies for read data from S3\n",
9589
"conf = SparkConf() \\\n",
9690
" .set(\"spark.jars.packages\", \"org.apache.hadoop:hadoop-aws:2.7.3\")\n",
97-
"# or you can set AWS API Keys here\n",
98-
"# .set('spark.hadoop.fs.s3a.access.key', \"your key\" ) \\\n",
99-
"# .set('spark.hadoop.fs.s3a.secret.key', \"your secret\")\n",
10091
"\n",
10192
"spark = start(secret=secret, jar_path=spark_ocr_jar_path, extra_conf=conf)\n",
10293
"spark"
@@ -205,8 +196,8 @@
205196
"binary_to_image.setInputCol(\"content\")\n",
206197
"binary_to_image.setOutputCol(\"image\")\n",
207198
"\n",
208-
"# Run tesseract OCR for each region\n",
209-
"ocr = TesseractOcr()\n",
199+
"# Run OCR for each region\n",
200+
"ocr = ImageToText()\n",
210201
"ocr.setInputCol(\"image\")\n",
211202
"ocr.setOutputCol(\"text\")\n",
212203
"ocr.setIgnoreResolution(False)\n",
@@ -271,72 +262,63 @@
271262
},
272263
{
273264
"cell_type": "markdown",
265+
"metadata": {},
274266
"source": [
275267
"## save the fitted pipeline to disk"
276-
],
277-
"metadata": {
278-
"collapsed": false
279-
}
268+
]
280269
},
281270
{
282271
"cell_type": "code",
283272
"execution_count": null,
284-
"outputs": [],
285-
"source": [
286-
"model.write().overwrite().save(\"ocr_model\")"
287-
],
288273
"metadata": {
289-
"collapsed": false,
290274
"pycharm": {
291275
"name": "#%%\n"
292276
}
293-
}
277+
},
278+
"outputs": [],
279+
"source": [
280+
"model.write().overwrite().save(\"ocr_model\")"
281+
]
294282
},
295283
{
296284
"cell_type": "markdown",
285+
"metadata": {},
297286
"source": [
298287
"## save the unfit pipeline to disk"
299-
],
300-
"metadata": {
301-
"collapsed": false
302-
}
288+
]
303289
},
304290
{
305291
"cell_type": "code",
306292
"execution_count": null,
307-
"outputs": [],
308-
"source": [
309-
"pipeline.write().overwrite().save(\"unfit_ocr_model\")"
310-
],
311293
"metadata": {
312-
"collapsed": false,
313294
"pycharm": {
314295
"name": "#%%\n"
315296
}
316-
}
297+
},
298+
"outputs": [],
299+
"source": [
300+
"pipeline.write().overwrite().save(\"unfit_ocr_model\")"
301+
]
317302
},
318303
{
319304
"cell_type": "markdown",
305+
"metadata": {},
320306
"source": [
321307
"## load back the model pipeline"
322-
],
323-
"metadata": {
324-
"collapsed": false
325-
}
308+
]
326309
},
327310
{
328311
"cell_type": "code",
329312
"execution_count": null,
330-
"outputs": [],
331-
"source": [
332-
"sameModel = PipelineModel.load(\"ocr_model\")\n"
333-
],
334313
"metadata": {
335-
"collapsed": false,
336314
"pycharm": {
337315
"name": "#%%\n"
338316
}
339-
}
317+
},
318+
"outputs": [],
319+
"source": [
320+
"sameModel = PipelineModel.load(\"ocr_model\")\n"
321+
]
340322
}
341323
],
342324
"metadata": {
@@ -355,18 +337,18 @@
355337
"name": "python",
356338
"nbconvert_exporter": "python",
357339
"pygments_lexer": "ipython3",
358-
"version": "3.7.6"
340+
"version": "3.7.7"
359341
},
360342
"pycharm": {
361343
"stem_cell": {
362344
"cell_type": "raw",
363-
"source": [],
364345
"metadata": {
365346
"collapsed": false
366-
}
347+
},
348+
"source": []
367349
}
368350
}
369351
},
370352
"nbformat": 4,
371353
"nbformat_minor": 2
372-
}
354+
}

jupyter/SparkOcrUpdateTextPosition.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@
226226
" .setOutputCol(\"image\") \\\n",
227227
" .setThreshold(130)\n",
228228
"\n",
229-
" ocr = TesseractOcr() \\\n",
229+
" ocr = ImageToText() \\\n",
230230
" .setInputCol(\"image\") \\\n",
231231
" .setOutputCol(\"text\") \\\n",
232232
" .setIgnoreResolution(False) \\\n",
@@ -336,4 +336,4 @@
336336
},
337337
"nbformat": 4,
338338
"nbformat_minor": 2
339-
}
339+
}

0 commit comments

Comments
 (0)