Skip to content

Commit cfe25a0

Browse files
Merge branch 'master1' into 130-release-candidate
2 parents 289c296 + 8c58c40 commit cfe25a0

10 files changed

+39
-39
lines changed

jupyter/SparkOCRGreyBackground.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -205,14 +205,14 @@
205205
"remove_objects.setMaxSizeObject(1000)\n",
206206
"remove_objects.setMinSizeObject(None)\n",
207207
"\n",
208-
"# Run OCR for each region\n",
209-
"ocr_corrected = ImageToText()\n",
208+
"# Run tesseract OCR for each region\n",
209+
"ocr_corrected = TesseractOcr()\n",
210210
"ocr_corrected.setInputCol(\"corrected_image\")\n",
211211
"ocr_corrected.setOutputCol(\"text_corrected\")\n",
212212
"ocr_corrected.setPositionsCol(\"positions_corrected\")\n",
213213
"ocr_corrected.setConfidenceThreshold(75)\n",
214214
"\n",
215-
"ocr = ImageToText()\n",
215+
"ocr = TesseractOcr()\n",
216216
"ocr.setInputCol(\"image\")\n",
217217
"ocr.setOutputCol(\"text\")\n",
218218
"\n",
@@ -535,4 +535,4 @@
535535
},
536536
"nbformat": 4,
537537
"nbformat_minor": 2
538-
}
538+
}

jupyter/SparkOCRS3AccesExample.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,8 @@
216216
"binary_to_image = BinaryToImage()\n",
217217
"binary_to_image.setOutputCol(\"image\")\n",
218218
"\n",
219-
"# Run OCR for each region\n",
220-
"ocr = ImageToText()\n",
219+
"# Run tesseract OCR for each region\n",
220+
"ocr = TesseractOcr()\n",
221221
"ocr.setInputCol(\"image\")\n",
222222
"ocr.setOutputCol(\"text\")\n",
223223
"ocr.setIgnoreResolution(False)\n",

jupyter/SparkOCRremoveBackgroundNoise.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,8 @@
207207
"remove_objects.setOutputCol(\"corrected_image\")\n",
208208
"remove_objects.setMinSizeFont(30)\n",
209209
"\n",
210-
"# Run OCR for each region\n",
211-
"ocr = ImageToText()\n",
210+
"# Run tesseract OCR for each region\n",
211+
"ocr = TesseractOcr()\n",
212212
"ocr.setInputCol(\"corrected_image\")\n",
213213
"ocr.setOutputCol(\"text\")\n",
214214
"# Path to the tessdata related to the OS and version\n",
@@ -647,4 +647,4 @@
647647
},
648648
"nbformat": 4,
649649
"nbformat_minor": 2
650-
}
650+
}

jupyter/SparkOCRremoveRackgroundNoiseAndDrawRegions.ipynb

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -202,15 +202,15 @@
202202
"draw_regions.setInputRegionsCol(\"region\")\n",
203203
"draw_regions.setOutputCol(\"image_with_regions\")\n",
204204
"\n",
205-
"# Run OCR for corrected image\n",
206-
"ocr_corrected = ImageToText()\n",
205+
"# Run tesseract OCR for corrected image\n",
206+
"ocr_corrected = TesseractOcr()\n",
207207
"ocr_corrected.setInputCol(\"corrected_image\")\n",
208208
"ocr_corrected.setOutputCol(\"corrected_text\")\n",
209209
"ocr_corrected.setPositionsCol(\"corrected_positions\")\n",
210210
"ocr_corrected.setConfidenceThreshold(65)\n",
211211
"\n",
212212
"# Run tesseract OCR for original image\n",
213-
"ocr = ImageToText()\n",
213+
"ocr = TesseractOcr()\n",
214214
"ocr.setInputCol(\"image\")\n",
215215
"ocr.setOutputCol(\"text\")\n",
216216
"\n",
@@ -293,8 +293,8 @@
293293
"name": "stdout",
294294
"output_type": "stream",
295295
"text": [
296-
"\u001B[31mFilename:\n",
297-
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001B[0m\n",
296+
"\u001b[31mFilename:\n",
297+
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001b[0m\n",
298298
"Recognized text:\n",
299299
" \n",
300300
"\n",
@@ -376,8 +376,8 @@
376376
"name": "stdout",
377377
"output_type": "stream",
378378
"text": [
379-
"\u001B[31mFilename:\n",
380-
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001B[0m\n",
379+
"\u001b[31mFilename:\n",
380+
"file:/Users/nmelnik/IdeaProjects/spark-ocr/workshop/jupyter/data/pdfs/noised.pdf , page: 0\u001b[0m\n",
381381
"Recognized text:\n",
382382
"° Date 7/16/68\n",
383383
"Sanple No 5031\n",
@@ -583,4 +583,4 @@
583583
},
584584
"nbformat": 4,
585585
"nbformat_minor": 2
586-
}
586+
}

jupyter/SparkOcrHttpSource.ipynb

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -183,8 +183,8 @@
183183
" pdf_to_image.setInputCol(\"content\")\n",
184184
" pdf_to_image.setOutputCol(\"image\")\n",
185185
"\n",
186-
" # Run OCR\n",
187-
" ocr = ImageToText()\n",
186+
" # Run tesseract OCR\n",
187+
" ocr = TesseractOcr()\n",
188188
" ocr.setInputCol(\"image\")\n",
189189
" ocr.setOutputCol(\"text\")\n",
190190
" ocr.setConfidenceThreshold(65)\n",
@@ -438,7 +438,7 @@
438438
" binary_to_image.setOutputCol(\"image\")\n",
439439
"\n",
440440
" # Run tesseract OCR\n",
441-
" ocr = ImageToText()\n",
441+
" ocr = TesseractOcr()\n",
442442
" ocr.setInputCol(\"image\")\n",
443443
" ocr.setOutputCol(\"text\")\n",
444444
" ocr.setConfidenceThreshold(65)\n",
@@ -606,4 +606,4 @@
606606
},
607607
"nbformat": 4,
608608
"nbformat_minor": 2
609-
}
609+
}

jupyter/SparkOcrSimpleExample.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -178,8 +178,8 @@
178178
" pdf_to_image.setInputCol(\"content\")\n",
179179
" pdf_to_image.setOutputCol(\"image\")\n",
180180
"\n",
181-
" # Run OCR\n",
182-
" ocr = ImageToText()\n",
181+
" # Run tesseract OCR\n",
182+
" ocr = TesseractOcr()\n",
183183
" ocr.setInputCol(\"image\")\n",
184184
" ocr.setOutputCol(\"text\")\n",
185185
" ocr.setConfidenceThreshold(65)\n",
@@ -543,4 +543,4 @@
543543
},
544544
"nbformat": 4,
545545
"nbformat_minor": 2
546-
}
546+
}

jupyter/SparkOcrStoreResultsToPdfWithTextLayout.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,7 @@
181181
" .setKeepInput(True)\n",
182182
" \n",
183183
" # Run OCR\n",
184-
" ocr = ImageToText() \\\n",
184+
" ocr = TesseractOcr() \\\n",
185185
" .setInputCol(\"image\") \\\n",
186186
" .setOutputCol(\"text\") \\\n",
187187
" .setConfidenceThreshold(60) \\\n",
@@ -372,4 +372,4 @@
372372
},
373373
"nbformat": 4,
374374
"nbformat_minor": 2
375-
}
375+
}

jupyter/SparkOcrStoreResultsToPdfWithTextLayoutWithFallback.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,8 @@
9696
"Requirement already satisfied: kiwisolver>=1.0.1 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.2.0) (1.1.0)\n",
9797
"Requirement already satisfied: six>=1.5 in /Users/nmelnik/Library/Python/3.7/lib/python/site-packages (from python-dateutil>=2.1->matplotlib!=3.0.0,>=2.0.0->scikit-image==0.16.2->spark-ocr==1.2.0) (1.14.0)\n",
9898
"Building wheels for collected packages: spark-ocr\n",
99-
" Building wheel for spark-ocr (setup.py) ... \u001B[?25ldone\n",
100-
"\u001B[?25h Created wheel for spark-ocr: filename=spark_ocr-1.2.0-py3-none-any.whl size=5012116 sha256=b79c63e97b4235bbb3c7e061d6a42840bb1886c0351fa6e52262964bfe8333f3\n",
99+
" Building wheel for spark-ocr (setup.py) ... \u001b[?25ldone\n",
100+
"\u001b[?25h Created wheel for spark-ocr: filename=spark_ocr-1.2.0-py3-none-any.whl size=5012116 sha256=b79c63e97b4235bbb3c7e061d6a42840bb1886c0351fa6e52262964bfe8333f3\n",
101101
" Stored in directory: /Users/nmelnik/Library/Caches/pip/wheels/8f/18/a8/6a746cb146272537dd3c50b17baa2711dab0a33acc5ed77549\n",
102102
"Successfully built spark-ocr\n",
103103
"Installing collected packages: spark-ocr\n",
@@ -112,7 +112,7 @@
112112
],
113113
"source": [
114114
"# or install from local path\n",
115-
"#%pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz"
115+
"%pip install --user ../../python/dist/spark-ocr-1.2.0.tar.gz"
116116
]
117117
},
118118
{
@@ -227,7 +227,7 @@
227227
" .setKeepInput(True)\n",
228228
" \n",
229229
" # Run OCR\n",
230-
" ocr = ImageToText() \\\n",
230+
" ocr = TesseractOcr() \\\n",
231231
" .setInputCol(\"image\") \\\n",
232232
" .setOutputCol(\"text\") \\\n",
233233
" .setConfidenceThreshold(60) \\\n",
@@ -503,4 +503,4 @@
503503
},
504504
"nbformat": 4,
505505
"nbformat_minor": 2
506-
}
506+
}

jupyter/SparkOcrStreamingPDF.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -184,8 +184,8 @@
184184
"pdf_to_image = PdfToImage()\n",
185185
"pdf_to_image.setOutputCol(\"image\")\n",
186186
"\n",
187-
"# Run OCR for each region\n",
188-
"ocr = ImageToText()\n",
187+
"# Run tesseract OCR for each region\n",
188+
"ocr = TesseractOcr()\n",
189189
"ocr.setInputCol(\"image\")\n",
190190
"ocr.setOutputCol(\"text\")\n",
191191
"ocr.setConfidenceThreshold(60)\n",
@@ -264,7 +264,7 @@
264264
}
265265
],
266266
"source": [
267-
"# get progress of streaming job\n",
267+
"# get progress of streaming job\n",
268268
"query.lastProgress"
269269
]
270270
},
@@ -274,7 +274,7 @@
274274
"metadata": {},
275275
"outputs": [],
276276
"source": [
277-
"# need to run for stop streaming job\n",
277+
"# need to run for stop streaming job\n",
278278
"query.stop()"
279279
]
280280
},
@@ -489,4 +489,4 @@
489489
},
490490
"nbformat": 4,
491491
"nbformat_minor": 2
492-
}
492+
}

jupyter/SparkOcrWithSkewCorrection.ipynb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@
7373
"outputs": [],
7474
"source": [
7575
"# or install from local path\n",
76-
"# %pip install --user ../../python/dist/spark-ocr-1.3.0rc1.tar.gz"
76+
"# %pip install --user ../../python/dist/spark-ocr-1.1.0rc1.tar.gz"
7777
]
7878
},
7979
{
@@ -178,8 +178,8 @@
178178
" skew_corrector.setOutputCol(\"corrected_image\")\n",
179179
" skew_corrector.setAutomaticSkewCorrection(skew_correction)\n",
180180
"\n",
181-
" # Run OCR\n",
182-
" ocr = ImageToText()\n",
181+
" # Run tesseract OCR\n",
182+
" ocr = TesseractOcr()\n",
183183
" ocr.setInputCol(\"corrected_image\")\n",
184184
" ocr.setOutputCol(\"text\")\n",
185185
" \n",

0 commit comments

Comments
 (0)