Skip to content

Commit 67d7b8b

Browse files
added svlm materials
1 parent f5deef5 commit 67d7b8b

File tree

12 files changed

+11473
-0
lines changed

12 files changed

+11473
-0
lines changed

tutorials/Certification_Trainings/Training_Notebooks & Slides/SmallVLMs_Notebooks/1.SmallVLMDocumentUnderstanding.ipynb

Lines changed: 9017 additions & 0 deletions
Large diffs are not rendered by default.

tutorials/Certification_Trainings/Training_Notebooks & Slides/SmallVLMs_Notebooks/2.SmallVLMDocumentUnderstanding.ipynb

Lines changed: 862 additions & 0 deletions
Large diffs are not rendered by default.

tutorials/Certification_Trainings/Training_Notebooks & Slides/SmallVLMs_Notebooks/3.SmallVLMPrompting.ipynb

Lines changed: 1212 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 382 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,382 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {
6+
"application/vnd.databricks.v1+cell": {
7+
"cellMetadata": {},
8+
"inputWidgets": {},
9+
"nuid": "78976ba6-d9be-4bb1-9e7d-7c36c788ba94",
10+
"showTitle": false,
11+
"tableResultSettingsMap": {},
12+
"title": ""
13+
}
14+
},
15+
"source": [
16+
"### Practical Prompting Examples #2\n",
17+
"\n",
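18+
"Sample taken from [here](https://github.com/JohnSnowLabs/pdf-deid-dataset/blob/main/PDF_Original/Hard/PDF_Deid_Deidentification_Hard_0.pdf). Note that the loading cell below reads `PDF_Deid_Deidentification_Medium_0.pdf` from DBFS, so upload the matching file or update `image_path` before running."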
19+
]
20+
},
21+
{
22+
"cell_type": "code",
23+
"execution_count": null,
24+
"metadata": {
25+
"application/vnd.databricks.v1+cell": {
26+
"cellMetadata": {
27+
"byteLimit": 2048000,
28+
"rowLimit": 10000
29+
},
30+
"inputWidgets": {},
31+
"nuid": "2488da2b-b8ee-46a7-a7ef-0ee18a3851b4",
32+
"showTitle": false,
33+
"tableResultSettingsMap": {},
34+
"title": ""
35+
}
36+
},
37+
"outputs": [
38+
{
39+
"data": {
40+
"text/html": [
41+
"<style scoped>\n",
42+
" .ansiout {\n",
43+
" display: block;\n",
44+
" unicode-bidi: embed;\n",
45+
" white-space: pre-wrap;\n",
46+
" word-wrap: break-word;\n",
47+
" word-break: break-all;\n",
48+
" font-family: \"Menlo\", \"Monaco\", \"Consolas\", \"Ubuntu Mono\", \"Source Code Pro\", monospace;\n",
49+
" font-size: 13px;\n",
50+
" color: #555;\n",
51+
" margin-left: 4px;\n",
52+
" line-height: 19px;\n",
53+
" }\n",
54+
"</style>"
55+
]
56+
},
57+
"metadata": {
58+
"application/vnd.databricks.v1+output": {
59+
"arguments": {},
60+
"data": "",
61+
"errorSummary": "",
62+
"errorTraceType": null,
63+
"metadata": {},
64+
"type": "ipynbError"
65+
}
66+
},
67+
"output_type": "display_data"
68+
}
69+
],
70+
"source": [
71+
"# Extraction prompt handed to VisualPrescriptionsRecognition via setPrompt().\n",
"# The schema lists every key present in the recorded outputs of this notebook\n",
"# (including \"Encounter\") and is written as valid JSON (no trailing comma).\n",
"prompt = \"\"\" You are an information extraction system.\n",
72+
"\n",
73+
"Your task: Extract patient information according to the following schema.\n",
74+
"\n",
75+
"{\n",
76+
" \"Patient Name\": \"string | null\",\n",
77+
" \"Date of Birth\": \"string | null\",\n",
78+
" \"Social Security Number\": \"string | null\",\n",
" \"Encounter\": \"string | null\",\n",
79+
" \"Encounter Participant\": \"string | null\"\n",
80+
"}\n",
81+
"\"\"\""
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": null,
87+
"metadata": {
88+
"application/vnd.databricks.v1+cell": {
89+
"cellMetadata": {
90+
"byteLimit": 2048000,
91+
"rowLimit": 10000
92+
},
93+
"inputWidgets": {},
94+
"nuid": "7c43e1ef-3126-4651-8a4f-9062614dcfeb",
95+
"showTitle": false,
96+
"tableResultSettingsMap": {},
97+
"title": ""
98+
}
99+
},
100+
"outputs": [],
101+
"source": [
102+
"from pyspark.ml import PipelineModel\n",
103+
"import pyspark.sql.functions as f\n",
104+
"from sparkocr.transformers import *\n",
105+
"from sparkocr.enums import *\n",
106+
"from sparkocr.utils import display_images\n",
107+
"from sparkocr.dataextraction.visual_prescriptions_recognition import VisualPrescriptionsRecognition\n",
108+
"\n",
109+
"# Stage 1: convert the binary PDF rows into page images; the input column is not kept.\n",
"pdf_to_img = PdfToImage() \\\n",
110+
"    .setKeepInput(False)\n",
111+
"\n",
112+
"# Stage 2: prompt-driven recognition that reads the `image` column and writes `text`.\n",
"# `prompt` is already a str, so it is passed directly instead of via an f-string.\n",
"# NOTE(review): 4096 presumably caps the number of generated tokens per page - confirm.\n",
"ocr = VisualPrescriptionsRecognition() \\\n",
113+
"    .setInputCol(\"image\") \\\n",
114+
"    .setOutputCol(\"text\") \\\n",
115+
"    .setKeepInput(False) \\\n",
116+
"    .setPrompt(prompt) \\\n",
117+
"    .setMaxNewTokens(4096)\n",
118+
"\n",
119+
"# Input PDF on DBFS, read with Spark's binaryFile source.\n",
"# `spark` is not created in this notebook - presumably the session provided by the Databricks runtime.\n",
"image_path = \"dbfs:/FileStore/pdfs/PDF_Deid_Deidentification_Medium_0.pdf\"\n",
120+
"pdf_df = spark.read.format(\"binaryFile\").load(image_path)\n"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": null,
126+
"metadata": {
127+
"application/vnd.databricks.v1+cell": {
128+
"cellMetadata": {
129+
"byteLimit": 2048000,
130+
"rowLimit": 10000
131+
},
132+
"inputWidgets": {},
133+
"nuid": "34520eb1-37db-4e25-898d-aacad8b59b37",
134+
"showTitle": false,
135+
"tableResultSettingsMap": {},
136+
"title": ""
137+
}
138+
},
139+
"outputs": [
140+
{
141+
"name": "stdout",
142+
"output_type": "stream",
143+
"text": [
144+
"+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+\n",
145+
"| path| modificationTime|length| image|total_pages|exception|pagenum|documentnum|\n",
146+
"+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+\n",
147+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...| 4| | 1| 0|\n",
148+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...| 4| | 0| 0|\n",
149+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...| 4| | 3| 0|\n",
150+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436|{dbfs:/FileStore/...| 4| | 2| 0|\n",
151+
"+--------------------+-------------------+------+--------------------+-----------+---------+-------+-----------+\n",
152+
"\n"
153+
]
154+
},
155+
{
156+
"data": {
157+
"text/html": [
158+
"<style scoped>\n",
159+
" .ansiout {\n",
160+
" display: block;\n",
161+
" unicode-bidi: embed;\n",
162+
" white-space: pre-wrap;\n",
163+
" word-wrap: break-word;\n",
164+
" word-break: break-all;\n",
165+
" font-family: \"Menlo\", \"Monaco\", \"Consolas\", \"Ubuntu Mono\", \"Source Code Pro\", monospace;\n",
166+
" font-size: 13px;\n",
167+
" color: #555;\n",
168+
" margin-left: 4px;\n",
169+
" line-height: 19px;\n",
170+
" }\n",
171+
"</style>"
172+
]
173+
},
174+
"metadata": {
175+
"application/vnd.databricks.v1+output": {
176+
"arguments": {},
177+
"data": "",
178+
"errorSummary": "",
179+
"errorTraceType": null,
180+
"metadata": {},
181+
"type": "ipynbError"
182+
}
183+
},
184+
"output_type": "display_data"
185+
}
186+
],
187+
"source": [
188+
"# Render the PDF into page images, keep at most 8 rows, and cache them for reuse.\n",
"image_df = (\n",
"    pdf_to_img.transform(pdf_df)\n",
"    .limit(8)\n",
"    .cache()\n",
")\n",
189+
"image_df.show()"
190+
]
191+
},
192+
{
193+
"cell_type": "code",
194+
"execution_count": null,
195+
"metadata": {
196+
"application/vnd.databricks.v1+cell": {
197+
"cellMetadata": {
198+
"byteLimit": 2048000,
199+
"rowLimit": 10000
200+
},
201+
"inputWidgets": {},
202+
"nuid": "386cb57f-67c9-40a0-a832-2ff390f8fc4a",
203+
"showTitle": false,
204+
"tableResultSettingsMap": {},
205+
"title": ""
206+
}
207+
},
208+
"outputs": [
209+
{
210+
"name": "stdout",
211+
"output_type": "stream",
212+
"text": [
213+
"+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+\n",
214+
"| path| modificationTime|length|total_pages|exception|pagenum|documentnum| text|\n",
215+
"+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+\n",
216+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436| 4| | 1| 0|{'Patient Name': ...|\n",
217+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436| 4| | 0| 0|{'Patient Name': ...|\n",
218+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436| 4| | 3| 0| |\n",
219+
"|dbfs:/FileStore/p...|2025-10-22 16:01:12|461436| 4| | 2| 0|{'Patient Name': ...|\n",
220+
"+--------------------+-------------------+------+-----------+---------+-------+-----------+--------------------+\n",
221+
"\n"
222+
]
223+
},
224+
{
225+
"data": {
226+
"text/html": [
227+
"<style scoped>\n",
228+
" .ansiout {\n",
229+
" display: block;\n",
230+
" unicode-bidi: embed;\n",
231+
" white-space: pre-wrap;\n",
232+
" word-wrap: break-word;\n",
233+
" word-break: break-all;\n",
234+
" font-family: \"Menlo\", \"Monaco\", \"Consolas\", \"Ubuntu Mono\", \"Source Code Pro\", monospace;\n",
235+
" font-size: 13px;\n",
236+
" color: #555;\n",
237+
" margin-left: 4px;\n",
238+
" line-height: 19px;\n",
239+
" }\n",
240+
"</style>"
241+
]
242+
},
243+
"metadata": {
244+
"application/vnd.databricks.v1+output": {
245+
"arguments": {},
246+
"data": "",
247+
"errorSummary": "",
248+
"errorTraceType": null,
249+
"metadata": {},
250+
"type": "ipynbError"
251+
}
252+
},
253+
"output_type": "display_data"
254+
}
255+
],
256+
"source": [
257+
"# Apply the recognition stage to the cached page images and cache its output.\n",
"result = (\n",
"    ocr\n",
"    .transform(image_df)\n",
"    .cache()\n",
")\n",
258+
"result.show()"
259+
]
260+
},
261+
{
262+
"cell_type": "code",
263+
"execution_count": null,
264+
"metadata": {
265+
"application/vnd.databricks.v1+cell": {
266+
"cellMetadata": {
267+
"byteLimit": 2048000,
268+
"rowLimit": 10000
269+
},
270+
"inputWidgets": {},
271+
"nuid": "b2328475-44a7-4d19-93bf-eed1dde5fe58",
272+
"showTitle": false,
273+
"tableResultSettingsMap": {},
274+
"title": ""
275+
}
276+
},
277+
"outputs": [
278+
{
279+
"name": "stdout",
280+
"output_type": "stream",
281+
"text": [
282+
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
283+
"|text |\n",
284+
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
285+
"|{'Patient Name': 'Susan Frances Martin', 'Date of Birth': '09/03/1951', 'Social Security Number': '103-15-0825', 'Encounter': 'HOSP20831933', 'Encounter Participant': 'Brittany Gallagher'} |\n",
286+
"|{'Patient Name': 'Susan Frances Martin', 'Date of Birth': '09/03/1951', 'Social Security Number': '103-15-0825', 'Encounter': 'Coordinator For Healthplans - Encounter Summary for Susan Frances Martin, Electronically signed by Dr. Brittany Gallagher, FNP-C', 'Encounter Participant': 'Dr. Brittany Gallagher'}|\n",
287+
"| |\n",
288+
"|{'Patient Name': 'Susan Frances Martin', 'Date of Birth': '09/03/1951', 'Social Security Number': None, 'Encounter': None, 'Encounter Participant': None} |\n",
289+
"+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
290+
"\n"
291+
]
292+
},
293+
{
294+
"data": {
295+
"text/html": [
296+
"<style scoped>\n",
297+
" .ansiout {\n",
298+
" display: block;\n",
299+
" unicode-bidi: embed;\n",
300+
" white-space: pre-wrap;\n",
301+
" word-wrap: break-word;\n",
302+
" word-break: break-all;\n",
303+
" font-family: \"Menlo\", \"Monaco\", \"Consolas\", \"Ubuntu Mono\", \"Source Code Pro\", monospace;\n",
304+
" font-size: 13px;\n",
305+
" color: #555;\n",
306+
" margin-left: 4px;\n",
307+
" line-height: 19px;\n",
308+
" }\n",
309+
"</style>"
310+
]
311+
},
312+
"metadata": {
313+
"application/vnd.databricks.v1+output": {
314+
"arguments": {},
315+
"data": "",
316+
"errorSummary": "",
317+
"errorTraceType": null,
318+
"metadata": {},
319+
"type": "ipynbError"
320+
}
321+
},
322+
"output_type": "display_data"
323+
}
324+
],
325+
"source": [
326+
"# Show the complete, untruncated `text` value produced for each page.\n",
"text_only = result.select(\"text\")\n",
"text_only.show(truncate=False)"
327+
]
328+
},
329+
{
330+
"cell_type": "code",
331+
"execution_count": null,
332+
"metadata": {
333+
"application/vnd.databricks.v1+cell": {
334+
"cellMetadata": {},
335+
"inputWidgets": {},
336+
"nuid": "402baa34-43c6-4b87-895e-3cfef3dae1fd",
337+
"showTitle": false,
338+
"tableResultSettingsMap": {},
339+
"title": ""
340+
}
341+
},
342+
"outputs": [],
343+
"source": []
344+
}
345+
],
346+
"metadata": {
347+
"application/vnd.databricks.v1+notebook": {
348+
"computePreferences": null,
349+
"dashboards": [],
350+
"environmentMetadata": {
351+
"base_environment": "",
352+
"environment_version": "4"
353+
},
354+
"inputWidgetPreferences": null,
355+
"language": "python",
356+
"notebookMetadata": {
357+
"pythonIndentUnit": 4
358+
},
359+
"notebookName": "Alberto's clone of Alex's",
360+
"widgets": {}
361+
},
362+
"kernelspec": {
363+
"display_name": "Python 3 (ipykernel)",
364+
"language": "python",
365+
"name": "python3"
366+
},
367+
"language_info": {
368+
"codemirror_mode": {
369+
"name": "ipython",
370+
"version": 3
371+
},
372+
"file_extension": ".py",
373+
"mimetype": "text/x-python",
374+
"name": "python",
375+
"nbconvert_exporter": "python",
376+
"pygments_lexer": "ipython3",
377+
"version": "3.12.3"
378+
}
379+
},
380+
"nbformat": 4,
381+
"nbformat_minor": 4
382+
}
279 KB
Loading
598 KB
Loading
1020 KB
Loading
1.37 MB
Loading
455 KB
Loading

0 commit comments

Comments
 (0)