@@ -16,12 +16,20 @@ Options:
16
16
--hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
17
17
--fast fast strategy: No OCR, just extract embedded text
18
18
--ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
19
+ --vlm vlm strategy: Use Vision Language Model for processing
20
+ --vlm-provider Specify the VLM model provider
21
+ (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
22
+ --vlm-model Specify the VLM model to use with the vlm strategy
23
+ (see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
19
24
--tables Enable table extraction: tables are represented as html in metadata
20
25
--images Include base64images in json
21
26
--coordinates Include coordinates in the output
22
27
--trace Enable trace logging for debugging, useful to cut and paste the executed curl call
23
28
--verbose Enable verbose logging including printing first 8 elements to stdout
24
29
--s3 Write the resulting output to s3 (like a pastebin)
30
+ --write-html Convert JSON output to HTML. Set the env var UNST_WRITE_HTML=true to skip providing this option.
31
+ --open-html Automatically open HTML output in browser (macOS only); only takes effect together with --write-html.
32
+ Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
25
33
--help Display this help and exit.
26
34
27
35
@@ -64,6 +72,7 @@ copy_to_clipboard() {
64
72
# Strategy selection flags; each is switched to "true" by its command-line
# option (--hi-res, --fast, --ocr-only, --vlm).
HI_RES=false
FAST=false
OCR_ONLY=false
VLM=false
# Suffix inserted into the output filename (e.g. "-ocr-only"). It must start
# out empty so that a run without a strategy yields "<filename>.json".
STRATEGY=""
# Logging switches.
VERBOSE=false
TRACE=false
@@ -72,6 +81,10 @@ FREEMIUM=false
72
81
TABLES=true
IMAGES=false
# Empty means "do not upload"; --s3 sets it to a non-empty value, which is
# what the later `-n "$S3"` check looks for.
S3=""
# HTML conversion can be pre-enabled through the environment. The default
# must be exactly "false" (no padding) because the value is later compared
# against the literal string "true".
WRITE_HTML=${UNST_WRITE_HTML:-false}
OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
# Empty means "not specified"; the validation step rejects non-empty values
# when --vlm is not selected.
VLM_PROVIDER=""
VLM_MODEL=""
75
88
76
89
while [[ " $# " -gt 0 ]]; do
77
90
case " $1 " in
@@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do
87
100
OCR_ONLY=true
88
101
shift
89
102
;;
103
+ --vlm)
104
+ VLM=true
105
+ shift
106
+ ;;
107
+ --vlm-provider)
108
+ if [ -n " $2 " ] && [ " ${2: 0: 1} " != " -" ]; then
109
+ VLM_PROVIDER=$2
110
+ shift 2
111
+ else
112
+ echo " Error: Argument for $1 is missing" >&2
113
+ exit 1
114
+ fi
115
+ ;;
116
+ --vlm-model)
117
+ if [ -n " $2 " ] && [ " ${2: 0: 1} " != " -" ]; then
118
+ VLM_MODEL=$2
119
+ shift 2
120
+ else
121
+ echo " Error: Argument for $1 is missing" >&2
122
+ exit 1
123
+ fi
124
+ ;;
90
125
--trace)
91
126
TRACE=true
92
127
shift
@@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do
99
134
S3=true
100
135
shift
101
136
;;
137
+ --write-html)
138
+ WRITE_HTML=true
139
+ shift
140
+ ;;
141
+ --open-html)
142
+ OPEN_HTML=true
143
+ shift
144
+ ;;
102
145
--tables)
103
146
TABLES=true
104
147
shift
@@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then
140
183
exit 1
141
184
fi
142
185
186
#######################################
# Validate the strategy-related options after all arguments are parsed.
# Globals:
#   HI_RES, FAST, OCR_ONLY, VLM  (read)  "true" when the flag was given
#   VLM_PROVIDER, VLM_MODEL      (read)  empty when not specified
#   STRATEGY_COUNT               (written) number of strategies selected
# Outputs:
#   An error message on stderr when the combination is invalid.
# Returns:
#   0 when the options are consistent; exits the script with status 1
#   otherwise.
#######################################
check_strategy_options() {
  local flag
  STRATEGY_COUNT=0
  # Compare against the literal "true" instead of executing the flag value
  # as a command, so an unset or unexpected value simply counts as "off".
  for flag in "${HI_RES:-false}" "${FAST:-false}" "${OCR_ONLY:-false}" "${VLM:-false}"; do
    if [[ "$flag" == true ]]; then
      STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
    fi
  done

  if ((STRATEGY_COUNT > 1)); then
    echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time." >&2
    exit 1
  fi

  # --vlm-provider / --vlm-model are only meaningful for the vlm strategy.
  if [[ -n "${VLM_PROVIDER:-}" || -n "${VLM_MODEL:-}" ]] && [[ "${VLM:-false}" != true ]]; then
    echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy." >&2
    exit 1
  fi
}

# Check for strategy conflicts after all arguments are processed
check_strategy_options

# --trace: echo every command from here on.
if [[ "${TRACE:-false}" == true ]]; then
  set -x
fi
@@ -175,6 +236,25 @@ elif $OCR_ONLY; then
175
236
STRATEGY=" -ocr-only"
176
237
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR} /${FILENAME}${STRATEGY} .json
177
238
CURL_STRATEGY=(-F " strategy=ocr_only" )
239
+ elif $VLM ; then
240
+ if $VERBOSE ; then echo " Sending API request with vlm strategy" ; fi
241
+ STRATEGY=" -vlm"
242
+ # Add provider and model to filename if specified
243
+ if [ -n " $VLM_PROVIDER " ] && [ -n " $VLM_MODEL " ]; then
244
+ STRATEGY=" -vlm-${VLM_PROVIDER} -${VLM_MODEL} "
245
+ elif [ -n " $VLM_PROVIDER " ]; then
246
+ STRATEGY=" -vlm-${VLM_PROVIDER} "
247
+ elif [ -n " $VLM_MODEL " ]; then
248
+ STRATEGY=" -vlm-model-${VLM_MODEL} "
249
+ fi
250
+ JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR} /${FILENAME}${STRATEGY} .json
251
+ CURL_STRATEGY=(-F " strategy=vlm" )
252
+ if [ -n " $VLM_PROVIDER " ]; then
253
+ CURL_STRATEGY+=(-F " vlm_model_provider=$VLM_PROVIDER " )
254
+ fi
255
+ if [ -n " $VLM_MODEL " ]; then
256
+ CURL_STRATEGY+=(-F " vlm_model=$VLM_MODEL " )
257
+ fi
178
258
else
179
259
if $VERBOSE ; then echo " Sending API request WITHOUT a strategy" ; fi
180
260
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR} /${FILENAME}${STRATEGY} .json
@@ -213,6 +293,44 @@ else
213
293
fi
214
294
echo " JSON Output file: ${JSON_OUTPUT_FILEPATH} "
215
295
296
#######################################
# Escape the characters that are special in HTML text and attribute values.
# Arguments:
#   $1 - raw string
# Outputs:
#   The escaped string on stdout (no trailing newline).
#######################################
html_escape() {
  local text=$1
  # '&' is handled first so the entities added below are not escaped again.
  # The backslash keeps '&' literal in the replacement on bash >= 5.2
  # (patsub_replacement) and is simply removed by older versions.
  text=${text//&/\&amp;}
  text=${text//</\&lt;}
  text=${text//>/\&gt;}
  text=${text//\"/\&quot;}
  printf '%s' "$text"
}

#######################################
# Build a standalone HTML page by concatenating the metadata.text_as_html
# field of every element in an elements JSON file.
# Arguments:
#   $1 - path of the elements JSON file
#   $2 - path of the HTML file to write
#   $3 - page title (escaped before use)
# Returns:
#   0 on success; jq's non-zero status if extraction fails, in which case
#   the HTML file is not touched.
#######################################
write_html_from_text_as_html() {
  local json_path=$1 html_path=$2
  local page_title body
  page_title=$(html_escape "$3")
  # Run jq before opening the output file so a failure cannot leave a
  # truncated page behind.
  body=$(jq -r 'map(.metadata.text_as_html) | join("\n")' "$json_path") || return
  cat >"$html_path" <<EOF
<!DOCTYPE html>
<html>
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>${page_title}</title>
  <style>
    body { font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }
  </style>
</head>
<body>
${body}
</body>
</html>
EOF
}

# Convert JSON to HTML if requested
if [[ "${WRITE_HTML:-false}" == true ]]; then
  HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html
  html_written=false

  if [[ "${VLM:-false}" == true ]]; then
    # VLM output has all metadata.text_as_html fields defined, so
    # create HTML directly from the metadata.text_as_html fields
    if write_html_from_text_as_html "${JSON_OUTPUT_FILEPATH}" "${HTML_OUTPUT_FILEPATH}" "${FILENAME}"; then
      html_written=true
      echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
    else
      echo "Error: failed to build HTML from ${JSON_OUTPUT_FILEPATH}" >&2
    fi
  else
    # most elements will not have metadata.text_as_html defined (by design only Table elements do),
    # so use the unstructured library's python script for the conversion.
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    if PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" \
      "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"; then
      html_written=true
      echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
    else
      echo "Error: HTML conversion script failed for ${JSON_OUTPUT_FILEPATH}" >&2
    fi
  fi

  # Open HTML file in browser if requested and on macOS; skipped when the
  # conversion above failed, since there is nothing valid to show.
  if [[ "$html_written" == true && "${OPEN_HTML:-false}" == true && "$(uname)" == "Darwin" ]]; then
    open "${HTML_OUTPUT_FILEPATH}"
  fi
fi
333
+
216
334
# write .json output to s3 location
217
335
if [ -n " $S3 " ]; then
218
336
0 commit comments