Skip to content

Commit 19fc1fc

Browse files
authored
feat: convenience unstructured-get-json.sh update (#3971)
* script now supports:
  * the --vlm flag, to process the document with the VLM strategy
  * optionally takes --vlm-model, --vlm-provider args
  * optionally also writes .html outputs by converting unstructured .json output
  * optionally opens those .html outputs in a browser

Tested with:

```
unstructured-get-json.sh --write-html --open-html --fast layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --hi-res layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --ocr-only layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --vlm layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider openai --vlm-model gpt-4o layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider vertexai --vlm-model gemini-2.0-flash-001 layout-parser-paper-p2.pdf
unstructured-get-json.sh --write-html --open-html --vlm --vlm-provider anthropic --vlm-model claude-3-5-sonnet-20241022 layout-parser-paper-p2.pdf
```

[layout-parser-paper-p2.pdf](https://github.com/user-attachments/files/19514007/layout-parser-paper-p2.pdf)
1 parent 9a239fa commit 19fc1fc

File tree

4 files changed

+129
-2
lines changed

4 files changed

+129
-2
lines changed

.gitignore

+2-1
Original file line numberDiff line numberDiff line change
@@ -208,4 +208,5 @@ outputhtmldiff.txt
208208
metricsdiff.txt
209209

210210
# analysis
211-
annotated/
211+
annotated/
212+
.aider*

CHANGELOG.md

+8
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
## 0.17.6-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
19
## 0.17.5
210

311
### Enhancements

scripts/user/unstructured-get-json.sh

+118
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,20 @@ Options:
1616
--hi-res hi_res strategy: Enable high-resolution processing, with layout segmentation and OCR
1717
--fast fast strategy: No OCR, just extract embedded text
1818
--ocr-only ocr_only strategy: Perform OCR (Optical Character Recognition) only. No layout segmentation.
19+
--vlm vlm strategy: Use Vision Language Model for processing
20+
--vlm-provider Specify the VLM model provider
21+
(see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
22+
--vlm-model Specify the VLM model when using
23+
(see: https://docs.unstructured.io/api-reference/workflow/workflows#vlm-strategy)
1924
--tables Enable table extraction: tables are represented as html in metadata
2025
--images Include base64images in json
2126
--coordinates Include coordinates in the output
2227
--trace Enable trace logging for debugging, useful to cut and paste the executed curl call
2328
--verbose Enable verbose logging including printing first 8 elements to stdout
2429
--s3 Write the resulting output to s3 (like a pastebin)
30+
--write-html Convert JSON output to HTML. Set the env var $UNST_WRITE_HTML to skip providing this option.
31+
--open-html Automatically open HTML output in browser (macOS only) if --write-html.
32+
Set the env var UNST_AUTO_OPEN_HTML=true to skip providing this option.
2533
--help Display this help and exit.
2634
2735
@@ -64,6 +72,7 @@ copy_to_clipboard() {
6472
HI_RES=false
6573
FAST=false
6674
OCR_ONLY=false
75+
VLM=false
6776
STRATEGY=""
6877
VERBOSE=false
6978
TRACE=false
@@ -72,6 +81,10 @@ FREEMIUM=false
7281
TABLES=true
7382
IMAGES=false
7483
S3=""
84+
WRITE_HTML=${UNST_WRITE_HTML:-false}
85+
OPEN_HTML=${UNST_AUTO_OPEN_HTML:-false}
86+
VLM_PROVIDER=""
87+
VLM_MODEL=""
7588

7689
while [[ "$#" -gt 0 ]]; do
7790
case "$1" in
@@ -87,6 +100,28 @@ while [[ "$#" -gt 0 ]]; do
87100
OCR_ONLY=true
88101
shift
89102
;;
103+
--vlm)
104+
VLM=true
105+
shift
106+
;;
107+
--vlm-provider)
108+
if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
109+
VLM_PROVIDER=$2
110+
shift 2
111+
else
112+
echo "Error: Argument for $1 is missing" >&2
113+
exit 1
114+
fi
115+
;;
116+
--vlm-model)
117+
if [ -n "$2" ] && [ "${2:0:1}" != "-" ]; then
118+
VLM_MODEL=$2
119+
shift 2
120+
else
121+
echo "Error: Argument for $1 is missing" >&2
122+
exit 1
123+
fi
124+
;;
90125
--trace)
91126
TRACE=true
92127
shift
@@ -99,6 +134,14 @@ while [[ "$#" -gt 0 ]]; do
99134
S3=true
100135
shift
101136
;;
137+
--write-html)
138+
WRITE_HTML=true
139+
shift
140+
;;
141+
--open-html)
142+
OPEN_HTML=true
143+
shift
144+
;;
102145
--tables)
103146
TABLES=true
104147
shift
@@ -140,6 +183,24 @@ if [ -z "$INPUT" ]; then
140183
exit 1
141184
fi
142185

186+
# Check for strategy conflicts after all arguments are processed
187+
STRATEGY_COUNT=0
188+
$HI_RES && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
189+
$FAST && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
190+
$OCR_ONLY && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
191+
$VLM && STRATEGY_COUNT=$((STRATEGY_COUNT + 1))
192+
193+
if [ "$STRATEGY_COUNT" -gt 1 ]; then
194+
echo "Error: Only one strategy option (--hi-res, --fast, --ocr-only, --vlm) can be specified at a time."
195+
exit 1
196+
fi
197+
198+
# Check if vlm-provider or vlm-model are provided without --vlm
199+
if { [ -n "$VLM_PROVIDER" ] || [ -n "$VLM_MODEL" ]; } && ! $VLM; then
200+
echo "Error: --vlm-provider or --vlm-model can only be used with --vlm strategy."
201+
exit 1
202+
fi
203+
143204
if $TRACE; then
144205
set -x
145206
fi
@@ -175,6 +236,25 @@ elif $OCR_ONLY; then
175236
STRATEGY="-ocr-only"
176237
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
177238
CURL_STRATEGY=(-F "strategy=ocr_only")
239+
elif $VLM; then
240+
if $VERBOSE; then echo "Sending API request with vlm strategy"; fi
241+
STRATEGY="-vlm"
242+
# Add provider and model to filename if specified
243+
if [ -n "$VLM_PROVIDER" ] && [ -n "$VLM_MODEL" ]; then
244+
STRATEGY="-vlm-${VLM_PROVIDER}-${VLM_MODEL}"
245+
elif [ -n "$VLM_PROVIDER" ]; then
246+
STRATEGY="-vlm-${VLM_PROVIDER}"
247+
elif [ -n "$VLM_MODEL" ]; then
248+
STRATEGY="-vlm-model-${VLM_MODEL}"
249+
fi
250+
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
251+
CURL_STRATEGY=(-F "strategy=vlm")
252+
if [ -n "$VLM_PROVIDER" ]; then
253+
CURL_STRATEGY+=(-F "vlm_model_provider=$VLM_PROVIDER")
254+
fi
255+
if [ -n "$VLM_MODEL" ]; then
256+
CURL_STRATEGY+=(-F "vlm_model=$VLM_MODEL")
257+
fi
178258
else
179259
if $VERBOSE; then echo "Sending API request WITHOUT a strategy"; fi
180260
JSON_OUTPUT_FILEPATH=${TMP_OUTPUTS_DIR}/${FILENAME}${STRATEGY}.json
@@ -213,6 +293,44 @@ else
213293
fi
214294
echo "JSON Output file: ${JSON_OUTPUT_FILEPATH}"
215295

296+
# Convert JSON to HTML if requested
297+
if [ "$WRITE_HTML" = true ]; then
298+
HTML_OUTPUT_FILEPATH=${JSON_OUTPUT_FILEPATH%.json}.html
299+
300+
if $VLM; then
301+
# VLM output has all metadata.text_as_html fields defined, so
302+
# create HTML directly from the metadata.text_as_html fields
303+
{
304+
echo "<!DOCTYPE html>"
305+
echo "<html>"
306+
echo "<head>"
307+
echo " <meta charset=\"UTF-8\">"
308+
echo " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">"
309+
echo " <title>${FILENAME}</title>"
310+
echo " <style>"
311+
echo " body { font-family: Arial, sans-serif; line-height: 1.6; margin: 20px; }"
312+
echo " </style>"
313+
echo "</head>"
314+
echo "<body>"
315+
jq -r 'map(.metadata.text_as_html) | join("\n")' "${JSON_OUTPUT_FILEPATH}"
316+
echo "</body>"
317+
echo "</html>"
318+
} >"${HTML_OUTPUT_FILEPATH}"
319+
echo "HTML written directly from metadata.text_as_html fields to: ${HTML_OUTPUT_FILEPATH}"
320+
else
321+
# most elements will not have metadata.text_as_html defined (by design, only Table elements do),
322+
# so use the unstructured library's python script for the conversion.
323+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
324+
PYTHONPATH="${SCRIPT_DIR}/../.." python3 "${SCRIPT_DIR}/../html/elements_json_to_html.py" "${JSON_OUTPUT_FILEPATH}" --outdir "${TMP_OUTPUTS_DIR}"
325+
echo "HTML written using Python script to: ${HTML_OUTPUT_FILEPATH}"
326+
fi
327+
328+
# Open HTML file in browser if requested and on macOS
329+
if [ "$OPEN_HTML" = true ] && [ "$(uname)" == "Darwin" ]; then
330+
open "${HTML_OUTPUT_FILEPATH}"
331+
fi
332+
fi
333+
216334
# write .json output to s3 location
217335
if [ -n "$S3" ]; then
218336

unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.17.5" # pragma: no cover
1+
__version__ = "0.17.6-dev0" # pragma: no cover

0 commit comments

Comments
 (0)