
Commit dc944fe

Add torchao checkpoint tests (#14074)
This PR adds new tests verifying that the pre-quantized model checkpoints we publish under the pytorch org on Hugging Face work with ExecuTorch (lowering and the C++ runner). Qwen3-4B is tested for both lowering and runtime; Phi-4-mini is tested for lowering only. There appears to be a regression in the C++ HF tokenizer used in ExecuTorch, and it no longer works with the Phi-4-mini tokenizer. See #14077
1 parent 8973eeb commit dc944fe
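
For local debugging, the same check can be run from the ExecuTorch repo root. A minimal sketch, assuming an ExecuTorch development install and the `hf` CLI from `huggingface_hub[cli]` (which the CI job installs):

```bash
# Lower the published INT8-INT4 Qwen3-4B checkpoint, size-check the .pte, then build and run llama_main on it.
# phi_4_mini works the same way but is invoked without --test_with_runner (see the tokenizer issue above).
bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b --test_with_runner
```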

File tree

.ci/scripts/test_torchao_huggingface_checkpoints.sh
.github/workflows/trunk.yml

2 files changed: +172 −2 lines changed
.ci/scripts/test_torchao_huggingface_checkpoints.sh

Lines changed: 139 additions & 0 deletions
@@ -0,0 +1,139 @@
#!/usr/bin/env bash
set -euo pipefail

# -------------------------
# Args / flags
# -------------------------
TEST_WITH_RUNNER=0
MODEL_NAME=""

# Parse args
if [[ $# -lt 1 ]]; then
  echo "Usage: $0 <model_name> [--test_with_runner]"
  echo "Supported model_name values: qwen3_4b, phi_4_mini"
  exit 1
fi

MODEL_NAME="$1"
shift

while [[ $# -gt 0 ]]; do
  case "$1" in
    --test_with_runner)
      TEST_WITH_RUNNER=1
      ;;
    -h|--help)
      echo "Usage: $0 <model_name> [--test_with_runner]"
      echo "  model_name: qwen3_4b | phi_4_mini"
      echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      exit 1
      ;;
  esac
  shift
done

if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

MODEL_OUT=model.pte

case "$MODEL_NAME" in
  qwen3_4b)
    echo "Running Qwen3-4B export..."
    HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4)
    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
    $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \
      $HF_MODEL_DIR \
      pytorch_model_converted.bin

    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
      --model "qwen3_4b" \
      --checkpoint pytorch_model_converted.bin \
      --params examples/models/qwen3/config/4b_config.json \
      --output_name $MODEL_OUT \
      -kv \
      --use_sdpa_with_kv_cache \
      -X \
      --xnnpack-extended-ops \
      --max_context_length 1024 \
      --max_seq_length 1024 \
      --dtype fp32 \
      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
    ;;

  phi_4_mini)
    echo "Running Phi-4-mini export..."
    HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4)
    EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
    $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \
      $HF_MODEL_DIR \
      pytorch_model_converted.bin

    $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
      --model "phi_4_mini" \
      --checkpoint pytorch_model_converted.bin \
      --params examples/models/phi_4_mini/config/config.json \
      --output_name $MODEL_OUT \
      -kv \
      --use_sdpa_with_kv_cache \
      -X \
      --xnnpack-extended-ops \
      --max_context_length 1024 \
      --max_seq_length 1024 \
      --dtype fp32 \
      --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
    ;;

  *)
    echo "Error: unsupported model_name '$MODEL_NAME'"
    echo "Supported values: qwen3_4b, phi_4_mini"
    exit 1
    ;;
esac

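# Rough sizing intuition for the 3 GB bound (back-of-envelope estimate, not taken from the script itself):
# both checkpoints are roughly 4B parameters, so int4 weights come to about 4e9 * 0.5 bytes ≈ 2 GB,
# plus int8 embeddings and metadata, which keeps a correctly quantized .pte under 3 GB.
# An accidental fp32 (~16 GB) or bf16 (~8 GB) export would fail the size check below.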
# Check file size
MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT)
if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
  echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND"
  exit 1
fi

# Install ET with CMake
if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
  echo "[runner] Building and testing llama_main ..."
  cmake -DPYTHON_EXECUTABLE=python \
    -DCMAKE_INSTALL_PREFIX=cmake-out \
    -DEXECUTORCH_ENABLE_LOGGING=1 \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
    -DEXECUTORCH_BUILD_XNNPACK=ON \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
    -Bcmake-out .
  cmake --build cmake-out -j16 --config Release --target install

  # Install llama runner
  cmake -DPYTHON_EXECUTABLE=python \
    -DCMAKE_BUILD_TYPE=Release \
    -Bcmake-out/examples/models/llama \
    examples/models/llama
  cmake --build cmake-out/examples/models/llama -j16 --config Release

  # Run the model
  ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
fi

# Clean up
rm -f pytorch_model_converted.bin "$MODEL_OUT"

.github/workflows/trunk.yml

Lines changed: 33 additions & 2 deletions
@@ -582,6 +582,37 @@ jobs:
         # Test llama2
         PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}"

+  test-torchao-huggingface-checkpoints:
+    name: test-torchao-huggingface-checkpoints
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        model: [qwen3_4b, phi_4_mini]
+        include:
+          - model: qwen3_4b
+            test_with_runner: true
+          - model: phi_4_mini
+            test_with_runner: false
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        pip install -U "huggingface_hub[cli]"
+
+        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}
+
   # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
   # test-llava-runner-macos:
   #   name: test-llava-runner-macos
@@ -990,13 +1021,13 @@ jobs:
       timeout: 60
       script: |
         conda init powershell
-
+
         powershell -Command "& {
           Set-PSDebug -Trace 1
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true

-          .ci/scripts/setup-windows.ps1
+          .ci/scripts/setup-windows.ps1

           powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
         }"
