Qualcomm AI Engine Direct - backward compatibility CI #12748

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged

merged 1 commit on Jul 26, 2025
10 changes: 9 additions & 1 deletion .ci/scripts/test_qnn_static_llama.sh
@@ -41,6 +41,10 @@ exit_code1=$?
$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
exit_code2=$?

# Check BC
bash backends/qualcomm/bc/test_qnn_static_llama_bc.sh
exit_code3=$?

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile only with weight sharing test failed. $exit_code1."
@@ -50,8 +54,12 @@ if [ $exit_code2 -ne 0 ]; then
echo "Static Llama accuracy test failed. $exit_code2."
fi

if [ $exit_code3 -ne 0 ]; then
  echo "Static Llama BACKWARD COMPATIBILITY test failed. $exit_code3."
fi

# Return failure if any of the tests failed
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ] || [ $exit_code3 -ne 0 ]; then
  exit 1
else
  exit 0
57 changes: 57 additions & 0 deletions backends/qualcomm/bc/test_qnn_static_llama_bc.sh
@@ -0,0 +1,57 @@
#!/bin/bash
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
  PYTHON_EXECUTABLE=python3
fi

which "${PYTHON_EXECUTABLE}"


llama_artifacts="."
PTE_ARTIFACT="examples/qualcomm/oss_scripts/llama/artifacts"

# Download stories260K.pt and the tokenizer from Hugging Face
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt" --output stories260K.pt
curl -Ls "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model" --output tokenizer.model
# Create params.json file
touch params.json
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json

# Checks e2e accuracy
expected=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts $llama_artifacts --enable_x86_64 | grep "Model CI result:")
exit_code1=$?

# Checks accuracy with precompiled
output=$($PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir $PTE_ARTIFACT --llama_artifacts $llama_artifacts --enable_x86_64 --pre_gen_pte $PTE_ARTIFACT | grep "Model CI result:")
exit_code2=$?

if [[ "$output" == "$expected" ]]; then
echo "[BACKWARD COMPATIBILITY CHECK] Output matches expected result."
else
echo "[BACKWARD COMPATIBILITY CHECK] Output mismatch!"
echo "[BACKWARD COMPATIBILITY CHECK] Expected: $expected"
echo "[BACKWARD COMPATIBILITY CHECK] Actual: $output"
exit 1
fi

# Check the exit codes and print messages
if [ $exit_code1 -ne 0 ]; then
  echo "Static Llama compile only test failed. $exit_code1."
fi

if [ $exit_code2 -ne 0 ]; then
  echo "Static Llama execute precompiled test failed. $exit_code2."
fi

# Return failure if either program failed
if [ $exit_code1 -ne 0 ] || [ $exit_code2 -ne 0 ]; then
  exit 1
else
  exit 0
fi
78 changes: 78 additions & 0 deletions backends/qualcomm/tests/test_qnn_delegate.py
@@ -4094,6 +4094,84 @@ def test_llama3_2_1b(self):
        if not self.compile_only and not self.enable_x86_64:
            self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai

    def test_llama_stories_260k(self):
        if not self.required_envs():
            self.skipTest("missing required envs")
        assert (
            self.llama_artifacts is not None
        ), "Please provide path to llama artifacts"

        prompt = "Once"
        cmds = [
            "python",
            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
            "--artifact",
            self.artifact_dir,
            "--build_folder",
            self.build_folder,
            "--model",
            self.model,
            "--checkpoint",
            f"{self.llama_artifacts}/stories260K.pt",
            "--params",
            f"{self.llama_artifacts}/params.json",
            "--tokenizer_model",
            f"{self.llama_artifacts}/tokenizer.model",
            "--tokenizer_bin",
            f"{self.llama_artifacts}/tokenizer.bin",
            "--ip",
            self.ip,
            "--port",
            str(self.port),
            "--prompt",
            f"{prompt}",
            "--ptq",
            "16a4w",
            "--temperature",
            "0",
            "--decoder_model",
            "stories260k",
            "--model_mode",
            "hybrid",
            "--prefill_ar_len",
            "32",
            "--max_seq_len",
            "128",
        ]
        if self.compile_only:
            cmds.extend(["--compile_only"])
        elif self.device:
            cmds.extend(["--device", self.device])
            if self.host:
                cmds.extend(["--host", self.host])
        elif self.enable_x86_64:
            cmds.extend(["--enable_x86_64"])
        if self.pre_gen_pte:
            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])

        golden_start_with = "Once upon a time,"
        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
        with Listener((self.ip, self.port)) as listener:
            conn = listener.accept()
            p.communicate()
            msg = json.loads(conn.recv())
            if "Error" in msg:
                self.fail(msg["Error"])
            else:
                if not self.compile_only:
                    model_out = msg["result"][0]
                    print(f"Model CI result:{model_out[: len(golden_start_with)]}")
                    self.assertTrue(
                        model_out.startswith(golden_start_with),
                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
                    )
                # x86 does not allow weight sharing, so we don't check pte size
                if not self.enable_x86_64:
                    pte_size = msg["pte_size"]
                    self.assertLessEqual(pte_size, 2020000)
                if not self.compile_only and not self.enable_x86_64:
                    self.assertGreaterEqual(msg["inference_speed"], 1600)  # Lanai

    def test_llama_stories_110m(self):
        if not self.required_envs():
            self.skipTest("missing required envs")
47 changes: 47 additions & 0 deletions examples/qualcomm/oss_scripts/llama/artifacts/README.md
@@ -0,0 +1,47 @@
# Artifacts folder for LLaMA backward compatibility validation
This folder contains the stories260K (a smaller LLaMA variant) .pte artifact used for backward compatibility (BC) validation in CI pipelines.

Model source: [karpathy/tinyllamas/stories260K](https://huggingface.co/karpathy/tinyllamas/tree/main/stories260K)

## Purpose
The .pte files stored here serve as reference artifacts to ensure that changes to ExecuTorch do not introduce backward-incompatible behavior.

These files are used in CI to:
1. Compile the stories LLaMA model at the previous (n-1) commit.
2. Run and validate it at the current (n) commit.

We use the stories260K model because it is a minimal LLaMA variant, making it ideal for efficient validation in CI pipelines.
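
At its core, the check compares the generated text from a freshly compiled .pte against the text produced by the precompiled .pte stored in this folder. Below is a minimal sketch of that comparison; it assumes the stories260K checkpoint, tokenizer, and params.json have already been prepared in the repository root (see "Updating Artifacts" below) and uses plain `python` rather than the CI's `$PYTHON_EXECUTABLE`. The authoritative version lives in backends/qualcomm/bc/test_qnn_static_llama_bc.sh.

```bash
# Minimal sketch of the BC comparison; see
# backends/qualcomm/bc/test_qnn_static_llama_bc.sh for the authoritative version.
ARTIFACTS="examples/qualcomm/oss_scripts/llama/artifacts"

# Freshly compile and run stories260K at the current commit.
expected=$(python backends/qualcomm/tests/test_qnn_delegate.py \
  -k TestExampleLLMScript.test_llama_stories_260k \
  --model SM8650 --build_folder build-x86/ --executorch_root . \
  --artifact_dir . --llama_artifacts . --enable_x86_64 \
  | grep "Model CI result:")

# Run the precompiled .pte (built at an earlier commit) with the current runtime.
actual=$(python backends/qualcomm/tests/test_qnn_delegate.py \
  -k TestExampleLLMScript.test_llama_stories_260k \
  --model SM8650 --build_folder build-x86/ --executorch_root . \
  --artifact_dir "$ARTIFACTS" --llama_artifacts . --enable_x86_64 \
  --pre_gen_pte "$ARTIFACTS" \
  | grep "Model CI result:")

if [ "$actual" != "$expected" ]; then
  echo "[BACKWARD COMPATIBILITY CHECK] Output mismatch!"
  exit 1
fi
```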

## File Structure
- stories260k_hybrid_llama_qnn.pte: the precompiled stories LLaMA model used for backward compatibility validation.

## Updating Artifacts
To update the .pte file, follow these steps:

1. Check out the latest commit before all of your changes (i.e., the previous, n-1, commit).

2. Download and prepare stories260K model

```bash
# tokenizer.model & stories260K.pt:
wget "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/stories260K.pt"
wget -O tokenizer.model "https://huggingface.co/karpathy/tinyllamas/resolve/main/stories260K/tok512.model"

# tokenizer.bin:
python -m pytorch_tokenizers.tools.llama2c.convert -t tokenizer.model -o tokenizer.bin

# params.json:
echo '{"dim": 64, "n_layers": 5, "n_heads": 8, "n_kv_heads": 4, "vocab_size": 512, "multiple_of": 4, "max_seq_len": 512}' > params.json
```

3. Run the following command to regenerate and update the .pte file:

```bash
# Compile only; weight sharing is disabled because x86 does not support weight sharing.
python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_260k --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir ./examples/qualcomm/oss_scripts/llama/artifacts --llama_artifacts . --enable_x86_64 --compile_only

```
4. Commit the updated stories260k_hybrid_llama_qnn.pte file to the repository.

5. Update this README if necessary, then commit your changes.

Note: The .pte file is large (~2MB). In the future, we may host it on Hugging Face and download it during CI to reduce repository size.
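
If the artifact does move to Hugging Face, CI could fetch it instead of reading it from the repository. The snippet below is a hypothetical sketch only; the Hugging Face repository and path shown do not exist yet.

```bash
# Hypothetical sketch only: this Hugging Face repository and path do not exist yet.
HF_PTE_URL="https://huggingface.co/<org>/<repo>/resolve/main/stories260k_hybrid_llama_qnn.pte"
curl -Ls "${HF_PTE_URL}" \
  --output examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte
```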
Binary file examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte not shown.
24 changes: 16 additions & 8 deletions examples/qualcomm/oss_scripts/llama/llama.py
@@ -616,6 +616,9 @@ def compile(args, pte_filename, tokenizer):
if "model" in state_dict:
state_dict = state_dict["model"]

if args.decoder_model == "stories260k":
state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}

# Change to HuggingFace weight to improve the performance of RoPE in HTP backend.
def permute(w, heads):
dim_0 = w.size(0)
@@ -751,7 +754,7 @@ def permute(w, heads):
annotate_conv=args.ptq != "16a8w",
),
)
if args.decoder_model == "stories110m":
if args.decoder_model == {"stories110m", "stories260k"}:
custom_annotations = custom_annotations + (
annotate_linear_16a8w_in_affine_layer,
)
@@ -946,7 +949,7 @@ def post_process():
f"--model_path {pte_path}",
f"--seq_len {seq_len}",
f"--output_path {args.artifact}/outputs/outputs.txt",
f"--performance_output_path {performance_output_path}",
f"--performance_output_path {args.artifact}/{performance_output_path}",
f"--kv_updater ShiftPointer",
runner_args,
]
@@ -995,7 +998,9 @@ def post_process():
adb.pull(output_path=args.artifact, callback=post_process)
if args.ip and args.port != -1:
inference_speed = 0
with open(f"{args.artifact}/{performance_output_path}", "r") as f:
with open(
f"{os.path.abspath(args.artifact)}/{performance_output_path}", "r"
) as f:
inference_speed = float(f.read())

pte_size = os.path.getsize(pte_path)
@@ -1033,8 +1038,8 @@ def _build_parser():

    parser.add_argument(
        "--decoder_model",
        choices=["stories110m", "llama3_2", "qwen2_5"],
        help="The Llama model to export. Current available options are: [stories110m, llama3_2, qwen2_5]",
        choices=["stories260k", "stories110m", "llama3_2", "qwen2_5"],
        help="The Llama model to export. Current available options are: [stories260k, stories110m, llama3_2, qwen2_5]",
        required=True,
    )

@@ -1208,16 +1213,19 @@ def export_llama(args) -> None:
    else:
        raise RuntimeError(f"Unknown model_mode: {args.model_mode}.")

    if args.decoder_model == "stories260k":
        pte_filename = f"{args.decoder_model}_" + pte_filename

    tokenizer = None
    runtime_tokenizer_path, decoder_model_version = "", ""
    if args.decoder_model == "stories110m":
    if args.decoder_model in {"stories110m", "stories260k"}:
        tokenizer = get_tokenizer(args.tokenizer_model)
        assert isinstance(
            tokenizer, SentencePieceTokenizer
        ), f"Wrong tokenizer provided for stories110m."
        ), "Wrong tokenizer provided for stories."
        assert (
            args.tokenizer_bin is not None
        ), "Please provide tokenizer_bin for stories110m."
        ), "Please provide tokenizer_bin for stories."
        runtime_tokenizer_path = args.tokenizer_bin
        decoder_model_version = "llama2"
    elif args.decoder_model == "llama3_2":