@@ -4638,6 +4638,77 @@ def test_qnn_backend_generate_optrace(self):
 
 
 class TestExampleLLMScript(TestQNN):
+    def test_static_gemma3_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+
+        prompt = "My favourite condiment is "
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w_block",
+            "--temperature",
+            "0",
+            "--decoder_model",
+            "gemma3-1b",
+            "--model_mode",
+            "kv",
+            "--max_seq_len",
+            "1024",
+            "--eval_perplexity",
+            "--tasks",
+            "wikitext",
+            "--limit",
+            "1",
+            "--enable_masked_softmax",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    self.assertLessEqual(msg["wiki_ppl"], 23)
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1_200_000_000)  # 1.2GB
+                inference_speed_ref = {"SM8650": 70, "SM8750": 100}
+                if (
+                    not self.compile_only
+                    and not self.enable_x86_64
+                    and self.model in inference_speed_ref
+                ):
+                    self.assertGreaterEqual(
+                        msg["inference_speed"], inference_speed_ref[self.model]
+                    )
+
     def test_llama3_2_1b(self):
         if not self.required_envs():
             self.skipTest("missing required envs")
@@ -4708,7 +4779,7 @@ def test_llama3_2_1b(self):
                 # Inference speed on x86 is slow, so we only check when running on Android
                 if not self.enable_x86_64:
                     pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 1300000000)
+                    self.assertLessEqual(pte_size, 1_300_000_000)  # 1.3GB
                 if not self.compile_only and not self.enable_x86_64:
                     self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
 
@@ -4784,7 +4855,7 @@ def test_llama_stories_260k(self):
                 # x86 does not allow weight sharing, so we don't check pte size
                 if not self.enable_x86_64:
                     pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 2020000)
+                    self.assertLessEqual(pte_size, 2_020_000)  # 2MB
                 if not self.compile_only and not self.enable_x86_64:
                     self.assertGreaterEqual(msg["inference_speed"], 1600)  # Lanai
 
@@ -4859,7 +4930,7 @@ def test_llama_stories_110m(self):
                 # x86 does not allow weight sharing, so we don't check pte size
                 if not self.enable_x86_64:
                     pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
+                    self.assertLessEqual(pte_size, 130_000_000)  # 130MB
                 if not self.compile_only and not self.enable_x86_64:
                     self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
 
@@ -4922,7 +4993,7 @@ def test_static_phi4(self):
             else:
                 inference_speed_ref = {"SM8650": 14, "SM8750": 19}
                 self.assertLessEqual(msg["wiki_ppl"], 12)
-                self.assertLessEqual(msg["pte_size"], 4000000000)  # 4gb
+                self.assertLessEqual(msg["pte_size"], 4_000_000_000)  # 4GB
                 if self.model in inference_speed_ref:
                     self.assertGreaterEqual(
                         msg["inference_speed"], inference_speed_ref[self.model]
@@ -4981,7 +5052,7 @@ def test_static_qwen2_5(self):
             else:
                 inference_speed_ref = {"SM8650": 110, "SM8750": 130}
                 self.assertLessEqual(msg["wiki_ppl"], 15)
-                self.assertLessEqual(msg["pte_size"], 800000000)  # 800mb
+                self.assertLessEqual(msg["pte_size"], 800_000_000)  # 800MB
                 if self.model in inference_speed_ref:
                     self.assertGreaterEqual(
                         msg["inference_speed"], inference_speed_ref[self.model]
@@ -5040,7 +5111,7 @@ def test_static_qwen3(self):
             else:
                 inference_speed_ref = {"SM8650": 38, "SM8750": 56}
                 self.assertLessEqual(msg["wiki_ppl"], 18)
-                self.assertLessEqual(msg["pte_size"], 950_000_000)  # 950mb
+                self.assertLessEqual(msg["pte_size"], 950_000_000)  # 950MB
                 if self.model in inference_speed_ref:
                     self.assertGreaterEqual(
                         msg["inference_speed"], inference_speed_ref[self.model]
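
Note: the new test reuses the result channel the other LLM tests in this file rely on. The spawned llama.py script connects back to the test harness over --ip/--port and sends a JSON payload, which the test reads via conn.recv() on the accepted connection. A minimal, self-contained sketch of that handshake follows; the helper names (report_results, collect_results) are hypothetical and for illustration only, not part of the PR.

    # Sketch of the Listener/Client handshake between the test and the
    # example script, using only the stdlib multiprocessing.connection API.
    import json
    from multiprocessing.connection import Client, Listener

    def report_results(ip: str, port: int, results: dict) -> None:
        # Runs in the spawned example script: connect back to the harness
        # and send metrics such as wiki_ppl, pte_size, and inference_speed.
        with Client((ip, port)) as conn:
            conn.send(json.dumps(results))

    def collect_results(ip: str, port: int) -> dict:
        # Runs in the test: accept one connection and decode the payload,
        # mirroring the json.loads(conn.recv()) pattern in the diff above.
        with Listener((ip, port)) as listener:
            with listener.accept() as conn:
                return json.loads(conn.recv())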