
Commit 3236545

crazydemodsingal0 authored and committed
test: move mistral / mixtral test cases in QA test list into the new accuracy test suite (#3440)
* add mistral-7b-v0.1 torch flow test case
* rearrange mistral
* rearrange mixtral case
* remove api function test
* move mistral nemo cases
* move mixtral cases
* update threshold
* fix failure
* fix name
* fix failure cases
* update list
* update threshold
* remove awq llmapi test
* adjust threshold
* fix ci
* fix partial comments
* fix path
* update thres
* update
* remove duplicate test case
* fix ci

---------

Signed-off-by: Ivy Zhang <[email protected]>
Signed-off-by: Dhruv Singal <[email protected]>
1 parent 2afc5c0 commit 3236545

File tree

11 files changed: +374 -1066 lines


tests/integration/defs/accuracy/references/cnn_dailymail.yaml

Lines changed: 42 additions & 0 deletions
@@ -149,6 +149,32 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 34.927
 mistralai/Mistral-7B-v0.1:
   - accuracy: 25.741
+  - extra_acc_spec: beam_width=4
+    accuracy: 28.368
+  - extra_acc_spec: beam_width=2
+    accuracy: 27.663
+  - extra_acc_spec: beam_width=1
+    accuracy: 25.604
+  - quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
+    accuracy: 24.806
+  - quant_algo: FP8
+    accuracy: 25.180
+  - quant_algo: W4A16_AWQ
+    accuracy: 24.806
+mistralai/Mistral-7B-Instruct-v0.3:
+  - quant_algo: W4A16
+    accuracy: 31.372
+  - quant_algo: W4A16_AWQ
+    accuracy: 31.457
+  - quant_algo: W4A8_AWQ
+    accuracy: 31.201
+mistralai/Mistral-Nemo-Base-2407:
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 24.0
+nvidia/Mistral-NeMo-Minitron-8B-Instruct:
+  - quant_algo: FP8
+    accuracy: 32.553
 mistralai/Mixtral-8x7B-v0.1:
   - accuracy: 28.810
   - quant_algo: NVFP4
@@ -157,6 +183,22 @@ mistralai/Mixtral-8x7B-v0.1:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 27.109
+  - quant_algo: W4A16
+    accuracy: 23.229
+  - quant_algo: W4A16_AWQ
+    accuracy: 23.229
+  - quant_algo: W8A16
+    accuracy: 27.624
+  - quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
+    accuracy: 27.979
+mistralai/Mixtral-8x7B-Instruct-v0.1:
+  - quant_algo: W4A16_AWQ
+    accuracy: 28.810
+mistralai/Mixtral-8x22B-v0.1:
+  - quant_algo: FP8
+    accuracy: 25.519
+  - quant_algo: W8A16
+    accuracy: 27.427
 google/gemma-2b:
   - accuracy: 23.194
   - quant_algo: W8A16
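
For context on the reference format: each model name in this YAML maps to a list of entries, an entry's optional quant_algo / kv_cache_quant_algo / extra_acc_spec keys identify one tested configuration, and accuracy is the reference score for that configuration. A minimal lookup sketch follows; lookup_reference is a hypothetical helper written for illustration, not the harness's actual code.

import yaml

# Hypothetical helper (illustration only): resolve a reference entry by its
# optional spec keys; entries lacking a key yield None via .get(), which
# matches the default arguments for an unquantized, unspecialized run.
def lookup_reference(refs, model, quant_algo=None,
                     kv_cache_quant_algo=None, extra_acc_spec=None):
    for entry in refs[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo
                and entry.get("extra_acc_spec") == extra_acc_spec):
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {model}")

with open("tests/integration/defs/accuracy/references/cnn_dailymail.yaml") as f:
    refs = yaml.safe_load(f)

# The beam_width=4 entry added above resolves to 28.368.
print(lookup_reference(refs, "mistralai/Mistral-7B-v0.1",
                       extra_acc_spec="beam_width=4"))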

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 14 additions & 0 deletions
@@ -35,6 +35,15 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 80.00
+mistralai/Mistral-7B-v0.1:
+  - accuracy: 66
+mistralai/Mistral-7B-Instruct-v0.3:
+  - quant_algo: W4A16
+    accuracy: 59.23
+  - quant_algo: W4A16_AWQ
+    accuracy: 61.06
+  - quant_algo: W4A8_AWQ
+    accuracy: 60.04
 mistralai/Mixtral-8x7B-v0.1:
   - accuracy: 71.35
   - quant_algo: FP8
@@ -43,6 +52,11 @@ mistralai/Mixtral-8x7B-v0.1:
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 69.64
+mistralai/Mixtral-8x7B-Instruct-v0.1:
+  - accuracy: 68.0
+mistralai/Mixtral-8x22B-v0.1:
+  - quant_algo: FP8
+    accuracy: 77.63
 google/gemma-2-9b-it:
   - accuracy: 73.05
 Qwen/Qwen2-0.5B-Instruct:

tests/integration/defs/accuracy/test_cli_flow.py

Lines changed: 151 additions & 0 deletions
@@ -794,6 +794,44 @@ def test_cyclic_kv_cache_beam_search(self):
         ])
 
 
+class TestMistral7B(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-7B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/mistral-7b-v0.1"
+    EXAMPLE_FOLDER = "models/core/llama"
+
+    @skip_pre_blackwell
+    def test_beam_search(self):
+        self.run(extra_acc_spec="beam_width=4",
+                 extra_build_args=["--gemm_plugin=auto", "--max_beam_width=4"],
+                 extra_summarize_args=["--num_beams=4"])
+        import gc
+
+        import torch
+        for num_beams in [1, 2]:
+            gc.collect()
+            torch.cuda.empty_cache()
+            self.extra_acc_spec = f"beam_width={num_beams}"
+            self.extra_summarize_args = [f"--num_beams={num_beams}"]
+            self.evaluate()
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(8)
+    def test_fp8_tp4pp2(self):
+        self.run(quant_algo=QuantAlgo.FP8,
+                 tp_size=4,
+                 pp_size=2,
+                 extra_convert_args=["--calib_size=4"],
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(4)
+    def test_smooth_quant_tp4pp1(self):
+        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
+                 tp_size=4,
+                 pp_size=1,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+
 class TestMixtral8x7B(CliFlowAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-v0.1"
@@ -804,6 +842,43 @@ class TestMixtral8x7B(CliFlowAccuracyTestHarness):
     def test_tp2(self):
         self.run(dtype='auto', tp_size=2)
 
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    def test_ootb_except_mha_tp8(self, moe_tp_size):
+        self.run(tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={0}"
+                 ],
+                 extra_build_args=[
+                     "--gemm_plugin=disable", "--moe_plugin=disable",
+                     f"--max_seq_len={8192}"
+                 ])
+
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    @pytest.mark.parametrize("moe_renorm_mode", [0, 1],
+                             ids=['no_renormalize', 'renormalize'])
+    def test_plugin_tp8(self, moe_tp_size, moe_renorm_mode):
+        self.run(tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={moe_renorm_mode}"
+                 ],
+                 extra_build_args=[
+                     "--gemm_plugin=auto", "--moe_plugin=auto",
+                     f"--max_seq_len={8192}"
+                 ])
+
     @skip_pre_ada
     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(80000)
@@ -835,6 +910,43 @@ def test_fp8_tp2pp2_manage_weights(self):
                  pp_size=2,
                  extra_build_args=["--fast_build"])
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_weight_only_int4_tp2(self):
+        self.run(quant_algo=QuantAlgo.W4A16,
+                 tp_size=2,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_weight_only_int8_tp2(self):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=2,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_less_device_memory(45000)
+    def test_pp_reduce_scatter_tp2pp2(self):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=2,
+                 pp_size=2,
+                 extra_build_args=[
+                     "--gemm_plugin=auto", "--pp_reduce_scatter=enable"
+                 ])
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_device_memory(180000)
+    def test_fp4_plugin(self):
+        build_args = [
+            "--max_input_len=2048", "--gemm_plugin=nvfp4",
+            "--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
+        ]
+        self.run(tasks=[MMLU(self.MODEL_NAME)],
+                 quant_algo=QuantAlgo.NVFP4,
+                 kv_cache_quant_algo=QuantAlgo.FP8,
+                 extra_build_args=build_args)
+
     @skip_pre_blackwell
     def test_nvfp4_prequantized(self, mocker):
         mocker.patch.object(
@@ -845,6 +957,45 @@ def test_nvfp4_prequantized(self, mocker):
             kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+class TestMixtral8x22B(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mixtral-8x22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Mixtral-8x22B-v0.1"
+    EXAMPLE_FOLDER = "models/core/llama"
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_fp8_tp2pp2(self):
+        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
+                        MMLU(self.MODEL_NAME)],
+                 quant_algo=QuantAlgo.FP8,
+                 tp_size=2,
+                 pp_size=2,
+                 extra_convert_args=["--calib_size=32"],
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    @pytest.mark.parametrize("moe_renorm_mode", [0, 1],
+                             ids=['no_renormalize', 'renormalize'])
+    def test_int8_plugin_tp8(self, moe_tp_size, moe_renorm_mode):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={moe_renorm_mode}"
+                 ],
+                 extra_build_args=[
+                     "--max_beam_width=4", "--gemm_plugin=auto",
+                     "--moe_plugin=auto", f"--max_seq_len={8192}"
+                 ])
+
+
 class TestGemma2B(CliFlowAccuracyTestHarness):
     MODEL_NAME = "google/gemma-2b"
     MODEL_PATH = f"{llm_models_root()}/gemma/gemma-2b"

tests/integration/defs/accuracy/test_llm_api.py

Lines changed: 78 additions & 0 deletions
@@ -38,6 +38,58 @@ def test_fp8_rowwise(self):
         task.evaluate(llm)
 
 
+class TestMistral7B_0_3(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.3"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-7B-Instruct-v0.3"
+
+    @skip_post_blackwell
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_less_device_memory(80000)
+    @pytest.mark.parametrize("quant", ['int4', 'int4_awq', 'int8_awq'])
+    def test_quant_tp4(self, quant):
+        if quant == 'int4':
+            quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16)
+        elif quant == 'int4_awq':
+            quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ)
+        elif quant == 'int8_awq':
+            quant_config = QuantConfig(quant_algo=QuantAlgo.W4A8_AWQ)
+
+        with LLM(self.MODEL_PATH,
+                 tensor_parallel_size=4,
+                 quant_config=quant_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
+
+    def test_fp8(self):
+        quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
+                                   kv_cache_quant_algo=QuantAlgo.FP8)
+
+        with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Mistral-NeMo-Minitron-8B-Instruct"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-NeMo-Minitron-8B-Instruct"
+
+    @skip_pre_ada
+    def test_fp8(self):
+        quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
+
+        with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestMixtral8x7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-v0.1"
@@ -50,6 +102,32 @@ def test_tp2(self):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(4)
+    def test_smooth_quant_tp2pp2(self):
+        quant_config = QuantConfig(
+            quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 tensor_parallel_size=2,
+                 pipeline_parallel_size=2) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestMixtral8x7BInstruct(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-Instruct-v0.1"
+
+    @skip_post_blackwell
+    def test_awq_tp2(self):
+        quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16_AWQ)
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 tensor_parallel_size=2) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
