@@ -794,6 +794,44 @@ def test_cyclic_kv_cache_beam_search(self):
                  ])
 
 
+class TestMistral7B(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-7B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/mistral-7b-v0.1"
+    EXAMPLE_FOLDER = "models/core/llama"
+
+    @skip_pre_blackwell
+    def test_beam_search(self):
+        self.run(extra_acc_spec="beam_width=4",
+                 extra_build_args=["--gemm_plugin=auto", "--max_beam_width=4"],
+                 extra_summarize_args=["--num_beams=4"])
+        import gc
+
+        import torch
+        for num_beams in [1, 2]:
+            gc.collect()
+            torch.cuda.empty_cache()
+            self.extra_acc_spec = f"beam_width={num_beams}"
+            self.extra_summarize_args = [f"--num_beams={num_beams}"]
+            self.evaluate()
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(8)
+    def test_fp8_tp4pp2(self):
+        self.run(quant_algo=QuantAlgo.FP8,
+                 tp_size=4,
+                 pp_size=2,
+                 extra_convert_args=["--calib_size=4"],
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(4)
+    def test_smooth_quant_tp4pp1(self):
+        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
+                 tp_size=4,
+                 pp_size=1,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+
 class TestMixtral8x7B(CliFlowAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-v0.1"
@@ -804,6 +842,43 @@ class TestMixtral8x7B(CliFlowAccuracyTestHarness):
     def test_tp2(self):
         self.run(dtype='auto', tp_size=2)
 
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    def test_ootb_except_mha_tp8(self, moe_tp_size):
+        self.run(tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={0}"
+                 ],
+                 extra_build_args=[
+                     "--gemm_plugin=disable", "--moe_plugin=disable",
+                     f"--max_seq_len={8192}"
+                 ])
+
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    @pytest.mark.parametrize("moe_renorm_mode", [0, 1],
+                             ids=['no_renormalize', 'renormalize'])
+    def test_plugin_tp8(self, moe_tp_size, moe_renorm_mode):
+        self.run(tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={moe_renorm_mode}"
+                 ],
+                 extra_build_args=[
+                     "--gemm_plugin=auto", "--moe_plugin=auto",
+                     f"--max_seq_len={8192}"
+                 ])
+
     @skip_pre_ada
     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(80000)
@@ -835,6 +910,43 @@ def test_fp8_tp2pp2_manage_weights(self):
                  pp_size=2,
                  extra_build_args=["--fast_build"])
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_weight_only_int4_tp2(self):
+        self.run(quant_algo=QuantAlgo.W4A16,
+                 tp_size=2,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_weight_only_int8_tp2(self):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=2,
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_less_device_memory(45000)
+    def test_pp_reduce_scatter_tp2pp2(self):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=2,
+                 pp_size=2,
+                 extra_build_args=[
+                     "--gemm_plugin=auto", "--pp_reduce_scatter=enable"
+                 ])
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_device_memory(180000)
+    def test_fp4_plugin(self):
+        build_args = [
+            "--max_input_len=2048", "--gemm_plugin=nvfp4",
+            "--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
+        ]
+        self.run(tasks=[MMLU(self.MODEL_NAME)],
+                 quant_algo=QuantAlgo.NVFP4,
+                 kv_cache_quant_algo=QuantAlgo.FP8,
+                 extra_build_args=build_args)
+
     @skip_pre_blackwell
     def test_nvfp4_prequantized(self, mocker):
         mocker.patch.object(
@@ -845,6 +957,45 @@ def test_nvfp4_prequantized(self, mocker):
                  kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+class TestMixtral8x22B(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mixtral-8x22B-v0.1"
+    MODEL_PATH = f"{llm_models_root()}/Mixtral-8x22B-v0.1"
+    EXAMPLE_FOLDER = "models/core/llama"
+
+    @skip_pre_ada
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_fp8_tp2pp2(self):
+        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
+                        MMLU(self.MODEL_NAME)],
+                 quant_algo=QuantAlgo.FP8,
+                 tp_size=2,
+                 pp_size=2,
+                 extra_convert_args=["--calib_size=32"],
+                 extra_build_args=["--gemm_plugin=auto"])
+
+    @skip_post_blackwell
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_device_memory(45000)
+    @pytest.mark.parametrize(
+        "moe_tp_size", [1, 4, 8],
+        ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
+    @pytest.mark.parametrize("moe_renorm_mode", [0, 1],
+                             ids=['no_renormalize', 'renormalize'])
+    def test_int8_plugin_tp8(self, moe_tp_size, moe_renorm_mode):
+        self.run(quant_algo=QuantAlgo.W8A16,
+                 tp_size=8,
+                 extra_convert_args=[
+                     f"--moe_tp_size={moe_tp_size}",
+                     f"--moe_ep_size={8 // moe_tp_size}",
+                     f"--moe_renorm_mode={moe_renorm_mode}"
+                 ],
+                 extra_build_args=[
+                     "--max_beam_width=4", "--gemm_plugin=auto",
+                     "--moe_plugin=auto", f"--max_seq_len={8192}"
+                 ])
+
+
 class TestGemma2B(CliFlowAccuracyTestHarness):
     MODEL_NAME = "google/gemma-2b"
     MODEL_PATH = f"{llm_models_root()}/gemma/gemma-2b"