
Commit d78a50a

bobboli authored and syuoni committed
test: Add fp8kv to DS-v3-lite integration tests. (#3950)
* Add fp8 kv cache tests to DSV3-Lite integration tests.
* Refactor. Make fp8kv parallel to attention_dp, overlap_scheduler and cuda_graph.
* Update gsm8k.
* Update CI list.
* Update TestDeepSeekR1.
* Fix test list.
* Need quant_config besides pytorch_config.
* Update waive list (bug 5239087).
* Update waive list.
* Correct test name.
* Update waive list.

Signed-off-by: Bo Li <[email protected]>
Signed-off-by: Enwei Zhu <[email protected]>
Co-authored-by: Enwei Zhu <[email protected]>
Signed-off-by: Dhruv Singal <[email protected]>
1 parent 3236545 commit d78a50a
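The mechanical core of the change, repeated in every diff hunk below: each DeepSeek-V3-Lite test gains an fp8kv parameter, and when it is set the test requests an FP8-quantized KV cache through both the quant config and the PyTorch backend config. A minimal sketch of that pattern; QuantConfig, QuantAlgo, and PyTorchConfig are the names used in the diff, while the import paths are assumptions about where the test module gets them:

# A minimal sketch of the fp8kv pattern this commit threads through each test.
# The QuantConfig/QuantAlgo/PyTorchConfig usage is copied from the diff below;
# the import paths are assumptions, not shown in this commit.
from tensorrt_llm._torch.pytorch_backend_config import PyTorchConfig
from tensorrt_llm.llmapi import QuantConfig
from tensorrt_llm.quantization import QuantAlgo


def build_configs(fp8kv: bool, cuda_graph: bool, overlap_scheduler: bool):
    """Build the backend and quant configs the way the tests do."""
    pytorch_config = PyTorchConfig(
        enable_overlap_scheduler=overlap_scheduler,
        use_cuda_graph=cuda_graph)
    quant_config = QuantConfig()
    quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        # FP8 KV cache is requested in two places: the quant config and the
        # backend's kv_cache_dtype ("need quant_config besides
        # pytorch_config", per the commit message).
        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
        pytorch_config.kv_cache_dtype = "fp8"
    return pytorch_config, quant_config

With fp8kv enabled, the tests skip CNN/DailyMail and MMLU and run GSM8K only, which bounds the CI cost of the new variants.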

File tree

7 files changed, +186 -64 lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 13 additions & 0 deletions
@@ -19,11 +19,24 @@ deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4
     accuracy: 63.71
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 63.71
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 64.74
+  - quant_algo: FP8_BLOCK_SCALES
+    kv_cache_quant_algo: FP8
+    accuracy: 64.74
   - spec_dec_algo: MTP
     accuracy: 64.44
+  - spec_dec_algo: MTP
+    kv_cache_quant_algo: FP8
+    accuracy: 64.44
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 64.14
   - quant_algo: FP8_BLOCK_SCALES
+    kv_cache_quant_algo: FP8
     spec_dec_algo: MTP
     accuracy: 64.14
 deepseek-ai/DeepSeek-R1:
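Each reference entry above is keyed by the optional fields quant_algo, kv_cache_quant_algo, and spec_dec_algo, so two entries may share a quant_algo and differ only in the KV-cache field. A hypothetical lookup, not the harness's actual code, illustrating how the new kv_cache_quant_algo: FP8 entries are selected:

# Hypothetical illustration only; the accuracy harness's real lookup is not
# part of this diff. An entry matches when its three optional keys equal the
# run's configuration (absent keys count as None).
import yaml


def lookup_reference(entries, quant_algo=None, kv_cache_quant_algo=None,
                     spec_dec_algo=None):
    want = {"quant_algo": quant_algo,
            "kv_cache_quant_algo": kv_cache_quant_algo,
            "spec_dec_algo": spec_dec_algo}
    for entry in entries:
        if {key: entry.get(key) for key in want} == want:
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {want}")


with open("tests/integration/defs/accuracy/references/gsm8k.yaml") as f:
    refs = yaml.safe_load(f)
print(lookup_reference(refs["deepseek-ai/DeepSeek-V3-Lite"],
                       quant_algo="FP8_BLOCK_SCALES",
                       kv_cache_quant_algo="FP8"))  # 64.74, per the entry above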

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 144 additions & 46 deletions
@@ -371,126 +371,200 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             task.evaluate(llm)
 
     @pytest.mark.skip_device_not_contain(["H100"])
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [None, 2])
-    def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
+    def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler):
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
             mtp_config = None
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100"])
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (False, True, True, True), (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [None, 2])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
                                                          (2, 2, 1), (1, 4, 1)],
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
-                                    attention_dp, cuda_graph,
+                                    fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler):
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
             mtp_config = None
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)
 
     @skip_pre_blackwell
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
-    def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
+    def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
                                                          (2, 2, 1), (1, 4, 1)],
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
-                         cuda_graph, overlap_scheduler):
+    def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
+                         overlap_scheduler, tp_size, pp_size, ep_size):
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)

@@ -504,16 +578,24 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("overlap_scheduler", [False, True])
     @parametrize_with_ids("cuda_graph", [False, True])
     @parametrize_with_ids("attention_dp", [False, True])
+    @parametrize_with_ids("fp8kv", [False, True])
     @parametrize_with_ids("mtp_nextn", [None, 2])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
                                                          (8, 1, 8)],
                              ids=["tp8", "tp8ep4", "tp8ep8"])
-    def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+    def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                          attention_dp, cuda_graph, overlap_scheduler):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:

@@ -524,9 +606,13 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

@@ -539,17 +625,24 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
     @pytest.mark.skip_less_device(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,mtp_nextn,attention_dp,cuda_graph,overlap_scheduler,batch_size",
-        [(8, 1, 4, 3, False, True, True, 1),
-         (8, 1, 8, 0, True, True, True, 24)],
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,batch_size",
+        [(8, 1, 4, 3, False, False, True, True, 1),
+         (8, 1, 8, 0, True, True, True, True, 24)],
         ids=["latency", "throughput"])
-    def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
+    def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             batch_size):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:

@@ -561,8 +654,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
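One observation on the ID strings in the test lists that follow: with parametrize_with_ids, a boolean flag appears to contribute its name to the pytest ID when True and an empty slot when False, which is why adding fp8kv inserts either fp8kv or a bare extra - into every existing ID. A small sketch of that convention (hypothetical helper, not the repo's parametrize_with_ids implementation):

# Hypothetical sketch of the ID convention visible in the test lists below:
# True flags contribute their name, False flags an empty slot.
def flag(name: str, value: bool) -> str:
    return name if value else ""


def make_test_id(*parts: str) -> str:
    return "[" + "-".join(parts) + "]"


print(make_test_id("tp8", "mtp_nextn=2", flag("fp8kv", True),
                   flag("attention_dp", False), flag("cuda_graph", True),
                   flag("overlap_scheduler", True)))
# -> [tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]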

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 6 additions & 6 deletions
@@ -439,12 +439,12 @@ accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8---cuda_graph-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2--cuda_graph-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4---cuda_graph-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2--cuda_graph-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8---cuda_graph-overlap_scheduler]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8--fp8kv--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4--fp8kv--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8--fp8kv--cuda_graph-overlap_scheduler]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8B::test_auto_dtype[False]
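To reproduce a single entry from this QA list, pytest can be driven programmatically; a sketch assuming the working directory is tests/integration/defs, which the relative paths in the list imply:

import pytest

# Node ID copied verbatim from the QA list above.
pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus"
    "[tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]",
    "-q",
])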

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 2 additions & 1 deletion
@@ -26,10 +26,11 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv-attention_dp-cuda_graph-overlap_scheduler]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
   - test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
