@@ -371,126 +371,200 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         task.evaluate(llm)

     @pytest.mark.skip_device_not_contain(["H100"])
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [None, 2])
-    def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
+    def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler):
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
             mtp_config = None
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)

     @pytest.mark.skip_less_device(4)
     @pytest.mark.skip_device_not_contain(["H100"])
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (False, True, True, True), (True, True, True, True)])
     @parametrize_with_ids("mtp_nextn", [None, 2])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
                                                           (2, 2, 1), (1, 4, 1)],
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
     def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
-                                    attention_dp, cuda_graph,
+                                    fp8kv, attention_dp, cuda_graph,
                                     overlap_scheduler):
         # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
             mtp_config = None
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)

     @skip_pre_blackwell
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
-    def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
+    def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)

     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
-                          [(False, False, False), (True, False, False),
-                           (False, True, False), (False, False, True),
-                           (True, True, True)])
+    @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False, False),
+                           (True, False, False, False),
+                           (False, True, False, False),
+                           (False, False, True, False),
+                           (False, False, False, True),
+                           (True, True, True, True)])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
                                                           (2, 2, 1), (1, 4, 1)],
                              ids=["tp4", "ep4", "tp2pp2", "pp4"])
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
-                         cuda_graph, overlap_scheduler):
+    def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
+                         overlap_scheduler, tp_size, pp_size, ep_size):
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp)
+
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            if attention_dp and cuda_graph and overlap_scheduler:
+            # No need to run these tests for fp8kv
+            if not fp8kv:
+                task = CnnDailymail(self.MODEL_NAME)
+                task.evaluate(llm)
+                task = MMLU(self.MODEL_NAME)
+                task.evaluate(llm)
+            # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+            if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
                 task = GSM8K(self.MODEL_NAME)
                 task.evaluate(llm)

@@ -504,16 +578,24 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("overlap_scheduler", [False, True])
     @parametrize_with_ids("cuda_graph", [False, True])
     @parametrize_with_ids("attention_dp", [False, True])
+    @parametrize_with_ids("fp8kv", [False, True])
     @parametrize_with_ids("mtp_nextn", [None, 2])
     @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
                                                           (8, 1, 8)],
                              ids=["tp8", "tp8ep4", "tp8ep8"])
-    def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+    def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                          attention_dp, cuda_graph, overlap_scheduler):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.NVFP4
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
@@ -524,9 +606,13 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
         assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
@@ -539,17 +625,24 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
     @pytest.mark.skip_less_device(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,mtp_nextn,attention_dp,cuda_graph,overlap_scheduler,batch_size",
-        [(8, 1, 4, 3, False, True, True, 1),
-         (8, 1, 8, 0, True, True, True, 24)],
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv, attention_dp,cuda_graph,overlap_scheduler,batch_size",
+        [(8, 1, 4, 3, False, False, True, True, 1),
+         (8, 1, 8, 0, True, True, True, True, 24)],
         ids=["latency", "throughput"])
-    def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
+    def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
                             batch_size):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
         pytorch_config = PyTorchConfig(
             enable_overlap_scheduler=overlap_scheduler,
             use_cuda_graph=cuda_graph)
+
+        quant_config = QuantConfig()
+        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+            pytorch_config.kv_cache_dtype = "fp8"
+
         if mtp_nextn is not None and mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         else:
@@ -561,8 +654,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
                   moe_expert_parallel_size=ep_size,
                   kv_cache_config=kv_cache_config,
                   pytorch_backend_config=pytorch_config,
+                  quant_config=quant_config,
                   enable_attention_dp=attention_dp,
                   speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        if fp8kv:
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+
         with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
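For readers skimming the diff: every test is changed the same way, so the whole commit boils down to one configuration pattern. The sketch below restates it outside the test harness. It is not part of the commit; the import paths and the helper name build_llm are assumptions that may differ across TensorRT-LLM versions, and the constructor calls mirror only what the diff itself shows.

# Minimal sketch of the fp8kv wiring this diff threads through each test (illustration only).
# Assumption: import locations vary between TensorRT-LLM versions; adjust to your install.
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo


def build_llm(model_dir: str, fp8kv: bool) -> LLM:
    # Backend knobs exercised by the tests (overlap scheduler + CUDA graphs).
    pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
                                   use_cuda_graph=True)
    # Weights use FP8 block scales; when fp8kv is set, the KV cache is also
    # quantized to FP8 and the backend is told to allocate it as fp8.
    quant_config = QuantConfig()
    quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
        pytorch_config.kv_cache_dtype = "fp8"
    return LLM(model_dir,
               pytorch_backend_config=pytorch_config,
               quant_config=quant_config)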