@@ -2416,7 +2416,8 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             (8, 1, 8, True, True, True, "TRTLLM", False),
         ],
         ids=[
-            "latency_moe_cutlass", "latency_moe_trtllm",
+            "latency_moe_cutlass",
+            "latency_moe_trtllm",
         ],
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
@@ -2456,14 +2457,15 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend,eagle3",
         [
-            (4, 1, 4, False, False, False, "TRTLLM", True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
+            (4, 1, 4, False, False, False, "TRTLLM",
+             True),  # TP8 has bug when we use TRTLLM moe backend and eagle3
         ],
         ids=[
             "latency_moe_trtllm_eagle3",
         ],
     )
-    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                         overlap_scheduler, moe_backend, eagle3):
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler, moe_backend, eagle3):

         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2494,6 +2496,7 @@ def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

+
 class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
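
For reference, the one-id-per-line layout introduced in the first hunk keeps each entry of ids visually paired with its parameter tuple. Below is a minimal, standalone sketch of that pytest.mark.parametrize pattern; the test name, parameter names, and values are illustrative placeholders, not taken from this file.

import pytest


@pytest.mark.parametrize(
    "tp_size,moe_backend",
    [
        (8, "CUTLASS"),
        (8, "TRTLLM"),
    ],
    ids=[
        "latency_moe_cutlass",
        "latency_moe_trtllm",
    ],
)
def test_moe_backend_sketch(tp_size, moe_backend):
    # Each tuple above yields one test case; the matching entry in `ids`
    # names the generated node, e.g. test_moe_backend_sketch[latency_moe_trtllm].
    assert tp_size > 0
    assert moe_backend in ("CUTLASS", "TRTLLM")

The ids appear verbatim in the collected node names, which is what lets an id such as latency_moe_trtllm_eagle3 be selected directly with pytest -k.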