@@ -2718,6 +2718,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
2718
2718
])
2719
2719
def test_w4_1gpu (self , moe_backend , cuda_graph , overlap_scheduler , mocker ):
2720
2720
pytest .skip ("https://nvbugs/5481087" )
2721
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2722
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2723
+ {"scores_filter" : "exact_match,flexible-extract" })
2721
2724
if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE :
2722
2725
pytest .skip ("Triton kernels are not available" )
2723
2726
@@ -2735,9 +2738,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2735
2738
2736
2739
with llm :
2737
2740
model_name = "GPT-OSS/MXFP4"
2738
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2739
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2740
- {"scores_filter" : "exact_match,flexible-extract" })
2741
2741
task = GSM8K (model_name )
2742
2742
task .evaluate (llm ,
2743
2743
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2757,7 +2757,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2757
2757
ids = ["tp4" , "ep4" , "dp4" ])
2758
2758
def test_w4_4gpus (self , moe_backend , tp_size , pp_size , ep_size ,
2759
2759
attention_dp , cuda_graph , overlap_scheduler , mocker ):
2760
- pytest .skip ("https://nvbugs/5481087" )
2760
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2761
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2762
+ {"scores_filter" : "exact_match,flexible-extract" })
2761
2763
if moe_backend == "TRITON" :
2762
2764
if not IS_TRITON_KERNELS_AVAILABLE :
2763
2765
pytest .skip ("Triton kernels are not available" )
@@ -2778,9 +2780,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2778
2780
with llm :
2779
2781
model_name = "GPT-OSS/MXFP4"
2780
2782
task = GSM8K (model_name )
2781
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2782
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2783
- {"scores_filter" : "exact_match,flexible-extract" })
2784
2783
task .evaluate (llm ,
2785
2784
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2786
2785
@@ -2792,6 +2791,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2792
2791
ids = ["dp4" ])
2793
2792
def test_w4a16 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
2794
2793
overlap_scheduler , monkeypatch , mocker ):
2794
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2795
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2796
+ {"scores_filter" : "exact_match,flexible-extract" })
2795
2797
if not IS_TRITON_KERNELS_AVAILABLE :
2796
2798
pytest .skip ("Triton kernels are not available" )
2797
2799
monkeypatch .setenv ("OVERRIDE_QUANT_ALGO" , "W4A16_MXFP4" )
@@ -2811,9 +2813,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
2811
2813
with llm :
2812
2814
model_name = "GPT-OSS/BF16"
2813
2815
task = GSM8K (model_name )
2814
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2815
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2816
- {"scores_filter" : "exact_match,flexible-extract" })
2817
2816
task .evaluate (llm ,
2818
2817
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2819
2818
0 commit comments