@@ -2793,6 +2793,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
2793
2793
])
2794
2794
def test_w4_1gpu (self , moe_backend , cuda_graph , overlap_scheduler , mocker ):
2795
2795
pytest .skip ("https://nvbugs/5481087" )
2796
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2797
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2798
+ {"scores_filter" : "exact_match,flexible-extract" })
2796
2799
if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE :
2797
2800
pytest .skip ("Triton kernels are not available" )
2798
2801
@@ -2810,9 +2813,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2810
2813
2811
2814
with llm :
2812
2815
model_name = "GPT-OSS/MXFP4"
2813
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2814
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2815
- {"scores_filter" : "exact_match,flexible-extract" })
2816
2816
task = GSM8K (model_name )
2817
2817
task .evaluate (llm ,
2818
2818
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2832,7 +2832,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2832
2832
ids = ["tp4" , "ep4" , "dp4" ])
2833
2833
def test_w4_4gpus (self , moe_backend , tp_size , pp_size , ep_size ,
2834
2834
attention_dp , cuda_graph , overlap_scheduler , mocker ):
2835
- pytest .skip ("https://nvbugs/5481087" )
2835
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2836
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2837
+ {"scores_filter" : "exact_match,flexible-extract" })
2836
2838
if moe_backend == "TRITON" :
2837
2839
if not IS_TRITON_KERNELS_AVAILABLE :
2838
2840
pytest .skip ("Triton kernels are not available" )
@@ -2853,9 +2855,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2853
2855
with llm :
2854
2856
model_name = "GPT-OSS/MXFP4"
2855
2857
task = GSM8K (model_name )
2856
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2857
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2858
- {"scores_filter" : "exact_match,flexible-extract" })
2859
2858
task .evaluate (llm ,
2860
2859
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2861
2860
@@ -2867,6 +2866,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2867
2866
ids = ["dp4" ])
2868
2867
def test_w4a16 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
2869
2868
overlap_scheduler , monkeypatch , mocker ):
2869
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2870
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2871
+ {"scores_filter" : "exact_match,flexible-extract" })
2870
2872
if not IS_TRITON_KERNELS_AVAILABLE :
2871
2873
pytest .skip ("Triton kernels are not available" )
2872
2874
monkeypatch .setenv ("OVERRIDE_QUANT_ALGO" , "W4A16_MXFP4" )
@@ -2886,9 +2888,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
2886
2888
with llm :
2887
2889
model_name = "GPT-OSS/BF16"
2888
2890
task = GSM8K (model_name )
2889
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2890
- mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2891
- {"scores_filter" : "exact_match,flexible-extract" })
2892
2891
task .evaluate (llm ,
2893
2892
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2894
2893
0 commit comments