@@ -2822,8 +2822,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
         "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
     }
 
     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
@@ -2837,7 +2835,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
 
@@ -2855,7 +2855,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -2875,7 +2874,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
                              ids=["tp4", "ep4", "dp4"])
     def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -2896,7 +2897,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         with llm:
             model_name = "GPT-OSS/MXFP4"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
@@ -2908,6 +2908,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
                              ids=["dp4"])
     def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, monkeypatch, mocker):
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
@@ -2927,7 +2930,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         with llm:
             model_name = "GPT-OSS/BF16"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
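For context on the mechanics of this change: `mocker.patch.object` replaces `GSM8K.MAX_OUTPUT_LEN` for the duration of a single test, and `mocker.patch.dict` overlays `scores_filter` onto `GSM8K.EVALUATE_KWARGS`; pytest-mock reverts both at teardown, which is why these values can move out of the shared `extra_evaluator_kwargs` dict and be set per test. A minimal, self-contained sketch of that behavior, assuming pytest with the pytest-mock plugin and using a hypothetical stand-in for the real GSM8K class:

# sketch_mocker_patching.py -- illustration only; the GSM8K stand-in and its
# default values are invented, not taken from the PR. Requires pytest-mock,
# which supplies the `mocker` fixture (no import needed in the test file).


class GSM8K:
    # Stand-in for the real evaluator class; the real defaults may differ.
    MAX_OUTPUT_LEN = 256
    EVALUATE_KWARGS = {"apply_chat_template": True}


def test_patches_apply_within_the_test(mocker):
    # Override the class attribute and overlay an extra dict entry.
    mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
    mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                      {"scores_filter": "exact_match,flexible-extract"})
    assert GSM8K.MAX_OUTPUT_LEN == 8192
    assert GSM8K.EVALUATE_KWARGS["scores_filter"] == "exact_match,flexible-extract"


def test_defaults_restored_afterwards():
    # pytest-mock undoes both patches at teardown, so nothing leaks
    # into the other tests in the module.
    assert GSM8K.MAX_OUTPUT_LEN == 256
    assert "scores_filter" not in GSM8K.EVALUATE_KWARGS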