@@ -2858,8 +2858,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
         "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
     }

     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
@@ -2873,7 +2871,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")

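The hunk above, and the matching hunks for the other tests below, move the GSM8K patches from inside the `with llm:` block to the top of the test body, ahead of the skip checks. A minimal runnable sketch of the pytest-mock pattern this relies on (assumes pytest and pytest-mock are installed; the GSM8K below is a stand-in class for illustration, not the real evaluator):

    # Sketch only: GSM8K here is a stand-in, not the real evaluator class.
    class GSM8K:
        MAX_OUTPUT_LEN = 256
        EVALUATE_KWARGS = {"apply_chat_template": True}

    def test_patch_scope(mocker):
        # Both patch styles used in the diff: swap a class attribute and
        # overlay extra keys onto a class-level dict.
        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                          {"scores_filter": "exact_match,flexible-extract"})
        assert GSM8K.MAX_OUTPUT_LEN == 8192
        assert "scores_filter" in GSM8K.EVALUATE_KWARGS

    def test_patch_reverted():
        # Runs after test_patch_scope: the mocker fixture rolled both
        # patches back at teardown, so no state leaks between tests.
        assert GSM8K.MAX_OUTPUT_LEN == 256
        assert "scores_filter" not in GSM8K.EVALUATE_KWARGS

Because the mocker fixture undoes every patch at teardown, the patches are safe to apply before any skip branch and hold for the whole test, so they no longer need to sit inside the `with llm:` block.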
@@ -2891,7 +2891,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):

         with llm:
             model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task = GSM8K(model_name)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
@@ -2911,7 +2910,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
                              ids=["tp4", "ep4", "dp4"])
     def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
-        pytest.skip("https://nvbugs/5481087")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -2932,7 +2933,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         with llm:
             model_name = "GPT-OSS/MXFP4"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)

@@ -2944,6 +2944,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
                              ids=["dp4"])
     def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, monkeypatch, mocker):
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
         if not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
@@ -2963,7 +2966,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         with llm:
             model_name = "GPT-OSS/BF16"
             task = GSM8K(model_name)
-            mocker.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192})
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)

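One detail worth noting in the final hunk: the removed line passed a dict where `unittest.mock.patch.object` expects the attribute name as a string (the replacement value goes in the third argument), so that patch could never start. A short illustration with a stand-in class:

    # Sketch only: GSM8K here is a stand-in class for illustration.
    from unittest import mock

    class GSM8K:
        MAX_OUTPUT_LEN = 256

    # Correct form: attribute name as a string, replacement as the third arg.
    with mock.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192):
        assert GSM8K.MAX_OUTPUT_LEN == 8192

    # Removed form: a dict is not a valid attribute name, so entering the
    # patch raises TypeError.
    try:
        with mock.patch.object(GSM8K, {"MAX_OUTPUT_LEN": 8192}):
            pass
    except TypeError:
        pass  # expected: attribute name must be a string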