@@ -2866,8 +2866,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
2866
2866
extra_evaluator_kwargs = {
2867
2867
"fewshot_as_multiturn" : True ,
2868
2868
"apply_chat_template" : True ,
2869
- "scores_filter" : "exact_match,flexible-extract" ,
2870
- "MAX_OUTPUT_LEN" : 8192
2871
2869
}
2872
2870
2873
2871
MODEL_PATH = f"{ llm_models_root ()} /gpt_oss/gpt-oss-120b"
@@ -2881,7 +2879,9 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
2881
2879
(True , True ),
2882
2880
])
2883
2881
def test_w4_1gpu (self , moe_backend , cuda_graph , overlap_scheduler , mocker ):
2884
- pytest .skip ("https://nvbugs/5481087" )
2882
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2883
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2884
+ {"scores_filter" : "exact_match,flexible-extract" })
2885
2885
if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE :
2886
2886
pytest .skip ("Triton kernels are not available" )
2887
2887
@@ -2899,7 +2899,6 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2899
2899
2900
2900
with llm :
2901
2901
model_name = "GPT-OSS/MXFP4"
2902
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2903
2902
task = GSM8K (model_name )
2904
2903
task .evaluate (llm ,
2905
2904
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2919,7 +2918,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2919
2918
ids = ["tp4" , "ep4" , "dp4" ])
2920
2919
def test_w4_4gpus (self , moe_backend , tp_size , pp_size , ep_size ,
2921
2920
attention_dp , cuda_graph , overlap_scheduler , mocker ):
2922
- pytest .skip ("https://nvbugs/5481087" )
2921
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2922
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2923
+ {"scores_filter" : "exact_match,flexible-extract" })
2923
2924
if moe_backend == "TRITON" :
2924
2925
if not IS_TRITON_KERNELS_AVAILABLE :
2925
2926
pytest .skip ("Triton kernels are not available" )
@@ -2940,7 +2941,6 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2940
2941
with llm :
2941
2942
model_name = "GPT-OSS/MXFP4"
2942
2943
task = GSM8K (model_name )
2943
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2944
2944
task .evaluate (llm ,
2945
2945
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2946
2946
@@ -2952,6 +2952,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2952
2952
ids = ["dp4" ])
2953
2953
def test_w4a16 (self , tp_size , pp_size , ep_size , attention_dp , cuda_graph ,
2954
2954
overlap_scheduler , monkeypatch , mocker ):
2955
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2956
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS ,
2957
+ {"scores_filter" : "exact_match,flexible-extract" })
2955
2958
if not IS_TRITON_KERNELS_AVAILABLE :
2956
2959
pytest .skip ("Triton kernels are not available" )
2957
2960
monkeypatch .setenv ("OVERRIDE_QUANT_ALGO" , "W4A16_MXFP4" )
@@ -2971,7 +2974,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
2971
2974
with llm :
2972
2975
model_name = "GPT-OSS/BF16"
2973
2976
task = GSM8K (model_name )
2974
- mocker .patch .object (GSM8K , {"MAX_OUTPUT_LEN" : 8192 })
2975
2977
task .evaluate (llm ,
2976
2978
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2977
2979
0 commit comments