@@ -2704,8 +2704,6 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
2704
2704
extra_evaluator_kwargs = {
2705
2705
"fewshot_as_multiturn" : True ,
2706
2706
"apply_chat_template" : True ,
2707
- "scores_filter" : "exact_match,flexible-extract" ,
2708
- "MAX_OUTPUT_LEN" : 8192
2709
2707
}
2710
2708
2711
2709
MODEL_PATH = f"{ llm_models_root ()} /gpt_oss/gpt-oss-120b"
@@ -2736,7 +2734,8 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2736
2734
2737
2735
with llm :
2738
2736
model_name = "GPT-OSS/MXFP4"
2739
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" : 8192 )
2737
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2738
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS , {"scores_filter" : "exact_match,flexible-extract" })
2740
2739
task = GSM8K (model_name )
2741
2740
task .evaluate (llm ,
2742
2741
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
@@ -2776,7 +2775,8 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
2776
2775
with llm :
2777
2776
model_name = "GPT-OSS/MXFP4"
2778
2777
task = GSM8K (model_name )
2779
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" : 8192 )
2778
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2779
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS , {"scores_filter" : "exact_match,flexible-extract" })
2780
2780
task .evaluate (llm ,
2781
2781
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2782
2782
@@ -2807,7 +2807,8 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
2807
2807
with llm :
2808
2808
model_name = "GPT-OSS/BF16"
2809
2809
task = GSM8K (model_name )
2810
- mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" : 8192 )
2810
+ mocker .patch .object (GSM8K , "MAX_OUTPUT_LEN" , 8192 )
2811
+ mocker .patch .dict (GSM8K .EVALUATE_KWARGS , {"scores_filter" : "exact_match,flexible-extract" })
2811
2812
task .evaluate (llm ,
2812
2813
extra_evaluator_kwargs = self .extra_evaluator_kwargs )
2813
2814
0 commit comments