@@ -2699,22 +2699,21 @@ def test_auto_dtype_long_rope(self):
 
 class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+    extra_evaluator_kwargs = {
+        "fewshot_as_multiturn": True,
+        "apply_chat_template": True,
+        "scores_filter": "exact_match,flexible-extract",
+        "MAX_OUTPUT_LEN": 8192
+    }
 
     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
 
-    def update_task_kwargs(self, task):
-        task.EVALUATOR_KWARGS["fewshot_as_multiturn"] = True
-        task.EVALUATOR_KWARGS["apply_chat_template"] = True
-        task.EVALUATE_KWARGS["scores_filter"] = "exact_match,flexible-extract"
-        task.MAX_OUTPUT_LEN = 8192
-        return task
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"],
                              ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
         (True, True),
     ])
-    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
+    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
         if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
 
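Note: the hunk above swaps the imperative `update_task_kwargs` helper for a declarative class-level `extra_evaluator_kwargs` dict handed directly to `task.evaluate(...)`. The harness's `evaluate` implementation is outside this diff; below is a minimal sketch of the merge semantics such a keyword argument suggests (the class name, defaults, and return value here are hypothetical, not the real harness code):

```python
# Hypothetical sketch only: the real evaluate() lives in the accuracy-test
# harness and is not shown in this diff.
class Task:
    EVALUATOR_KWARGS = {"fewshot_as_multiturn": False}

    def evaluate(self, llm, extra_evaluator_kwargs=None):
        # Per-call overrides take precedence over the class-level defaults.
        merged = {**self.EVALUATOR_KWARGS, **(extra_evaluator_kwargs or {})}
        return merged


print(Task().evaluate(llm=None,
                      extra_evaluator_kwargs={"fewshot_as_multiturn": True}))
# -> {'fewshot_as_multiturn': True}
```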
@@ -2732,9 +2731,10 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
 
         with llm:
             model_name = "GPT-OSS/MXFP4"
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
             task = GSM8K(model_name)
-            task = self.update_task_kwargs(task)
-            task.evaluate(llm)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM", "TRITON"])
@@ -2746,7 +2746,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler):
         ],
         ids=["tp4", "ep4", "dp4"])
     def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
-                      attention_dp, cuda_graph, overlap_scheduler):
+                      attention_dp, cuda_graph, overlap_scheduler, mocker):
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
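Note on the `mocker` fixture threaded into these test signatures: `mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)` takes the attribute name as a string followed by the replacement value, and pytest-mock restores the original attribute at test teardown, so the override cannot leak between parametrized runs. A self-contained sketch of the pattern (the stub class and its default value are invented for illustration):

```python
class GSM8K:
    MAX_OUTPUT_LEN = 2048  # stand-in default; the real value is defined elsewhere


def test_patch_max_output_len(mocker):
    # mocker is the pytest-mock fixture; the patch lives only for this test.
    mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
    assert GSM8K.MAX_OUTPUT_LEN == 8192
    # After teardown, GSM8K.MAX_OUTPUT_LEN is 2048 again.
```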
@@ -2767,8 +2767,9 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         with llm:
             model_name = "GPT-OSS/MXFP4"
             task = GSM8K(model_name)
-            task = self.update_task_kwargs(task)
-            task.evaluate(llm)
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(4)
     @pytest.mark.parametrize(
@@ -2777,7 +2778,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
         ],
         ids=["dp4"])
     def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
-                   overlap_scheduler, monkeypatch):
+                   overlap_scheduler, monkeypatch, mocker):
         if not IS_TRITON_KERNELS_AVAILABLE:
             pytest.skip("Triton kernels are not available")
         monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
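`test_w4a16` relies on two auto-reverting pytest fixtures: `monkeypatch` for the environment variable and `mocker` for the class attribute. A standalone sketch of the `monkeypatch.setenv` behavior (the test name is illustrative):

```python
import os


def test_env_override(monkeypatch):
    # setenv applies only inside this test; monkeypatch undoes it at teardown.
    monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
    assert os.environ["OVERRIDE_QUANT_ALGO"] == "W4A16_MXFP4"
```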
@@ -2797,8 +2798,9 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         with llm:
             model_name = "GPT-OSS/BF16"
             task = GSM8K(model_name)
-            task = self.update_task_kwargs(task)
-            task.evaluate(llm)
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
 
 class TestEXAONE4(LlmapiAccuracyTestHarness):