@@ -2835,8 +2835,8 @@ def test_auto_dtype_long_rope(self):
 
 
 @skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
+@pytest.mark.skip_less_device_memory(80000)
+class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
@@ -2845,16 +2845,16 @@ class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
 
     MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-120b"
 
-    @pytest.mark.parametrize("moe_backend", [
-        "CUTLASS",
-        pytest.param("TRTLLM", marks=skip_pre_blackwell),
-        pytest.param("TRITON", marks=pytest.mark.install_triton)
-    ],
-                             ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
@@ -2865,7 +2865,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -2880,12 +2880,11 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(4)
-    @pytest.mark.parametrize("moe_backend", [
-        "CUTLASS",
-        pytest.param("TRTLLM", marks=skip_pre_blackwell),
-        pytest.param("TRITON", marks=pytest.mark.install_triton)
-    ],
-                             ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
             (4, 1, 1, False, True, True),
@@ -2954,52 +2953,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
-
-@skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_20B(LlmapiAccuracyTestHarness):
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-    extra_evaluator_kwargs = {
-        "fewshot_as_multiturn": True,
-        "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
-    }
-
-    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
-
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
-    @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
-        (True, True),
-    ])
-    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
-
-        pytorch_config = dict(
-            disable_overlap_scheduler=not overlap_scheduler,
-            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
-
-        llm = LLM(self.MODEL_PATH,
-                  tensor_parallel_size=1,
-                  pipeline_parallel_size=1,
-                  moe_expert_parallel_size=1,
-                  kv_cache_config=self.kv_cache_config,
-                  max_seq_len=8192,
-                  **pytorch_config,
-                  moe_config=MoeConfig(backend=moe_backend))
-
-        with llm:
-            model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            task = GSM8K(model_name)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
-
 
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize(
         "moe_backend",
@@ -3013,8 +2966,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             (2, 1, 2, True, True, True),
         ],
         ids=["tp2", "ep2", "dp2"])
-    def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
+    def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -3023,7 +2977,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,