@@ -2883,8 +2883,8 @@ def test_auto_dtype_long_rope(self):
 
 
 @skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
+@pytest.mark.skip_less_device_memory(80000)
+class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
@@ -2902,6 +2902,7 @@ class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
@@ -2912,7 +2913,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -3000,52 +3001,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
-
-@skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_20B(LlmapiAccuracyTestHarness):
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-    extra_evaluator_kwargs = {
-        "fewshot_as_multiturn": True,
-        "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
-    }
-
-    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
-
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
-    @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
-        (True, True),
-    ])
-    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
-
-        pytorch_config = dict(
-            disable_overlap_scheduler=not overlap_scheduler,
-            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
-
-        llm = LLM(self.MODEL_PATH,
-                  tensor_parallel_size=1,
-                  pipeline_parallel_size=1,
-                  moe_expert_parallel_size=1,
-                  kv_cache_config=self.kv_cache_config,
-                  max_seq_len=8192,
-                  **pytorch_config,
-                  moe_config=MoeConfig(backend=moe_backend))
-
-        with llm:
-            model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            task = GSM8K(model_name)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
-
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize(
         "moe_backend",
@@ -3059,8 +3014,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             (2, 1, 2, True, True, True),
         ],
         ids=["tp2", "ep2", "dp2"])
-    def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
+    def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -3069,7 +3025,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
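
Both surviving tests build the LLM with the same PyTorch-backend options and differ only in parallelism sizes and MoE backend. The sketch below shows that call pattern outside the pytest harness. Only the model directory name, max_seq_len, and the free_gpu_memory_fraction value come from the hunks above; the import locations and the placeholder model path are assumptions, since the module's import block is not part of this diff.

# Minimal sketch (not part of the diff) of the LLM construction pattern that
# TestGPTOSS.test_w4_1gpu / test_w4_2gpus exercise.
# Assumption: CudaGraphConfig, KvCacheConfig and MoeConfig are importable from
# tensorrt_llm.llmapi; adjust the imports if your version exposes them elsewhere.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

# Hypothetical local path; the tests resolve it via llm_models_root() instead.
model_path = "/models/gpt_oss/gpt-oss-20b"

llm = LLM(model_path,
          tensor_parallel_size=1,
          pipeline_parallel_size=1,
          moe_expert_parallel_size=1,
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
          max_seq_len=8192,
          disable_overlap_scheduler=False,
          cuda_graph_config=CudaGraphConfig(),
          moe_config=MoeConfig(backend="CUTLASS"))

with llm:
    # The tests score GSM8K through the accuracy harness; a single generate()
    # call is enough to confirm the engine comes up with these settings.
    print(llm.generate("What is 12 * 7?").outputs[0].text)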