
Commit d2e57ba

add gptoss 20g tests
Signed-off-by: Xin He (SW-GPU) <[email protected]>
Parent: 1b5d33d

File tree: 5 files changed, +50 -68 lines


tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 7 additions & 51 deletions
@@ -2883,8 +2883,8 @@ def test_auto_dtype_long_rope(self):


 @skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
+@pytest.mark.skip_less_device_memory(80000)
+class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,

@@ -2902,6 +2902,7 @@ class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})

@@ -2912,7 +2913,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,

@@ -3000,52 +3001,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task.evaluate(llm,
                       extra_evaluator_kwargs=self.extra_evaluator_kwargs)

-
-@skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_20B(LlmapiAccuracyTestHarness):
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-    extra_evaluator_kwargs = {
-        "fewshot_as_multiturn": True,
-        "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
-    }
-
-    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
-
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
-    @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
-        (True, True),
-    ])
-    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
-
-        pytorch_config = dict(
-            disable_overlap_scheduler=not overlap_scheduler,
-            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
-
-        llm = LLM(self.MODEL_PATH,
-                  tensor_parallel_size=1,
-                  pipeline_parallel_size=1,
-                  moe_expert_parallel_size=1,
-                  kv_cache_config=self.kv_cache_config,
-                  max_seq_len=8192,
-                  **pytorch_config,
-                  moe_config=MoeConfig(backend=moe_backend))
-
-        with llm:
-            model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            task = GSM8K(model_name)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
-
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize(
         "moe_backend",

@@ -3059,8 +3014,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
         (2, 1, 2, True, True, True),
     ],
                              ids=["tp2", "ep2", "dp2"])
-    def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
+    def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")

@@ -3069,7 +3025,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
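Aside: the bracketed ids used by the test lists below (for example test_w4_1gpu[True-True-cutlass]) come straight from the stacked parametrize decorators in this file. A minimal, self-contained sketch under plain pytest reproduces the id scheme; the skip mark here is a placeholder assumption standing in for the repo's skip_pre_blackwell helper:

```python
# sketch.py -- minimal repro of the test-id scheme; not the repo's harness.
import pytest

# Placeholder assumption for the repo's skip_pre_blackwell marker.
skip_pre_blackwell = pytest.mark.skip(reason="requires Blackwell")


@pytest.mark.parametrize(
    "moe_backend",
    ["CUTLASS",
     pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
    ids=["cutlass", "trtllm", "triton"])
@pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
    (True, True),
])
def test_w4_1gpu(moe_backend, cuda_graph, overlap_scheduler):
    # pytest applies the decorator closest to the function first, so its
    # values lead the id: `pytest --collect-only -q sketch.py` prints
    # sketch.py::test_w4_1gpu[True-True-cutlass], [True-True-trtllm], ...
    assert moe_backend in {"CUTLASS", "TRTLLM", "TRITON"}
```

That ordering is why the backend id is always the last bracket component in the list entries below.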

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 25 additions & 0 deletions
@@ -86,6 +86,7 @@ accuracy/test_cli_flow.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
 accuracy/test_cli_flow.py::TestNemotronMini4BInstruct::test_fp8_prequantized
 accuracy/test_cli_flow.py::TestNemotronUltra::test_auto_dtype[tp8-cuda_graph=True] TIMEOUT (240)
 accuracy/test_cli_flow.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+<<<<<<< HEAD
 accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
 accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
 accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype

@@ -148,6 +149,30 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS_20B::test_w4_2gpus[ep2-triton]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS_20B::test_w4_2gpus[dp2-cutlass]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS_20B::test_w4_2gpus[dp2-trtllm]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS_20B::test_w4_2gpus[dp2-triton]
+=======
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
+>>>>>>> b36b4a539 (add gpt-oss 20g tests)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
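Note that this hunk commits literal Git conflict markers (<<<<<<< HEAD, =======, >>>>>>> b36b4a539) into llm_function_nim.txt, leaving both the TestGPTOSS_20B and the TestGPTOSS spellings in the list. A hedged sketch of a checker that would catch such leftovers, a hypothetical helper rather than anything this repo is known to ship:

```python
# check_conflicts.py -- hypothetical helper, not part of this commit: scan a
# test-list file for merge-conflict markers committed by mistake.
from pathlib import Path

MARKERS = ("<<<<<<<", "=======", ">>>>>>>")


def find_conflict_markers(path: Path) -> list[tuple[int, str]]:
    """Return (line_number, line) for every line starting with a marker."""
    return [(n, line)
            for n, line in enumerate(path.read_text().splitlines(), start=1)
            if line.startswith(MARKERS)]


if __name__ == "__main__":
    hits = find_conflict_markers(
        Path("tests/integration/test_lists/qa/llm_function_nim.txt"))
    for n, line in hits:
        print(f"line {n}: {line}")
    raise SystemExit(1 if hits else 0)
```

The non-zero exit code makes it usable as a pre-commit or CI gate.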

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 10 additions & 10 deletions
@@ -41,11 +41,11 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[tp4-trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[ep4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[ep4-triton]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[dp4-trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4a16[dp4]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]

@@ -117,11 +117,11 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
   - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[tp4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[tp4-triton]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[ep4-trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[dp4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[dp4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   # ------------- AutoDeploy tests ---------------
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 7 additions & 7 deletions
@@ -155,13 +155,13 @@ l0_dgx_h100:
       backend: pytorch
       auto_trigger: gpt_oss
   tests:
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[tp4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[tp4-triton]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[ep4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[ep4-triton]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[dp4-cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4_4gpus[dp4-triton]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS_120B::test_w4a16[dp4]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 - condition:
     ranges:
       system_gpu_count:
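For context, these test-db stage files pair a condition block (hardware ranges, backend, trigger) with a tests list. A hedged sketch of how one might pull every TestGPTOSS entry out of such a file; the structure walked here is assumed from the hunks above, and the script is illustrative rather than the repo's test-db tooling:

```python
# collect_gptoss.py -- illustrative reader, not the repo's test-db tooling.
# Assumes the stage file maps a stage name (e.g. "l0_dgx_h100") to a list of
# blocks, some of which carry a "tests" list, as the hunks above suggest.
import yaml  # PyYAML


def gptoss_tests(path: str) -> list[str]:
    with open(path) as f:
        db = yaml.safe_load(f)
    found = []
    for blocks in db.values():
        for block in blocks:
            for test in block.get("tests", []):
                if "TestGPTOSS" in test:
                    found.append(test)
    return found


if __name__ == "__main__":
    for test in gptoss_tests(
            "tests/integration/test_lists/test-db/l0_dgx_h100.yml"):
        print(test)
```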

tests/integration/test_lists/waives.txt

Lines changed: 1 addition & 0 deletions
@@ -330,6 +330,7 @@ accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype SKIP (https://nvbug
 accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8 SKIP (https://nvbugs/5465143)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/5471106)
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype SKIP (https://nvbugs/5481090)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass] SKIP (https://nvbugs/5481080)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-False] SKIP (https://nvbugs/5481094)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Maverick-17B-128E-Instruct-FP8-llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8-True] SKIP (https://nvbugs/5481094)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True] SKIP (https://nvbugs/5481094)
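Each waiver above is one line: a full pytest node id, the keyword SKIP, and a parenthesized bug link. A hedged sketch of a parser for that shape; the regex and helper are illustrative assumptions, not the repo's actual waiver loader:

```python
# parse_waives.py -- illustrative parser for "<node id> SKIP (<reason>)" lines;
# the format is read off the entries above, the helper itself is an assumption.
import re
from typing import Optional

WAIVE_RE = re.compile(r"^(?P<test_id>\S+)\s+SKIP\s+\((?P<reason>[^)]+)\)$")


def parse_waive(line: str) -> Optional[dict]:
    """Split a waives.txt entry into its node id and bug link, else None."""
    match = WAIVE_RE.match(line.strip())
    return match.groupdict() if match else None


entry = ("accuracy/test_llm_api_pytorch.py::TestGPTOSS::"
         "test_w4_1gpu[True-True-cutlass] SKIP (https://nvbugs/5481080)")
print(parse_waive(entry))
# {'test_id': '...test_w4_1gpu[True-True-cutlass]',
#  'reason': 'https://nvbugs/5481080'}
```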
