Skip to content

Commit bc6d5c5

Browse files
xinhe-nv authored and Wong4j committed
[TRTLLM-6642][feat] add gptoss 20g tests (NVIDIA#7361)
Signed-off-by: Xin He (SW-GPU) <[email protected]>
1 parent 716972e commit bc6d5c5

File tree

5 files changed

+93
-10
lines changed

5 files changed

+93
-10
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2883,7 +2883,7 @@ def test_auto_dtype_long_rope(self):
28832883

28842884

28852885
@skip_pre_hopper
2886-
@pytest.mark.skip_less_device_memory(100000)
2886+
@pytest.mark.skip_less_device_memory(80000)
28872887
class TestGPTOSS(LlmapiAccuracyTestHarness):
28882888
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
28892889
extra_evaluator_kwargs = {
@@ -2902,6 +2902,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
29022902
(True, True),
29032903
])
29042904
def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
2905+
MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
29052906
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
29062907
mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
29072908
{"scores_filter": "exact_match,flexible-extract"})
@@ -2912,7 +2913,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
29122913
disable_overlap_scheduler=not overlap_scheduler,
29132914
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
29142915

2915-
llm = LLM(self.MODEL_PATH,
2916+
llm = LLM(MODEL_PATH,
29162917
tensor_parallel_size=1,
29172918
pipeline_parallel_size=1,
29182919
moe_expert_parallel_size=1,
@@ -3000,6 +3001,47 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
30003001
task.evaluate(llm,
30013002
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
30023003

3004+
@pytest.mark.skip_less_device(2)
3005+
@pytest.mark.parametrize(
3006+
"moe_backend",
3007+
["CUTLASS",
3008+
pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
3009+
ids=["cutlass", "trtllm", "triton"])
3010+
@pytest.mark.parametrize(
3011+
"tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
3012+
(2, 1, 1, False, True, True),
3013+
(2, 1, 2, False, True, True),
3014+
(2, 1, 2, True, True, True),
3015+
],
3016+
ids=["tp2", "ep2", "dp2"])
3017+
def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
3018+
attention_dp, cuda_graph, overlap_scheduler, mocker):
3019+
MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
3020+
if moe_backend == "TRITON":
3021+
if not IS_TRITON_KERNELS_AVAILABLE:
3022+
pytest.skip("Triton kernels are not available")
3023+
3024+
pytorch_config = dict(
3025+
disable_overlap_scheduler=not overlap_scheduler,
3026+
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
3027+
3028+
llm = LLM(MODEL_PATH,
3029+
tensor_parallel_size=tp_size,
3030+
pipeline_parallel_size=pp_size,
3031+
moe_expert_parallel_size=ep_size,
3032+
kv_cache_config=self.kv_cache_config,
3033+
max_seq_len=8192,
3034+
**pytorch_config,
3035+
enable_attention_dp=attention_dp,
3036+
moe_config=MoeConfig(backend=moe_backend))
3037+
3038+
with llm:
3039+
model_name = "GPT-OSS/MXFP4"
3040+
task = GSM8K(model_name)
3041+
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
3042+
task.evaluate(llm,
3043+
extra_evaluator_kwargs=self.extra_evaluator_kwargs)
3044+
30033045

30043046
class TestEXAONE4(LlmapiAccuracyTestHarness):
30053047
MODEL_NAME = "LGAI-EXAONE/EXAONE-4.0-32B"

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -510,17 +510,26 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
510510
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
511511
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
512512
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
513-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
514513
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
515-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
516-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
517-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
518-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
519-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
520-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
514+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
515+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
516+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
517+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
518+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
519+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
520+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
521+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
522+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
523+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
521524
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
522-
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
523525
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
526+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
527+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
528+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
529+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
530+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
531+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
532+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
524533
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
525534
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
526535
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
4848
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
4949
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
5050
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
51+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
52+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
53+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
54+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
55+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
56+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
57+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
58+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
59+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
5160
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
5261
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
5362
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,28 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
123123
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
124124
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
125125
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
126+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
127+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
128+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
129+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
130+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
131+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
132+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
133+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
134+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
135+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
136+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
137+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
138+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
139+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
140+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
141+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
142+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
143+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
144+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
145+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
146+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
147+
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
126148
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
127149
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
128150
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@ l0_dgx_b200:
120120
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
121121
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
122122
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
123+
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
123124
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
124125
- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
125126
# ------------- AutoDeploy tests ---------------

0 commit comments

Comments (0)