From 9b87837bb540947783705ad570829d2aad60fe43 Mon Sep 17 00:00:00 2001
From: "Xin He (SW-GPU)" <200704525+xinhe-nv@users.noreply.github.com>
Date: Fri, 5 Sep 2025 11:19:27 +0800
Subject: [PATCH] add gptoss 20b tests

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py  | 46 ++++++++++++++++++-
 .../test_lists/qa/llm_function_core.txt    | 25 ++++++----
 .../qa/llm_function_core_sanity.txt        |  9 ++++
 .../test_lists/qa/llm_function_nim.txt     | 22 +++++++++
 .../test_lists/test-db/l0_dgx_b200.yml     |  1 +
 5 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a8d2ccc270e..2b8190e4ad3 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2883,7 +2883,7 @@ def test_auto_dtype_long_rope(self):
 
 
 @skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
+@pytest.mark.skip_less_device_memory(80000)
 class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
     extra_evaluator_kwargs = {
@@ -2902,6 +2902,7 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
@@ -2912,7 +2913,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -3000,6 +3001,47 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
+        ids=["cutlass", "trtllm", "triton"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
+            (2, 1, 1, False, True, True),
+            (2, 1, 2, False, True, True),
+            (2, 1, 2, True, True, True),
+        ],
+        ids=["tp2", "ep2", "dp2"])
+    def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
+                      attention_dp, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
+        if moe_backend == "TRITON":
+            if not IS_TRITON_KERNELS_AVAILABLE:
+                pytest.skip("Triton kernels are not available")
+
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
+
+        llm = LLM(MODEL_PATH,
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=self.kv_cache_config,
+                  max_seq_len=8192,
+                  **pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  moe_config=MoeConfig(backend=moe_backend))
+
+        with llm:
+            model_name = "GPT-OSS/MXFP4"
+            task = GSM8K(model_name)
+            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
 
 class TestEXAONE4(LlmapiAccuracyTestHarness):
     MODEL_NAME = "LGAI-EXAONE/EXAONE-4.0-32B"
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index e13addc9494..052484d7dde 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -510,17 +510,26 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
index 4ca5fbb91ca..1fbb1277afc 100644
--- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -48,6 +48,15 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 49c582114bc..13424bfc4fc 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -123,6 +123,28 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[tp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[ep2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_2gpus[dp2-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index e3e81cfbcab..54262b30e7a 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -120,6 +120,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-triton]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   # ------------- AutoDeploy tests ---------------