@@ -2883,8 +2883,8 @@ def test_auto_dtype_long_rope(self):
 
 
 @skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
+@pytest.mark.skip_less_device_memory(80000)
+class TestGPTOSS(LlmapiAccuracyTestHarness):
     kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
     extra_evaluator_kwargs = {
         "fewshot_as_multiturn": True,
@@ -2902,6 +2902,7 @@ class TestGPTOSS_120B(LlmapiAccuracyTestHarness):
         (True, True),
     ])
     def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
         mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                           {"scores_filter": "exact_match,flexible-extract"})
@@ -2912,7 +2913,7 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=1,
                   pipeline_parallel_size=1,
                   moe_expert_parallel_size=1,
@@ -3000,52 +3001,6 @@ def test_w4a16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
-
-@skip_pre_hopper
-@pytest.mark.skip_less_device_memory(100000)
-class TestGPTOSS_20B(LlmapiAccuracyTestHarness):
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-    extra_evaluator_kwargs = {
-        "fewshot_as_multiturn": True,
-        "apply_chat_template": True,
-        "scores_filter": "exact_match,flexible-extract",
-        "MAX_OUTPUT_LEN": 8192
-    }
-
-    MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
-
-    @pytest.mark.parametrize(
-        "moe_backend",
-        ["CUTLASS",
-         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
-        ids=["cutlass", "trtllm", "triton"])
-    @pytest.mark.parametrize("cuda_graph,overlap_scheduler", [
-        (True, True),
-    ])
-    def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
-        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
-            pytest.skip("Triton kernels are not available")
-
-        pytorch_config = dict(
-            disable_overlap_scheduler=not overlap_scheduler,
-            cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
-
-        llm = LLM(self.MODEL_PATH,
-                  tensor_parallel_size=1,
-                  pipeline_parallel_size=1,
-                  moe_expert_parallel_size=1,
-                  kv_cache_config=self.kv_cache_config,
-                  max_seq_len=8192,
-                  **pytorch_config,
-                  moe_config=MoeConfig(backend=moe_backend))
-
-        with llm:
-            model_name = "GPT-OSS/MXFP4"
-            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
-            task = GSM8K(model_name)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)
-
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize(
         "moe_backend",
@@ -3059,8 +3014,9 @@ def test_w4_1gpu(self, moe_backend, cuda_graph, overlap_scheduler, mocker):
             (2, 1, 2, True, True, True),
         ],
         ids=["tp2", "ep2", "dp2"])
-    def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
+    def test_w4_2gpus(self, moe_backend, tp_size, pp_size, ep_size,
                       attention_dp, cuda_graph, overlap_scheduler, mocker):
+        MODEL_PATH = f"{llm_models_root()}/gpt_oss/gpt-oss-20b"
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -3069,7 +3025,7 @@ def test_w4_4gpus(self, moe_backend, tp_size, pp_size, ep_size,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None)
 
-        llm = LLM(self.MODEL_PATH,
+        llm = LLM(MODEL_PATH,
                   tensor_parallel_size=tp_size,
                   pipeline_parallel_size=pp_size,
                   moe_expert_parallel_size=ep_size,
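
Both surviving tests build the LLM with the same PyTorch-backend options and differ only in parallelism sizes and MoE backend. The sketch below shows that call pattern outside the pytest harness. Only the model directory name, max_seq_len, and the free_gpu_memory_fraction value come from the hunks above; the import locations and the placeholder model path are assumptions, since the module's import block is not part of this diff.

# Minimal sketch (not part of the diff) of the LLM construction pattern that
# TestGPTOSS.test_w4_1gpu / test_w4_2gpus exercise.
# Assumption: CudaGraphConfig, KvCacheConfig and MoeConfig are importable from
# tensorrt_llm.llmapi; adjust the imports if your version exposes them elsewhere.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

# Hypothetical local path; the tests resolve it via llm_models_root() instead.
model_path = "/models/gpt_oss/gpt-oss-20b"

llm = LLM(model_path,
          tensor_parallel_size=1,
          pipeline_parallel_size=1,
          moe_expert_parallel_size=1,
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
          max_seq_len=8192,
          disable_overlap_scheduler=False,
          cuda_graph_config=CudaGraphConfig(),
          moe_config=MoeConfig(backend="CUTLASS"))

with llm:
    # The tests score GSM8K through the accuracy harness; a single generate()
    # call is enough to confirm the engine comes up with these settings.
    print(llm.generate("What is 12 * 7?").outputs[0].text)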