From 0732b252b9aa15fa72940e553336f4306c02d5d4 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 26 Aug 2025 18:44:38 +0800 Subject: [PATCH 1/3] move qwen3 tests from b200 to gb200 Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- jenkins/L0_Test.groovy | 2 +- .../test_lists/test-db/l0_b200.yml | 12 -- .../test_lists/test-db/l0_gb200.yml | 114 ------------------ 3 files changed, 1 insertion(+), 127 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index bbeca03749d..0f92c3990b8 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1923,7 +1923,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) SBSASlurmTestConfigs = [ // Not used in the pipeline now - // "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3], + "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3], "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4], ] diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 5e188165a5b..a963ac6f8c5 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -31,18 +31,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-CUTLASS] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRITON] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[fp8-latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm] - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton] diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index d500da6b659..b9090b5d2de 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -17,23 +17,6 @@ l0_gb200: backend: pytorch tests: # ------------- PyTorch tests --------------- - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_4] - - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] @@ -46,100 +29,3 @@ l0_gb200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm] - - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-triton] - - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551 - - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B] - - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] - - test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16] - - test_e2e.py::test_ptp_quickstart_advanced_mixed_precision - - test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B] - - test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] - - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False] - - unittest/_torch/attention - - unittest/_torch/compilation - - unittest/_torch/debugger - - unittest/_torch/executor - - unittest/_torch/misc - - unittest/_torch/modules - - unittest/_torch/multimodal - - unittest/_torch/sampler - - unittest/_torch/speculative - - unittest/_torch/thop - - unittest/_torch/modeling -k "modeling_llama" - - unittest/_torch/modeling -k "modeling_mixtral" - - unittest/_torch/modeling -k "modeling_deepseek" - - unittest/_torch/modeling -k "modeling_gpt_oss" - - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison" -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*gb200*' - linux_distribution_name: ubuntu* - cpu: aarch64 - terms: - stage: post_merge - backend: tensorrt - tests: - # ------------- TRT tests --------------- - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4 - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-enable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-disable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant] - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_auto_dtype - - accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8 - - accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized - - unittest/trt/attention/test_gpt_attention.py -k "trtllm_gen" - - unittest/llmapi/test_llm_quant.py - - unittest/trt/functional/test_fp4_gemm.py -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*gb200*' - linux_distribution_name: ubuntu* - cpu: aarch64 - terms: - stage: post_merge - backend: triton - tests: - # ------------- Triton tests --------------- - - triton_server/test_triton.py::test_llava[llava] - - triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] - - triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora] -- condition: - ranges: - system_gpu_count: - gte: 1 - lte: 1 - wildcards: - gpu: - - '*gb200*' - linux_distribution_name: ubuntu* - cpu: aarch64 - terms: - stage: post_merge - backend: pytorch - tests: - # ------------- PyTorch tests --------------- - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] - - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False] From 96d0ef8b9297aae2598c2d0a30634ebc89a0ea47 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 26 Aug 2025 18:57:01 +0800 Subject: [PATCH 2/3] update Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- jenkins/L0_Test.groovy | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 0f92c3990b8..68913ac70b1 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1922,8 +1922,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) fullSet += SBSATestConfigs.keySet() SBSASlurmTestConfigs = [ - // Not used in the pipeline now - "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3], + "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 1], "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4], "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4], ] From 74b375606dded265608cdfa67aa0ae4eb14b0235 Mon Sep 17 00:00:00 2001 From: junq <22017000+QiJune@users.noreply.github.com> Date: Tue, 26 Aug 2025 22:59:48 +0800 Subject: [PATCH 3/3] fix comments Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> --- jenkins/L0_Test.groovy | 5 ++--- tests/integration/test_lists/test-db/l0_gb200.yml | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 68913ac70b1..3a2bb7c0131 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -1820,9 +1820,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null) "H100_PCIe-CPP-2": ["h100-cr", "l0_h100", 2, 2], "H100_PCIe-TensorRT-1": ["h100-cr", "l0_h100", 1, 2], "H100_PCIe-TensorRT-2": ["h100-cr", "l0_h100", 2, 2], - "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 3], - "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 3], - "B200_PCIe-PyTorch-3": ["b100-ts2", "l0_b200", 3, 3], + "B200_PCIe-PyTorch-1": ["b100-ts2", "l0_b200", 1, 2], + "B200_PCIe-PyTorch-2": ["b100-ts2", "l0_b200", 2, 2], "RTX5090-PyTorch-1": ["rtx-5090", "l0_gb202", 1, 1], "RTX5080-TensorRT-1": ["rtx-5080", "l0_gb203", 1, 2], "RTX5080-TensorRT-2": ["rtx-5080", "l0_gb203", 2, 2], diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml index b9090b5d2de..adc0ffac85d 100644 --- a/tests/integration/test_lists/test-db/l0_gb200.yml +++ b/tests/integration/test_lists/test-db/l0_gb200.yml @@ -1,5 +1,3 @@ -# Don't add any tests here. -# Copied from l0_b200.yml but not used in the pipeline now version: 0.0.1 l0_gb200: - condition: