Skip to content

Commit 67125cc

Browse files
committed
[None][infra] Migrate B200 single GPU tests to GB200
Signed-off-by: Yanchao Lu <[email protected]>
1 parent 96ff82e commit 67125cc

File tree

5 files changed: +190 −65 lines changed

jenkins/L0_Test.groovy

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
263263
}
264264

265265
if (CloudManager.isNodeOnline(nodeName)) {
266-
def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
266+
def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
267267

268268
if (partition.clusterName == "dlcluster") {
269269
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
@@ -1765,7 +1765,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
17651765
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 2, 4],
17661766
"DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["dgx-h100-x4", "l0_dgx_h100", 2, 2, 4],
17671767
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
1768-
"DGX_H100-4_GPUs-Triton-Post-Merge-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
17691768
"DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
17701769
"A10-PyTorch-1": ["a10", "l0_a10", 1, 1],
17711770
"A10-CPP-1": ["a10", "l0_a10", 1, 1],
@@ -1838,6 +1837,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
18381837
"B200_PCIe-TensorRT-Post-Merge-2": ["b100-ts2", "l0_b200", 2, 2],
18391838
"H100_PCIe-TensorRT-Perf-1": ["h100-cr", "l0_perf", 1, 1],
18401839
"H100_PCIe-PyTorch-Perf-1": ["h100-cr", "l0_perf", 1, 1],
1840+
"DGX_H200-4_GPUs-Triton-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
18411841
"DGX_H200-8_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x8", "l0_dgx_h200", 1, 1, 8],
18421842
"DGX_H200-4_GPUs-PyTorch-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 1, 4],
18431843
"DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
@@ -1890,8 +1890,14 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
18901890
fullSet += SBSATestConfigs.keySet()
18911891

18921892
SBSASlurmTestConfigs = [
1893-
"GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
1894-
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200", 1, 1, 4],
1893+
"GB200-PyTorch-1": ["gb200-unrestricted", "l0_gb200", 1, 3],
1894+
"GB200-PyTorch-2": ["gb200-unrestricted", "l0_gb200", 2, 3],
1895+
"GB200-PyTorch-3": ["gb200-unrestricted", "l0_gb200", 3, 3],
1896+
"GB200-TensorRT-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
1897+
"GB200-TensorRT-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
1898+
"GB200-Triton-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
1899+
"GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
1900+
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
18951901
]
18961902
fullSet += SBSASlurmTestConfigs.keySet()
18971903

@@ -2458,7 +2464,7 @@ pipeline {
24582464

24592465
def testPhase2StageName = env.testPhase2StageName
24602466
if (testPhase2StageName) {
2461-
def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200", "RTXPro6000-4_GPUs"]
2467+
def dgxSigns = ["DGX_H100", "DGX_H200", "GB200-4_GPUs", "GB200-8_GPUs", "DGX_B200", "RTXPro6000-4_GPUs"]
24622468
singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
24632469
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
24642470
}

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -209,18 +209,3 @@ l0_dgx_h100:
209209
- cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-ucx_kvcache-90]
210210
- cpp/test_multi_gpu.py::TestDisagg::test_orchestrator_params[llama-nixl_kvcache-90] TIMEOUT (90)
211211
- cpp/test_multi_gpu.py::TestDisagg::test_spawn_orchestrator[llama-nixl_kvcache-90]
212-
- condition:
213-
ranges:
214-
system_gpu_count:
215-
gte: 4
216-
lte: 4
217-
wildcards:
218-
gpu:
219-
- '*h100*'
220-
linux_distribution_name: ubuntu*
221-
terms:
222-
stage: post_merge
223-
backend: triton
224-
auto_trigger: others
225-
tests:
226-
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,19 @@ l0_dgx_h200:
166166
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-disable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
167167
- examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp2]
168168
- unittest/llmapi/apps/_test_openai_multi_gpu.py -m "part0"
169+
- condition:
170+
ranges:
171+
system_gpu_count:
172+
gte: 4
173+
lte: 4
174+
wildcards:
175+
gpu:
176+
- '*h200*'
177+
linux_distribution_name: ubuntu*
178+
cpu: x86_64
179+
terms:
180+
stage: post_merge
181+
backend: triton
182+
tests:
183+
# ------------- Triton tests ---------------
184+
- triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm]

0 commit comments

Comments (0)