Skip to content

Commit 612733a

Browse files
committed
Add b300 tests
Signed-off-by: Yiqing Yan <[email protected]>
1 parent 6e13160 commit 612733a

File tree

2 files changed

+38
-1
lines changed

2 files changed

+38
-1
lines changed

jenkins/L0_Test.groovy

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
1+
@Library(['bloom-jenkins-shared-lib@user/yiqingy/dgx_b300', 'trtllm-jenkins-shared-lib@main']) _
22

33
import java.lang.InterruptedException
44
import groovy.transform.Field
@@ -673,6 +673,10 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
673673
def driverVersion = Constants.DEFAULT_NVIDIA_DRIVER_VERSION
674674
def cpuCount = "${TESTER_CORES}"
675675

676+
<<<<<<< HEAD
677+
=======
678+
// Multi-GPU is only supported on DGX-H100 and DGX-H200 due to hardware stability.
679+
>>>>>>> fix
676680
if (hasMultipleGPUs)
677681
{
678682
// Not a hard requirement, but based on empirical values.
@@ -1873,9 +1877,21 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
18731877
"DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
18741878
"DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
18751879
"DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
1880+
<<<<<<< HEAD
1881+
<<<<<<< HEAD
1882+
<<<<<<< HEAD
18761883
"RTXPro6000-Pytorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
18771884
"RTXPro6000-4_GPUs-Pytorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
18781885
"RTXPro6000-4_GPUs-Pytorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
1886+
=======
1887+
"B300_PCIe-PyTorch-1": ["b300", "l0_b300", 1, 1],
1888+
>>>>>>> Add b300 tests
1889+
=======
1890+
"DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
1891+
>>>>>>> fix
1892+
=======
1893+
"DGX_B300-4_GPUs-PyTorch-1": ["dgx-b300-x4", "l0_dgx_b300", 1, 1, 4],
1894+
>>>>>>> fix
18791895
]
18801896

18811897
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
@@ -2494,7 +2510,11 @@ pipeline {
24942510

24952511
def testPhase2StageName = env.testPhase2StageName
24962512
if (testPhase2StageName) {
2513+
<<<<<<< HEAD
24972514
def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
2515+
=======
2516+
def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200", "DGX_B300"]
2517+
>>>>>>> fix
24982518
singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
24992519
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
25002520
}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
version: 0.0.1
2+
l0_dgx_b300:
3+
- condition:
4+
ranges:
5+
system_gpu_count:
6+
gte: 4
7+
lte: 4
8+
wildcards:
9+
gpu:
10+
- '*gb110*'
11+
linux_distribution_name: ubuntu*
12+
terms:
13+
stage: pre_merge
14+
backend: pytorch
15+
tests:
16+
# ------------- PyTorch tests ---------------
17+
- unittest/_torch/multi_gpu_modeling -k "deepseek"

0 commit comments

Comments
 (0)