Add b300 tests

yiqingy0 · yiqingy0 · commit 612733a9e388 · 2025-08-25T02:37:04.000Z
Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@user/yiqingy/dgx_b300', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -673,6 +673,10 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         def driverVersion = Constants.DEFAULT_NVIDIA_DRIVER_VERSION
         def cpuCount = "${TESTER_CORES}"
 
+<<<<<<< HEAD
+=======
+        // Multi-GPU only supports DGX-H100 and DGX-H200 due to the hardware stability.
+>>>>>>> fix
         if (hasMultipleGPUs)
         {
             // Not a hard requirement, but based on empirical values.
@@ -1873,9 +1877,21 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-1": ["dgx-h200-x4", "l0_dgx_h200", 1, 3, 4],
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 3, 4],
         "DGX_H200-4_GPUs-TensorRT-Post-Merge-3": ["dgx-h200-x4", "l0_dgx_h200", 3, 3, 4],
+<<<<<<< HEAD
+<<<<<<< HEAD
+<<<<<<< HEAD
         "RTXPro6000-Pytorch-Post-Merge-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
         "RTXPro6000-4_GPUs-Pytorch-Post-Merge-1": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 1, 2, 4],
         "RTXPro6000-4_GPUs-Pytorch-Post-Merge-2": ["rtx-pro-6000-x4", "l0_rtx_pro_6000", 2, 2, 4],
+=======
+        "B300_PCIe-PyTorch-1": ["b300", "l0_b300", 1, 1],
+>>>>>>> Add b300 tests
+=======
+        "DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
+>>>>>>> fix
+=======
+        "DGX_B300-4_GPUs-PyTorch-1": ["dgx-b300-x4", "l0_dgx_b300", 1, 1, 4],
+>>>>>>> fix
     ]
 
     parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
@@ -2494,7 +2510,11 @@ pipeline {
 
                     def testPhase2StageName = env.testPhase2StageName
                     if (testPhase2StageName) {
+<<<<<<< HEAD
                         def dgxSigns = ["2_GPUs", "4_GPUs", "8_GPUs"]
+=======
+                        def dgxSigns = ["DGX_H100", "DGX_H200", "GB200", "DGX_B200", "DGX_B300"]
+>>>>>>> fix
                         singleGpuJobs = parallelJobs.findAll{!dgxSigns.any{sign -> it.key.contains(sign)}}
                         dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
                     }
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml
@@ -0,0 +1,17 @@
+version: 0.0.1
+l0_dgx_b300:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*gb110*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - unittest/_torch/multi_gpu_modeling -k "deepseek"