@@ -263,7 +263,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
263
263
}
264
264
265
265
if (CloudManager . isNodeOnline(nodeName)) {
266
- def dockerArgs = " --gpus ${ gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${ CCACHE_DIR} :rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
266
+ def dockerArgs = " --gpus ${ gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${ CCACHE_DIR} :rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
267
267
268
268
if (partition. clusterName == " dlcluster" ) {
269
269
dockerArgs + = " -e NVIDIA_IMEX_CHANNELS=0"
@@ -1765,7 +1765,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
1765
1765
" DGX_H100-4_GPUs-PyTorch-DeepSeek-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 2 , 4 ],
1766
1766
" DGX_H100-4_GPUs-PyTorch-DeepSeek-2" : [" dgx-h100-x4" , " l0_dgx_h100" , 2 , 2 , 4 ],
1767
1767
" DGX_H100-4_GPUs-PyTorch-Others-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 1 , 4 ],
1768
- " DGX_H100-4_GPUs-Triton-Post-Merge-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 1 , 4 ],
1769
1768
" DGX_H100-4_GPUs-CPP-1" : [" dgx-h100-x4" , " l0_dgx_h100" , 1 , 1 , 4 ],
1770
1769
" A10-PyTorch-1" : [" a10" , " l0_a10" , 1 , 1 ],
1771
1770
" A10-CPP-1" : [" a10" , " l0_a10" , 1 , 1 ],
@@ -1838,6 +1837,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
1838
1837
" B200_PCIe-TensorRT-Post-Merge-2" : [" b100-ts2" , " l0_b200" , 2 , 2 ],
1839
1838
" H100_PCIe-TensorRT-Perf-1" : [" h100-cr" , " l0_perf" , 1 , 1 ],
1840
1839
" H100_PCIe-PyTorch-Perf-1" : [" h100-cr" , " l0_perf" , 1 , 1 ],
1840
+ " DGX_H200-4_GPUs-Triton-Post-Merge-1" : [" dgx-h200-x4" , " l0_dgx_h200" , 1 , 1 , 4 ],
1841
1841
" DGX_H200-8_GPUs-PyTorch-Post-Merge-1" : [" dgx-h200-x8" , " l0_dgx_h200" , 1 , 1 , 8 ],
1842
1842
" DGX_H200-4_GPUs-PyTorch-Post-Merge-1" : [" dgx-h200-x4" , " l0_dgx_h200" , 1 , 1 , 4 ],
1843
1843
" DGX_H200-4_GPUs-TensorRT-Post-Merge-1" : [" dgx-h200-x4" , " l0_dgx_h200" , 1 , 3 , 4 ],
@@ -1890,8 +1890,14 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
1890
1890
fullSet + = SBSATestConfigs . keySet()
1891
1891
1892
1892
SBSASlurmTestConfigs = [
1893
- " GB200-4_GPUs-PyTorch-1" : [" gb200-x4" , " l0_gb200" , 1 , 1 , 4 ],
1894
- " GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4" , " l0_gb200" , 1 , 1 , 4 ],
1893
+ " GB200-PyTorch-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 3 ],
1894
+ " GB200-PyTorch-2" : [" gb200-unrestricted" , " l0_gb200" , 2 , 3 ],
1895
+ " GB200-PyTorch-3" : [" gb200-unrestricted" , " l0_gb200" , 3 , 3 ],
1896
+ " GB200-TensorRT-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 2 ],
1897
+ " GB200-TensorRT-2" : [" gb200-unrestricted" , " l0_gb200" , 2 , 2 ],
1898
+ " GB200-Triton-Post-Merge-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 1 ],
1899
+ " GB200-4_GPUs-PyTorch-1" : [" gb200-x4" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
1900
+ " GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
1895
1901
]
1896
1902
fullSet + = SBSASlurmTestConfigs . keySet()
1897
1903
@@ -2458,7 +2464,7 @@ pipeline {
2458
2464
2459
2465
def testPhase2StageName = env. testPhase2StageName
2460
2466
if (testPhase2StageName) {
2461
- def dgxSigns = [" DGX_H100" , " DGX_H200" , " GB200" , " DGX_B200" , " RTXPro6000-4_GPUs" ]
2467
+ def dgxSigns = [" DGX_H100" , " DGX_H200" , " GB200-4_GPUs " , " GB200-8_GPUs " , " DGX_B200" , " RTXPro6000-4_GPUs" ]
2462
2468
singleGpuJobs = parallelJobs. findAll{!dgxSigns .any {sign -> it. key. contains(sign)}}
2463
2469
dgxJobs = parallelJobs. findAll{dgxSigns .any {sign -> it. key. contains(sign)}}
2464
2470
}
0 commit comments