@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
7
7
import com.nvidia.bloom.KubernetesManager
8
8
import com.nvidia.bloom.Constants
9
9
import com.nvidia.bloom.CloudManager
10
- import com.nvidia.bloom.KubernetesManager
11
10
import com.nvidia.bloom.SlurmConfig
12
11
import com.nvidia.bloom.SlurmCluster
13
12
import com.nvidia.bloom.SlurmPartition
@@ -211,6 +210,13 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
211
210
sh " cp ${ llmSrc} /cpp/build_backup/*.xml ${ stageName} || true"
212
211
sh " ls ${ stageName} / -all"
213
212
})
213
+
214
+ // Clean up the workspace
215
+ sh """
216
+ env | sort
217
+ pwd && ls -alh
218
+ rm -rf ./*
219
+ """
214
220
}
215
221
}
216
222
@@ -219,8 +225,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
219
225
SlurmPartition partition = SlurmConfig . partitionConfig[platform] as SlurmPartition
220
226
SlurmCluster cluster = SlurmConfig . clusterConfig[partition. clusterName]
221
227
222
- def nodeName = " ${ cluster.host} -test-${ UUID.randomUUID().toString()} "
223
- def nodeSecret = CloudManager . createNode(nodeName)
228
+ // Create a unique suffix for the node name and workspace
229
+ String customSuffix = " ${ env.BUILD_TAG} -${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
230
+ def nodeName = " ${ cluster.host} -test-${ customSuffix} "
231
+ def customWorkspace = " /tmp/${ nodeName} "
232
+ def nodeSecret = CloudManager . createNode(nodeName, customWorkspace)
224
233
225
234
try {
226
235
// Run ssh command to start node in desired cluster via SLURM
@@ -263,12 +272,30 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
263
272
}
264
273
265
274
if (CloudManager . isNodeOnline(nodeName)) {
266
- def dockerArgs = " --gpus ${ gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${ CCACHE_DIR} :rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
275
+ node(nodeName) {
276
+ sh """
277
+ env | sort
278
+ pwd && ls -alh
279
+ ls -alh ${ env.WORKSPACE}
280
+ ls -alh ${ env.WORKSPACE_TMP}
281
+ """
282
+ }
283
+
284
+ def dockerArgs = " --gpus ${ gpuCount} " +
285
+ " --cap-add=SYS_ADMIN " +
286
+ " --ipc=host " +
287
+ " --security-opt seccomp=unconfined " +
288
+ " -u root:root " +
289
+ " -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
290
+ " -v /tmp/ccache:${ CCACHE_DIR} :rw " +
291
+ " -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
292
+ " --cap-add syslog"
267
293
268
294
if (partition. clusterName == " dlcluster" ) {
269
295
dockerArgs + = " -e NVIDIA_IMEX_CHANNELS=0"
270
296
}
271
- slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, false )
297
+
298
+ slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE , nodeName, dockerArgs, true )
272
299
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
273
300
} else {
274
301
echo " The node does not come online in 2 hours, terminating the job"
@@ -560,6 +587,13 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
560
587
" ${ UPLOAD_PATH} /test-results/"
561
588
)
562
589
junit(testResults : " ${ stageName} /results*.xml" )
590
+
591
+ // Clean up the workspace
592
+ sh """
593
+ env | sort
594
+ pwd && ls -alh
595
+ rm -rf ./*
596
+ """
563
597
}
564
598
}
565
599
}
@@ -796,7 +830,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
796
830
797
831
def runLLMDocBuild (pipeline , config )
798
832
{
799
- // Step 1: cloning tekit source code
833
+ // Step 1: cloning source code
800
834
sh " pwd && ls -alh"
801
835
sh " env | sort"
802
836
// Allow checkout from a forked repo; svc_tensorrt must have access to the repo, otherwise the clone will fail
@@ -1241,13 +1275,16 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
1241
1275
1242
1276
def runLLMTestlistOnPlatformImpl (pipeline , platform , testList , config = VANILLA_CONFIG , perfMode = false , stageName = " Undefined" , splitId = 1 , splits = 1 , skipInstallWheel = false , cpver = " cp312" )
1243
1277
{
1244
- // Step 1: create LLM_ROOT dir
1245
- sh " pwd && ls -alh"
1246
- // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
1247
- // So that it can work with multiple job running in same node
1248
- sh " rm -rf ./*"
1278
+ // Step 1: create LLM_ROOT dir and clean up the workspace
1249
1279
def llmRootConfig = " ${ LLM_ROOT}${ config} "
1250
- sh " mkdir ${ llmRootConfig} "
1280
+ sh """
1281
+ env | sort
1282
+ pwd && ls -alh
1283
+ rm -rf ./*
1284
+ mkdir ${ llmRootConfig}
1285
+ ls -alh ${ env.WORKSPACE}
1286
+ ls -alh ${ env.WORKSPACE_TMP}
1287
+ """
1251
1288
1252
1289
def llmPath = sh (script : " realpath ${ llmRootConfig} " , returnStdout : true ). trim()
1253
1290
def llmSrc = " ${ llmPath} /TensorRT-LLM/src"
@@ -1562,6 +1599,13 @@ def runLLMTestlistOnPlatform(pipeline, platform, testList, config=VANILLA_CONFIG
1562
1599
sh " cp ${ llmSrc} /cpp/build_backup/*.xml ${ stageName} || true"
1563
1600
sh " ls ${ stageName} / -all"
1564
1601
})
1602
+
1603
+ // Clean up the workspace
1604
+ sh """
1605
+ env | sort
1606
+ pwd && ls -alh
1607
+ rm -rf ./*
1608
+ """
1565
1609
}
1566
1610
1567
1611
@@ -1890,12 +1934,8 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
1890
1934
fullSet + = SBSATestConfigs . keySet()
1891
1935
1892
1936
SBSASlurmTestConfigs = [
1893
- " GB200-PyTorch-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 3 ],
1894
- " GB200-PyTorch-2" : [" gb200-unrestricted" , " l0_gb200" , 2 , 3 ],
1895
- " GB200-PyTorch-3" : [" gb200-unrestricted" , " l0_gb200" , 3 , 3 ],
1896
- " GB200-TensorRT-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 2 ],
1897
- " GB200-TensorRT-2" : [" gb200-unrestricted" , " l0_gb200" , 2 , 2 ],
1898
- " GB200-Triton-Post-Merge-1" : [" gb200-unrestricted" , " l0_gb200" , 1 , 1 ],
1937
+ // Not used in the pipeline now
1938
+ // "GB200-PyTorch-1": ["gb200-single", "l0_gb200", 1, 3],
1899
1939
" GB200-4_GPUs-PyTorch-1" : [" gb200-x4" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
1900
1940
" GB200-4_GPUs-PyTorch-Post-Merge-1" : [" gb200-x4" , " l0_gb200_multi_gpus" , 1 , 1 , 4 ],
1901
1941
]
@@ -1909,7 +1949,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
1909
1949
" GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-4" : [" gb200-multi-node" , " l0_gb200_multi_nodes" , 4 , 7 , 8 , 2 ],
1910
1950
" GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-5" : [" gb200-multi-node" , " l0_gb200_multi_nodes" , 5 , 7 , 8 , 2 ],
1911
1951
" GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-6" : [" gb200-multi-node" , " l0_gb200_multi_nodes" , 6 , 7 , 8 , 2 ],
1912
- " GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-7" : [" gb200-multi-node" , " l0_gb200_multi_nodes" , 7 , 7 , 8 , 2 ],
1913
1952
]
1914
1953
fullSet + = multiNodesSBSAConfigs. keySet()
1915
1954
@@ -2129,7 +2168,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
2129
2168
echo " ###### Check pip install Start ######"
2130
2169
withEnv(libEnv) {
2131
2170
sh " env | sort"
2132
- checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2171
+ timeout(time : 1 , unit : ' HOURS' ) {
2172
+ checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2173
+ }
2133
2174
}
2134
2175
echo " ###### Run LLMAPI tests Start ######"
2135
2176
def config = VANILLA_CONFIG
@@ -2464,7 +2505,7 @@ pipeline {
2464
2505
2465
2506
def testPhase2StageName = env. testPhase2StageName
2466
2507
if (testPhase2StageName) {
2467
- def dgxSigns = [" DGX_H100 " , " DGX_H200 " , " GB200- 4_GPUs" , " GB200- 8_GPUs" , " DGX_B200 " , " RTXPro6000-4_GPUs " ]
2508
+ def dgxSigns = [" 2_GPUs " , " 4_GPUs" , " 8_GPUs" ]
2468
2509
singleGpuJobs = parallelJobs. findAll{!dgxSigns .any {sign -> it. key. contains(sign)}}
2469
2510
dgxJobs = parallelJobs. findAll{dgxSigns .any {sign -> it. key. contains(sign)}}
2470
2511
}
0 commit comments