@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@dev-yanchaol-slurm', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.InterruptedException
import groovy.transform.Field
@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
-import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
@@ -211,6 +210,13 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
        sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
        sh "ls ${stageName}/ -all"
    })
+
+    // Clean up the workspace
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+    """
    }
}
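One caveat with the cleanup as written: the `rm -rf ./*` glob does not match dotfiles, so hidden entries (a leftover `.git`, `.cache`, etc.) survive it. If that matters, Jenkins' built-in `deleteDir()` step removes a directory recursively, dotfiles included. A hedged alternative sketch, assuming it runs inside a node context:

    // Alternative sketch: deleteDir() deletes the current directory
    // recursively, including dotfiles that "rm -rf ./*" skips.
    dir(env.WORKSPACE) {
        deleteDir()
    }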
@@ -219,8 +225,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
+    // Create a unique suffix for the node name and workspace
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def nodeName = "${cluster.host}-test-${customSuffix}"
+    def customWorkspace = "/tmp/${nodeName}"
+    def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)

    try {
        // Run ssh command to start node in desired cluster via SLURM
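The new naming scheme is worth a close look: `env.BUILD_TAG` (Jenkins sets it to `jenkins-${JOB_NAME}-${BUILD_NUMBER}`) already uniquifies per build, and the six hex characters taken from the UUID guard against collisions within a build. A minimal standalone sketch of the suffix expression, with a made-up BUILD_TAG value; the two-argument `CloudManager.createNode(nodeName, customWorkspace)` is the shared library's own API and is not covered here:

    // Standalone sketch; buildTag is a made-up example value.
    String buildTag = "jenkins-LLM-TestJob-42"
    String customSuffix = "${buildTag}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
    // UUID.toString() is lowercase hex, so the suffix is six chars of [0-9a-f]
    assert customSuffix ==~ /jenkins-llm-testjob-42-[0-9a-f]{6}/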
@@ -263,12 +272,31 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
    }

    if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
+        node(nodeName) {
+            sh """
+                env | sort
+                pwd && ls -alh
+                ls -alh ${env.WORKSPACE}
+                ls -alh ${env.WORKSPACE_TMP}
+                ls -alh ${env.PWD}
+            """
+        }
+
+        def dockerArgs = "--gpus ${gpuCount} " +
+            "--cap-add=SYS_ADMIN " +
+            "--ipc=host " +
+            "--security-opt seccomp=unconfined " +
+            "-u root:root " +
+            "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
+            "-v /tmp/ccache:${CCACHE_DIR}:rw " +
+            "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
+            "--cap-add syslog"

        if (partition.clusterName == "dlcluster") {
            dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
        }
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
+
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
    } else {
        echo "The node does not come online in 2 hours, terminating the job"
@@ -796,7 +824,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)

def runLLMDocBuild(pipeline, config)
{
-    // Step 1: cloning tekit source code
+    // Step 1: cloning source code
    sh "pwd && ls -alh"
    sh "env | sort"
    // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1269,17 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {

def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
{
-    // Step 1: create LLM_ROOT dir
-    sh "pwd && ls -alh"
-    // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
-    // So that it can work with multiple job running in same node
-    sh "rm -rf ./*"
+    // Step 1: create LLM_ROOT dir and clean up the workspace
    def llmRootConfig = "${LLM_ROOT}${config}"
-    sh "mkdir ${llmRootConfig}"
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+        mkdir ${llmRootConfig}
+        ls -alh ${env.WORKSPACE}
+        ls -alh ${env.WORKSPACE_TMP}
+        ls -alh ${env.PWD}
+    """

    def llmPath = sh(script: "realpath ${llmRootConfig}", returnStdout: true).trim()
    def llmSrc = "${llmPath}/TensorRT-LLM/src"
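The `realpath` capture just above relies on `sh(returnStdout: true)`, which hands back the command's stdout including its trailing newline, hence the `.trim()`. A minimal sketch of the pattern, inside any node context, with `echo` standing in for `realpath`:

    def out = sh(script: "echo /work/llm-root", returnStdout: true)
    assert out == "/work/llm-root\n"      // newline included
    assert out.trim() == "/work/llm-root" // trim() strips it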
@@ -1562,6 +1594,13 @@ def runLLMTestlistOnPlatform(pipeline, platform, testList, config=VANILLA_CONFIG
        sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
        sh "ls ${stageName}/ -all"
    })
+
+    // Clean up the workspace
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+    """
}

@@ -1893,8 +1932,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
        "GB200-PyTorch-1": ["gb200-unrestricted", "l0_gb200", 1, 3],
        "GB200-PyTorch-2": ["gb200-unrestricted", "l0_gb200", 2, 3],
        "GB200-PyTorch-3": ["gb200-unrestricted", "l0_gb200", 3, 3],
-        "GB200-TensorRT-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
-        "GB200-TensorRT-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
+        "GB200-PyTorch-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
+        "GB200-TensorRT-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
+        "GB200-TensorRT-Post-Merge-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
        "GB200-Triton-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
        "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
@@ -2129,7 +2169,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
            echo "###### Check pip install Start ######"
            withEnv(libEnv) {
                sh "env | sort"
-                checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                timeout(time: 1, unit: 'HOURS') {
+                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                }
            }
            echo "###### Run LLMAPI tests Start ######"
            def config = VANILLA_CONFIG
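Wrapping `checkPipInstall` in `timeout` is a standard guard: the built-in step aborts the enclosed block once the limit elapses instead of letting a hung pip install stall the stage indefinitely. A minimal sketch of the same pattern, with a placeholder `sh` line standing in for `checkPipInstall`:

    timeout(time: 1, unit: 'HOURS') {
        // Aborted with a FlowInterruptedException if it exceeds one hour.
        sh "python3 -m pip install --no-cache-dir ./build/*.whl"
    }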