@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@dev-yanchaol-slurm-output', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -44,8 +44,9 @@ DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
 UBUNTU_24_04_IMAGE = "urm.nvidia.com/docker/ubuntu:24.04"
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_TEST = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_SLURM = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200" // Use 22 hours to allow for 2 hours of buffer.
 
 // Literals for easier access.
 @Field
@@ -133,7 +134,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 }
 
 // TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID) {
+def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip: cluster.ip,
@@ -144,20 +145,50 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
+
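+        // Recover the Slurm job ID from the sbatch/srun messages captured in the Slurm output file.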
+        def slurmJobID = Utils.exec(
             pipeline,
-            timeout: false,
             script: Utils.sshUserCmd(
                 remote,
-                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
-            )
-        )
+                "\"sed -n " +
+                "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+                "${slurmOutputFile} | tail -n1\""
+            ),
+            returnStdout: true
+        ).trim()
+
+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
+            error("Slurm job did not submit successfully. No job ID found.")
         }
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
+            )
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip: cluster.ip,
@@ -168,17 +199,26 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
             )
-            Utils.exec(pipeline, script: "echo done")
-        }
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
@@ -224,6 +264,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def customWorkspace = "/tmp/${nodeName}"
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
+    def slurmJobID = null
+
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
@@ -245,24 +287,47 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh", numRetries: 3,)
 
-            sh(label: "Print slurm_jenkins_agent_setup.sh script", script: "cat ${jenkinsSetupPath}")
+            Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
 
-            Utils.exec(
+            def slurmSubmitOutput = Utils.exec(
                 pipeline,
                 timeout: false,
                 script: Utils.sshUserCmd(
-                    remote,
-                    """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                )
+                    remote,
+                    "\"${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}\""
+                ),
+                returnStdout: true
             )
+
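+            // Parse the submission output for a job ID reported by sbatch ("Submitted batch job N") or srun ("srun: job N queued/allocated"); keep the last match.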
+            def jobIDs = slurmSubmitOutput
+                .readLines()
+                .collect { it.trim() }
+                .collectMany { line ->
+                    def ids = []
+                    def m1 = (line =~ /Submitted batch job (\d+)/)
+                    if (m1) ids << m1[0][1] // Extract the first captured group
+                    def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
+                    if (m2) ids << m2[0][1] // Extract the first captured group
+                    return ids
+                }
+
+            slurmJobID = jobIDs ? jobIDs[-1] : null
+
+            if (!slurmJobID || !slurmJobID.isNumber()) {
+                error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+            }
+            Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
             Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
         }
     }
 
     stage('Checking if the Node is Online') {
         def counter = 0
-        while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-            sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
+        // The Slurm job is submitted with a 5-hour timeout, and the K8s pod is evicted after 22 hours.
+        // Use 15 hours to check whether the node comes online, with a 2-hour buffer.
+        while (!CloudManager.isNodeOnline(nodeName) && counter < 90) {
+            // Wait 10 minutes to check status of the node again
+            sleep(time: 10, unit: 'MINUTES')
             counter++
         }
 
@@ -293,12 +358,16 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
                 executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
             } else {
-                echo "The node does not come online in 2 hours, terminating the job"
+                error "The Slurm node does not come online within the waiting period. Terminating the job."
             }
         }
     } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
+        stage('Clean up SLURM Resources') {
+            Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+            CloudManager.destroyNode(nodeName)
+            Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+            cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+        }
     }
 }
 
@@ -321,13 +390,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
 
-    sh(
-        label: "Print env for debugging",
-        script: """
-            env | sort
-            pwd && ls -alh
-        """
-    )
+    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
+
+    def slurmOutputFile = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
@@ -353,7 +418,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
+            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
+            slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def isAarch64 = config.contains("aarch64")
@@ -370,8 +437,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             // Upload slurm_run_sh to Frontend node
             def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
             Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
+
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}", numRetries: 3,)
-            sh(label: "Print slurm_run.sh script", script: "cat ${scriptRunLocalPath}")
+            Utils.exec(pipeline, script: "cat ${scriptRunLocalPath}")
+
             // Upload waives.txt to Frontend node
             def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}", numRetries: 3,)
@@ -403,7 +472,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 "--container-env=NVIDIA_IMEX_CHANNELS"
             ].join(" ")
 
-            def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
             def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
             scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
             def scriptContent = """#!/bin/bash
@@ -423,28 +491,33 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                 export NVIDIA_IMEX_CHANNELS=0
                 chmod +x ${scriptRunNode}
-                ${srunCmd}
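+                # Capture the srun output so the cleanup step can later recover the Slurm job ID from it.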
+                ${srunCmd} 2>&1 | tee ${slurmOutputFile}
             """.stripIndent()
             pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
             Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}", numRetries: 3,)
-            sh(label: "Print slurm_launch.sh script", script: "cat ${scriptLaunchDestPath}")
+            Utils.exec(pipeline, script: "cat ${scriptLaunchDestPath}")
         }
+
         stage('Run Test') {
-            def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
             Utils.exec(
                 pipeline,
                 timeout: false,
                 script: Utils.sshUserCmd(
                     remote,
-                    """bash ${scriptLaunch}"""
+                    "\"bash ${scriptLaunch}\""
                 )
             )
         }
+
+        echo "Finished test stage execution."
     }
 } finally {
     uploadResults(pipeline, cluster, jobUID, stageName)
-    cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+
+    stage('Clean up SLURM Resources') {
+        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+    }
 }
 }
 
@@ -573,6 +646,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
     } else {
         sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
         if (noResultIfSuccess && !stageIsFailed) {
+            // Clean up the workspace
+            sh """
+                env | sort
+                pwd && ls -alh
+                rm -rf ./*
+            """
+
+            echo "Finished test stage execution."
             return
         }
         echo "noResultIfSuccess: ${noResultIfSuccess}, stageIsFailed: ${stageIsFailed}"
@@ -593,14 +674,16 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
             "${UPLOAD_PATH}/test-results/"
         )
         junit(testResults: "${stageName}/results*.xml")
-
-        // Clean up the workspace
-        sh """
-            env | sort
-            pwd && ls -alh
-            rm -rf ./*
-        """
     }
+
+    // Clean up the workspace
+    sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+    """
+
+    echo "Finished test stage execution."
 }
 }
 
@@ -643,7 +726,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
             containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_SLURM}]
                   tty: true
                   resources:
                     requests:
@@ -661,7 +744,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
             containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                   volumeMounts:
                     - name: sw-tensorrt-pvc
                       mountPath: "/mnt/sw-tensorrt-pvc"
@@ -727,7 +810,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
             containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TEST}]
                   tty: true
                   resources:
                     requests:
@@ -2167,10 +2250,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
             }
             echo "###### Check pip install Start ######"
             withEnv(libEnv) {
+                // Retry 2 times if timeout occurs.
                 sh "env | sort"
-                timeout(time: 30, unit: 'MINUTES') {
-                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
-                }
+                trtllm_utils.llmRetry(1, "checkPipInstall", {
+                    timeout(time: 30, unit: 'MINUTES') {
+                        checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                    }
+                })
             }
             echo "###### Run LLMAPI tests Start ######"
             def config = VANILLA_CONFIG