Commit d6a1e13

Improve CI stability
Signed-off-by: Yanchao Lu <[email protected]>
1 parent d752784 commit d6a1e13

2 files changed: +147 -62 lines changed

jenkins/Build.groovy

Lines changed: 2 additions & 3 deletions
@@ -19,8 +19,7 @@ LLM_DOCKER_IMAGE = env.dockerImage
 // Always use x86_64 image for agent
 AGENT_IMAGE = env.dockerImage.replace("aarch64", "x86_64")
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
 
 // Literals for easier access.
 @Field
@@ -169,7 +168,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64")
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                   volumeMounts:
                   - name: sw-tensorrt-pvc
                     mountPath: "/mnt/sw-tensorrt-pvc"
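
The timeout default keeps the explicit ternary on env.podTimeoutSeconds. For reference, the same fallback can be written with Groovy's Elvis operator; a minimal sketch, not part of this change (the 43200-second value is the build-pod default above):

    // Falls back to 12 hours (43200 s) when podTimeoutSeconds is unset or empty.
    POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ?: "43200"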

jenkins/L0_Test.groovy

Lines changed: 145 additions & 59 deletions
@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@dev-yanchaol-slurm-output', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -44,8 +44,9 @@ DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.06-py3"
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
 UBUNTU_24_04_IMAGE = "urm.nvidia.com/docker/ubuntu:24.04"
 
-POD_TIMEOUT_SECONDS = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
-POD_TIMEOUT_SECONDS_TMP = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_TEST = env.podTimeoutSeconds ? env.podTimeoutSeconds : "21600"
+POD_TIMEOUT_SECONDS_BUILD = env.podTimeoutSeconds ? env.podTimeoutSeconds : "43200"
+POD_TIMEOUT_SECONDS_SLURM = env.podTimeoutSeconds ? env.podTimeoutSeconds : "79200" // Use 22 hours to allow for a 2-hour buffer.
 
 // Literals for easier access.
 @Field
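
The three defaults correspond to 6, 12, and 22 hours. A quick sanity check of that arithmetic (illustrative Groovy, not pipeline code):

    // 21600 s = 6 h (test pods), 43200 s = 12 h (build pods), 79200 s = 22 h (Slurm pods).
    def toHours = { String secs -> (secs as int).intdiv(3600) }
    assert toHours("21600") == 6
    assert toHours("43200") == 12
    assert toHours("79200") == 22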
@@ -133,7 +134,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 }
 
 //TODO: consolidate slurm related code for both multi nodes and single nodes
-def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID){
+def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip : cluster.ip,
@@ -144,20 +145,50 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
+
+        def slurmJobID = Utils.exec(
             pipeline,
-            timeout: false,
             script: Utils.sshUserCmd(
                 remote,
-                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
-            )
-            )
+                "\"sed -n " +
+                "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
+                "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
+                "${slurmOutputFile} | tail -n1\""
+            ),
+            returnStdout: true
+        ).trim()
+
+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
+            error("Slurm job did not submit successfully. No job ID found.")
         }
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
+            )
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
-def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
+def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
             ip : cluster.ip,
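
The multi-node cleanup now recovers the Slurm job ID from the captured output log: the sed expressions match the "Submitted batch job N" and "srun: job N queued/has been allocated" messages, and tail -n1 keeps the last ID before the job is cancelled with scancel and reported via sacct/scontrol. A small Groovy sketch of the same extraction, run against made-up log lines (the job number is illustrative):

    // Mirrors the sed ... | tail -n1 step: collect every job ID mentioned in the log, keep the last one.
    def sampleLines = [
        "srun: job 4242 queued and waiting for resources",
        "srun: job 4242 has been allocated resources",
    ]
    def ids = sampleLines.collectMany { line ->
        def m = (line =~ /Submitted batch job (\d+)|srun: job (\d+) (?:queued|has been allocated)/)
        m ? [m[0][1] ?: m[0][2]] : []
    }
    def lastJobID = ids ? ids[-1] : null
    assert lastJobID == "4242" && lastJobID.isNumber()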
@@ -168,17 +199,26 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
-        pipeline.stage('Clean up SLURM Agent Resources') {
-            Utils.exec(
-                pipeline,
-                timeout: false,
-                script: Utils.sshUserCmd(
-                    remote,
-                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
-                )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "\"scancel ${slurmJobID} || true; sacct -j ${slurmJobID} --format=JobID,JobName%100,Partition%15,Account%15,State,ExitCode,NodeList%30 || true; scontrol show job ${slurmJobID} || true\""
             )
-            Utils.exec(pipeline, script: "echo done")
-        }
+        )
+
+        Utils.exec(
+            pipeline,
+            script: Utils.sshUserCmd(
+                remote,
+                "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+            )
+        )
+
+        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
 
@@ -224,6 +264,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def customWorkspace = "/tmp/${nodeName}"
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
+    def slurmJobID = null
+
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
@@ -245,24 +287,47 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh", numRetries: 3,)
 
-                sh(label: "Print slurm_jenkins_agent_setup.sh script", script: "cat ${jenkinsSetupPath}")
+                Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
 
-                Utils.exec(
+                def slurmSubmitOutput = Utils.exec(
                     pipeline,
                     timeout: false,
                     script: Utils.sshUserCmd(
-                        remote,
-                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
-                    )
+                        remote,
+                        "\"${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}\""
+                    ),
+                    returnStdout: true
                 )
+
+                def jobIDs = slurmSubmitOutput
+                    .readLines()
+                    .collect { it.trim() }
+                    .collectMany { line ->
+                        def ids = []
+                        def m1 = (line =~ /Submitted batch job (\d+)/)
+                        if (m1) ids << m1[0][1] // Extract the first captured group
+                        def m2 = (line =~ /srun: job (\d+) (queued|has been allocated)/)
+                        if (m2) ids << m2[0][1] // Extract the first captured group
+                        return ids
+                    }
+
+                slurmJobID = jobIDs ? jobIDs[-1] : null
+
+                if (!slurmJobID || !slurmJobID.isNumber()) {
+                    error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+                }
+                Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
                 Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
             }
         }
 
         stage('Checking if the Node is Online') {
             def counter = 0
-            while (!CloudManager.isNodeOnline(nodeName) && counter < 12) {
-                sleep(time: 10, unit: 'MINUTES') // Wait 10 minutes to check status of the node again
+            // The Slurm job is submitted with a 5-hour timeout, and the K8S pod will be evicted after 22 hours.
+            // Poll for up to 15 hours for the node to come online, leaving a 2-hour buffer.
+            while (!CloudManager.isNodeOnline(nodeName) && counter < 90) {
+                // Wait 10 minutes to check status of the node again
+                sleep(time: 10, unit: 'MINUTES')
                 counter++
             }
@@ -293,12 +358,16 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
                 executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
             } else {
-                echo "The node does not come online in 2 hours, terminating the job"
+                error "The Slurm node did not come online within the waiting period. Terminating the job."
             }
         }
     } finally {
-        cleanUpNodeResources(pipeline, cluster, nodeName)
-        CloudManager.destroyNode(nodeName)
+        stage('Clean up SLURM Resources') {
+            Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+            CloudManager.destroyNode(nodeName)
+            Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+            cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+        }
     }
 }
 
@@ -321,13 +390,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
     def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
 
-    sh(
-        label: "Print env for debugging",
-        script: """
-            env | sort
-            pwd && ls -alh
-        """
-    )
+    Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
+
+    def slurmOutputFile = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
@@ -353,7 +418,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def resourcePathNode = "/tmp"
             def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
-            def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
+            def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
+            def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
+            slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
             def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def isAarch64 = config.contains("aarch64")
@@ -370,8 +437,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             // Upload slurm_run_sh to Frontend node
             def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
             Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
+
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}", numRetries: 3,)
-            sh(label: "Print slurm_run.sh script", script: "cat ${scriptRunLocalPath}")
+            Utils.exec(pipeline, script: "cat ${scriptRunLocalPath}")
+
             // Upload waives.txt to Frontend node
             def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}", numRetries: 3,)
@@ -403,7 +472,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 "--container-env=NVIDIA_IMEX_CHANNELS"
             ].join(" ")
 
-            def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
             def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
             scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
             def scriptContent = """#!/bin/bash
@@ -423,28 +491,33 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
                 export NVIDIA_IMEX_CHANNELS=0
                 chmod +x ${scriptRunNode}
-                ${srunCmd}
+                ${srunCmd} 2>&1 | tee ${slurmOutputFile}
             """.stripIndent()
             pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
             Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
             Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}", numRetries: 3,)
-            sh(label: "Print slurm_launch.sh script", script: "cat ${scriptLaunchDestPath}")
+            Utils.exec(pipeline, script: "cat ${scriptLaunchDestPath}")
         }
+
         stage('Run Test') {
-            def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
             Utils.exec(
                 pipeline,
                 timeout: false,
                 script: Utils.sshUserCmd(
                     remote,
-                    """bash ${scriptLaunch}"""
+                    "\"bash ${scriptLaunch}\""
                 )
             )
         }
+
+        echo "Finished test stage execution."
     }
 } finally {
     uploadResults(pipeline, cluster, jobUID, stageName)
-    cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
+
+    stage('Clean up SLURM Resources') {
+        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+    }
 }
 }
 
@@ -573,6 +646,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
         } else {
             sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
             if (noResultIfSuccess && !stageIsFailed) {
+                // Clean up the workspace
+                sh """
+                    env | sort
+                    pwd && ls -alh
+                    rm -rf ./*
+                """
+
+                echo "Finished test stage execution."
                 return
             }
             echo "noResultIfSuccess: ${noResultIfSuccess}, stageIsFailed: ${stageIsFailed}"
@@ -593,14 +674,16 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
                 "${UPLOAD_PATH}/test-results/"
             )
             junit(testResults: "${stageName}/results*.xml")
-
-            // Clean up the workspace
-            sh """
-                env | sort
-                pwd && ls -alh
-                rm -rf ./*
-            """
         }
+
+        // Clean up the workspace
+        sh """
+            env | sort
+            pwd && ls -alh
+            rm -rf ./*
+        """
+
+        echo "Finished test stage execution."
     }
 }
 
@@ -643,7 +726,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_SLURM}]
                   tty: true
                   resources:
                     requests:
@@ -661,7 +744,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TMP}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_BUILD}]
                   volumeMounts:
                   - name: sw-tensorrt-pvc
                     mountPath: "/mnt/sw-tensorrt-pvc"
@@ -727,7 +810,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
         containerConfig = """
                 - name: trt-llm
                   image: ${image}
-                  command: ['sleep', ${POD_TIMEOUT_SECONDS}]
+                  command: ['sleep', ${POD_TIMEOUT_SECONDS_TEST}]
                   tty: true
                   resources:
                     requests:
@@ -2167,10 +2250,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
             }
             echo "###### Check pip install Start ######"
             withEnv(libEnv) {
+                // Retry 2 times if timeout occurs.
                 sh "env | sort"
-                timeout(time: 30, unit: 'MINUTES') {
-                    checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
-                }
+                trtllm_utils.llmRetry(1, "checkPipInstall", {
+                    timeout(time: 30, unit: 'MINUTES') {
+                        checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+                    }
+                })
             }
             echo "###### Run LLMAPI tests Start ######"
             def config = VANILLA_CONFIG
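
trtllm_utils.llmRetry wraps the pip-install check so a timeout no longer fails the stage on the first attempt. Its implementation is not part of this diff; a rough sketch of a retry helper with that shape (the name, signature, and logging below are assumptions, not the real trtllm_utils API):

    // Hypothetical retry helper: run the closure, and on failure retry up to `retries` more times.
    def retryBlock = { int retries, String label, Closure body ->
        int attempts = retries + 1
        for (int i = 1; i <= attempts; i++) {
            try {
                body()
                return
            } catch (Exception e) {
                echo "${label} attempt ${i}/${attempts} failed: ${e.message}"
                if (i == attempts) {
                    throw e
                }
            }
        }
    }

Under that reading, llmRetry(1, "checkPipInstall", { ... }) would give the timed-out check one additional attempt before the stage fails.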
