Skip to content

Commit f89dfd7

Browse files
committed
Improve CI stability
Signed-off-by: Yanchao Lu <[email protected]>
1 parent cd36bbe commit f89dfd7

File tree

1 file changed

+20
-15
lines changed

1 file changed

+20
-15
lines changed

jenkins/L0_Test.groovy

Lines changed: 20 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
@Library(['bloom-jenkins-shared-lib@dev-yanchaol-slurm-output', 'trtllm-jenkins-shared-lib@main']) _
1+
@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
22

33
import java.lang.InterruptedException
44
import groovy.transform.Field
@@ -245,7 +245,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
245245

246246
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
247247

248-
sh(label: "Print slurm_jenkins_agent_setup.sh script", script: "cat ${jenkinsSetupPath}")
248+
Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
249249

250250
Utils.exec(
251251
pipeline,
@@ -297,8 +297,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
297297
}
298298
}
299299
} finally {
300-
cleanUpNodeResources(pipeline, cluster, nodeName)
300+
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
301301
CloudManager.destroyNode(nodeName)
302+
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
303+
cleanUpNodeResources(pipeline, cluster, nodeName)
302304
}
303305
}
304306

@@ -321,13 +323,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
321323
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
322324
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
323325

324-
sh(
325-
label: "Print env for debugging",
326-
script: """
327-
env | sort
328-
pwd && ls -alh
329-
"""
330-
)
326+
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
331327

332328
try {
333329
// Run ssh command to start node in desired cluster via SLURM
@@ -371,7 +367,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
371367
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
372368
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
373369
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
374-
sh(label: "Print slurm_run.sh script", script: "cat ${scriptRunLocalPath}")
370+
Utils.exec(pipeline, script: "cat ${scriptRunLocalPath}")
375371

376372
// Upload waives.txt to Frontend node
377373
def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
@@ -429,7 +425,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
429425
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
430426
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
431427
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
432-
sh(label: "Print slurm_launch.sh script", script: "cat ${scriptLaunchDestPath}")
428+
Utils.exec(pipeline, script: "cat ${scriptLaunchDestPath}")
433429
}
434430
stage('Run Test') {
435431
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
@@ -2169,10 +2165,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
21692165
libEnv += ["LD_LIBRARY_PATH+nvrtc=/usr/local/lib/python${pyver}/dist-packages/nvidia/cuda_nvrtc/lib"]
21702166
}
21712167
echo "###### Check pip install Start ######"
2168+
def sleepTime = 20
21722169
withEnv(libEnv) {
2173-
sh "env | sort"
2174-
timeout(time: 30, unit: 'MINUTES') {
2175-
checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
2170+
retry(2) {
2171+
try {
2172+
sh "env | sort"
2173+
timeout(time: 30, unit: 'MINUTES') {
2174+
sleep(sleepTime * 60)
2175+
sleepTime = 1
2176+
checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
2177+
}
2178+
} catch (org.jenkinsci.plugins.workflow.steps.TimeoutStepExecution.ExceededTimeout e) {
2179+
error "Timeout occurred, retrying..."
2180+
}
21762181
}
21772182
}
21782183
echo "###### Run LLMAPI tests Start ######"

0 commit comments

Comments
 (0)