1
- @Library ([' bloom-jenkins-shared-lib@dev-yanchaol-slurm-output ' , ' trtllm-jenkins-shared-lib@main' ]) _
1
+ @Library ([' bloom-jenkins-shared-lib@main ' , ' trtllm-jenkins-shared-lib@main' ]) _
2
2
3
3
import java.lang.InterruptedException
4
4
import groovy.transform.Field
@@ -245,7 +245,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
245
245
246
246
Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ jenkinsSetupPath} ${ remote.user} @${ remote.host} :~/bloom/scripts/${ nodeName} -slurm_jenkins_agent_setup.sh" ,)
247
247
248
- sh( label : " Print slurm_jenkins_agent_setup.sh script " , script : " cat ${ jenkinsSetupPath} " )
248
+ Utils . exec(pipeline , script : " cat ${ jenkinsSetupPath} " )
249
249
250
250
Utils . exec(
251
251
pipeline,
@@ -297,8 +297,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
297
297
}
298
298
}
299
299
} finally {
300
- cleanUpNodeResources (pipeline, cluster, nodeName )
300
+ Utils . exec (pipeline, script : " echo Sleeping to allow docker stop; sleep 30 " )
301
301
CloudManager . destroyNode(nodeName)
302
+ Utils . exec(pipeline, script : " echo Sleeping to allow node destruction; sleep 30" )
303
+ cleanUpNodeResources(pipeline, cluster, nodeName)
302
304
}
303
305
}
304
306
@@ -321,13 +323,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
321
323
String customSuffix = " ${ env.BUILD_TAG} -${ UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)} " . toLowerCase()
322
324
def jobUID = " ${ cluster.host} -multi_node_test-${ customSuffix} "
323
325
324
- sh(
325
- label : " Print env for debugging" ,
326
- script : """
327
- env | sort
328
- pwd && ls -alh
329
- """
330
- )
326
+ Utils . exec(pipeline, script : " env | sort && pwd && ls -alh" )
331
327
332
328
try {
333
329
// Run ssh command to start node in desired cluster via SLURM
@@ -371,7 +367,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
371
367
def scriptRunLocalPath = " ${ llmSrcLocal} /jenkins/scripts/slurm_run.sh"
372
368
Utils . exec(pipeline, script : " chmod +x ${ scriptRunLocalPath} " , returnStdout : true )
373
369
Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ scriptRunLocalPath} ${ remote.user} @${ remote.host} :${ scriptRunNode} " ,)
374
- sh( label : " Print slurm_run.sh script " , script : " cat ${ scriptRunLocalPath} " )
370
+ Utils . exec(pipeline , script : " cat ${ scriptRunLocalPath} " )
375
371
376
372
// Upload waives.txt to Frontend node
377
373
def waivesListLocalPath = " ${ llmSrcLocal} /tests/integration/test_lists/waives.txt"
@@ -429,7 +425,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
429
425
pipeline. writeFile(file : scriptLaunchDestPath, text : scriptContent)
430
426
Utils . exec(pipeline, script : " chmod +x ${ scriptLaunchDestPath} " , returnStdout : true )
431
427
Utils . exec(pipeline, script : " sshpass -p '${ remote.passwd} ' scp -r -p ${ COMMON_SSH_OPTIONS} ${ scriptLaunchDestPath} ${ remote.user} @${ remote.host} :${ scriptLaunch} " ,)
432
- sh( label : " Print slurm_launch.sh script " , script : " cat ${ scriptLaunchDestPath} " )
428
+ Utils . exec(pipeline , script : " cat ${ scriptLaunchDestPath} " )
433
429
}
434
430
stage(' Run Test' ) {
435
431
def scriptLaunch = " ${ jobWorkspace} /slurm_launch.sh"
@@ -2169,10 +2165,19 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
2169
2165
libEnv + = [" LD_LIBRARY_PATH+nvrtc=/usr/local/lib/python${ pyver} /dist-packages/nvidia/cuda_nvrtc/lib" ]
2170
2166
}
2171
2167
echo " ###### Check pip install Start ######"
2168
+ def sleepTime = 35
2172
2169
withEnv(libEnv) {
2173
- sh " env | sort"
2174
- timeout(time : 30 , unit : ' MINUTES' ) {
2175
- checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2170
+ retry(1 ) {
2171
+ try {
2172
+ sh " env | sort"
2173
+ timeout(time : 30 , unit : ' MINUTES' ) {
2174
+ sleep(sleepTime * 60 )
2175
+ sleepTime = 1
2176
+ checkPipInstall(pipeline, " ${ cpu_arch} /${ wheelPath} " )
2177
+ }
2178
+ } catch (org.jenkinsci.plugins.workflow.steps.TimeoutStepExecution.ExceededTimeout e) {
2179
+ error " Timeout occurred, retrying..."
2180
+ }
2176
2181
}
2177
2182
}
2178
2183
echo " ###### Run LLMAPI tests Start ######"
0 commit comments