@@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
-        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
 
         Utils.exec(
             pipeline,
@@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
@@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
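
The 30-second sleep added in both cleanup helpers gives Slurm time to finish tearing the job down before the follow-up SSH cleanup command runs. A fixed sleep is a heuristic; a polling loop such as the sketch below (not part of this change; `wait_for_slurm_termination` is a hypothetical helper, and `$1` stands in for the pipeline's `${slurmJobID}`) would wait only as long as needed:

```bash
# Hypothetical alternative to the fixed 30 s sleep: poll squeue until the
# job has left the queue, with an upper bound on the wait.
wait_for_slurm_termination() {
    local job_id=$1 timeout=${2:-120} waited=0
    # `squeue -h -j <id>` prints nothing once the job is gone.
    while [ -n "$(squeue -h -j "$job_id" 2>/dev/null)" ]; do
        if [ "$waited" -ge "$timeout" ]; then
            echo "Job $job_id still present after ${timeout}s" >&2
            return 1
        fi
        sleep 5
        waited=$((waited + 5))
    done
}
```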
@@ -348,7 +352,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerGpuOption = ""
+        def dockerGPUOption = ""
 
         node(nodeName) {
             sh """
@@ -367,6 +371,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
             // Dynamically set GPU arguments based on environment variables
             // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+            // It's intentional to check NV_GPU first.
             dockerGPUOption = sh(script: """
                 if [ -n "\$NV_GPU" ]; then
                     echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -386,7 +391,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
                 "-v /tmp/ccache:${CCACHE_DIR}:rw " +
                 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
-                "--cap-add syslog"
+                "--cap-add=SYSLOG"
 
             echo "Final dockerArgs: ${dockerArgs}"
 
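
Docker normalizes capability names (case-insensitive, with or without the `CAP_` prefix), so `--cap-add syslog` and `--cap-add=SYSLOG` grant the same capability; the new spelling simply matches the documented upper-case form. A quick way to confirm the capability inside a container:

```bash
# CapEff in /proc/self/status should include CAP_SYSLOG (bit 34) when
# the flag is passed.
docker run --rm --cap-add=SYSLOG ubuntu:22.04 grep CapEff /proc/self/status
```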
@@ -516,9 +521,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         ].join(" ")
 
         def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-        scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-        // TODO: check if the tee always returns 0
+        def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
         def scriptContent = """#!/bin/bash
+            set -o pipefail
             export jobWorkspace=$jobWorkspace
             export tarName=$tarName
             export llmTarfile=$llmTarfile
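
`set -o pipefail` settles the removed TODO: `tee` itself always exits 0, so by default a pipeline such as `some_command | tee log` reports success even when the left-hand command fails. With pipefail, the pipeline's exit status is the last non-zero status in the pipe:

```bash
#!/bin/bash
# tee exits 0, so the pipeline "succeeds" despite the failing producer.
false | tee /tmp/out.log; echo "default:  exit=$?"   # prints exit=0

# With pipefail the producer's failure propagates to the pipeline status.
set -o pipefail
false | tee /tmp/out.log; echo "pipefail: exit=$?"   # prints exit=1
```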