@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
169
169
170
170
Utils . exec(pipeline, script : " echo Slurm job ID: ${ slurmJobID} " )
171
171
172
- Utils . exec(pipeline, script : " echo Sleeping to allow slurm job termination ; sleep 30" )
172
+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job completion ; sleep 30" )
173
173
174
174
Utils . exec(
175
175
pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
179
179
)
180
180
)
181
181
182
+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
183
+
182
184
Utils . exec(
183
185
pipeline,
184
186
script : Utils . sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
228
230
)
229
231
)
230
232
233
+ Utils . exec(pipeline, script : " echo Sleeping to allow Slurm job termination; sleep 30" )
234
+
231
235
Utils . exec(
232
236
pipeline,
233
237
script : Utils . sshUserCmd(
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
354
358
}
355
359
356
360
if (CloudManager . isNodeOnline(nodeName)) {
357
- def dockerGpuOption = " "
361
+ def dockerGPUOption = " "
358
362
359
363
node(nodeName) {
360
364
sh """
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
373
377
374
378
// Dynamically set GPU arguments based on environment variables
375
379
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
380
+ // It's intentional to check NV_GPU first.
376
381
dockerGPUOption = sh(script : """
377
382
if [ -n "\$ NV_GPU" ]; then
378
383
echo "--gpus '\\ "device=\$ NV_GPU\\ "'"
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
392
397
" -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
393
398
" -v /tmp/ccache:${ CCACHE_DIR} :rw " +
394
399
" -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
395
- " --cap-add syslog "
400
+ " --cap-add=SYSLOG "
396
401
397
402
echo " Final dockerArgs: ${ dockerArgs} "
398
403
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
522
527
]. join(" " )
523
528
524
529
def srunCmd = SlurmConfig . generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
525
- scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
526
- // TODO: check if the tee always returns 0
530
+ def scriptLaunchDestPath = Utils . createTempLocation(pipeline, " ./slurm_launch.sh" )
527
531
def scriptContent = """ #!/bin/bash
532
+ set -o pipefail
528
533
export jobWorkspace=$jobWorkspace
529
534
export tarName=$tarName
530
535
export llmTarfile=$llmTarfile
0 commit comments