Skip to content

Commit 2558d04

Browse files
committed
[None][fix] Fix a typo in the Slurm CI codes (NVIDIA#7485)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent d32e462 commit 2558d04

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

jenkins/BuildDockerImage.groovy

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -684,7 +684,7 @@ pipeline {
684684
}
685685
cmd += imageKeyToTag.values().join(" ")
686686
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
687-
sh cmd
687+
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
688688
}
689689
}
690690
}

jenkins/L0_Test.groovy

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
163163

164164
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
165165

166-
Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
166+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
167167

168168
Utils.exec(
169169
pipeline,
@@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
173173
)
174174
)
175175

176+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
177+
176178
Utils.exec(
177179
pipeline,
178180
script: Utils.sshUserCmd(
@@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
222224
)
223225
)
224226

227+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
228+
225229
Utils.exec(
226230
pipeline,
227231
script: Utils.sshUserCmd(
@@ -348,7 +352,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
348352
}
349353

350354
if (CloudManager.isNodeOnline(nodeName)) {
351-
def dockerGpuOption = ""
355+
def dockerGPUOption = ""
352356

353357
node(nodeName) {
354358
sh """
@@ -367,6 +371,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
367371

368372
// Dynamically set GPU arguments based on environment variables
369373
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
374+
// It's intentional to check NV_GPU first.
370375
dockerGPUOption = sh(script: """
371376
if [ -n "\$NV_GPU" ]; then
372377
echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -386,7 +391,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
386391
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
387392
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
388393
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
389-
"--cap-add syslog"
394+
"--cap-add=SYSLOG"
390395

391396
echo "Final dockerArgs: ${dockerArgs}"
392397

@@ -516,9 +521,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
516521
].join(" ")
517522

518523
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
519-
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
520-
// TODO: check if the tee always returns 0
524+
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
521525
def scriptContent = """#!/bin/bash
526+
set -o pipefail
522527
export jobWorkspace=$jobWorkspace
523528
export tarName=$tarName
524529
export llmTarfile=$llmTarfile

0 commit comments

Comments
 (0)