Skip to content

Commit b2ba107

Browse files
chzblych authored and greg-kwasniewski1 committed
[None][fix] Fix a typo in the Slurm CI codes (NVIDIA#7485)
Signed-off-by: Yanchao Lu <[email protected]>
1 parent b1701b4 commit b2ba107

File tree

2 files changed

+11
-6
lines changed

2 files changed

+11
-6
lines changed

jenkins/BuildDockerImage.groovy

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -684,7 +684,7 @@ pipeline {
684684
}
685685
cmd += imageKeyToTag.values().join(" ")
686686
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
687-
sh cmd
687+
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
688688
}
689689
}
690690
}

jenkins/L0_Test.groovy

Lines changed: 10 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
169169

170170
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
171171

172-
Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
172+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
173173

174174
Utils.exec(
175175
pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
179179
)
180180
)
181181

182+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
183+
182184
Utils.exec(
183185
pipeline,
184186
script: Utils.sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
228230
)
229231
)
230232

233+
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
234+
231235
Utils.exec(
232236
pipeline,
233237
script: Utils.sshUserCmd(
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
354358
}
355359

356360
if (CloudManager.isNodeOnline(nodeName)) {
357-
def dockerGpuOption = ""
361+
def dockerGPUOption = ""
358362

359363
node(nodeName) {
360364
sh """
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
373377

374378
// Dynamically set GPU arguments based on environment variables
375379
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
380+
// It's intentional to check NV_GPU first.
376381
dockerGPUOption = sh(script: """
377382
if [ -n "\$NV_GPU" ]; then
378383
echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
392397
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
393398
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
394399
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
395-
"--cap-add syslog"
400+
"--cap-add=SYSLOG"
396401

397402
echo "Final dockerArgs: ${dockerArgs}"
398403

@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
522527
].join(" ")
523528

524529
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
525-
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
526-
// TODO: check if the tee always returns 0
530+
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
527531
def scriptContent = """#!/bin/bash
532+
set -o pipefail
528533
export jobWorkspace=$jobWorkspace
529534
export tarName=$tarName
530535
export llmTarfile=$llmTarfile

0 commit comments

Comments
 (0)