[None][fix] Fix a typo in the Slurm CI codes (NVIDIA#7485)

chzblych · greg-kwasniewski1 · commit b2ba107292a1 · 2025-09-04T04:22:09.000-07:00
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy
@@ -684,7 +684,7 @@ pipeline {
                         }
                         cmd += imageKeyToTag.values().join(" ")
                         withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-                            sh cmd
+                            trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
                         }
                     }
                 }
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
-        Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
 
         Utils.exec(
             pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
             )
         )
 
+        Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
         Utils.exec(
             pipeline,
             script: Utils.sshUserCmd(
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
             }
 
             if (CloudManager.isNodeOnline(nodeName)) {
-                def dockerGpuOption = ""
+                def dockerGPUOption = ""
 
                 node(nodeName) {
                     sh """
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
                     // Dynamically set GPU arguments based on environment variables
                     // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+                    // It's intentional to check NV_GPU first.
                     dockerGPUOption = sh(script: """
                         if [ -n "\$NV_GPU" ]; then
                             echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                     "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
                     "-v /tmp/ccache:${CCACHE_DIR}:rw " +
                     "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
-                    "--cap-add syslog"
+                    "--cap-add=SYSLOG"
 
                 echo "Final dockerArgs: ${dockerArgs}"
 
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 ].join(" ")
 
                 def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-                scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-                // TODO: check if the tee always returns 0
+                def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
                 def scriptContent = """#!/bin/bash
+                    set -o pipefail
                     export jobWorkspace=$jobWorkspace
                     export tarName=$tarName
                     export llmTarfile=$llmTarfile

Original file line number	Diff line number	Diff line change
`@@ -684,7 +684,7 @@ pipeline {`
`684`	`684`	`}`
`685`	`685`	`cmd += imageKeyToTag.values().join(" ")`
`686`	`686`	`withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {`
`687`		`- sh cmd`
	`687`	`+ trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)`
`688`	`688`	`}`
`689`	`689`	`}`
`690`	`690`	`}`