From e89513989e16401169915c676e4acab07f377ac2 Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Wed, 3 Sep 2025 00:02:10 +0800 Subject: [PATCH 1/4] [None][fix] Fix a typo in the Slurm CI codes Signed-off-by: Yanchao Lu --- jenkins/BuildDockerImage.groovy | 2 +- jenkins/L0_Test.groovy | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index 64e03de476a..6d0cdef7cad 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -684,7 +684,7 @@ pipeline { } cmd += imageKeyToTag.values().join(" ") withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) { - sh cmd + trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)) } } } diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 197ad8d28eb..63851651b89 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -348,7 +348,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p } if (CloudManager.isNodeOnline(nodeName)) { - def dockerGpuOption = "" + def dockerGPUOption = "" node(nodeName) { sh """ @@ -367,6 +367,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p // Dynamically set GPU arguments based on environment variables // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html + // It's intentional to check NV_GPU first. dockerGPUOption = sh(script: """ if [ -n "\$NV_GPU" ]; then echo "--gpus '\\"device=\$NV_GPU\\"'" @@ -386,7 +387,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " + "-v /tmp/ccache:${CCACHE_DIR}:rw " + "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " + - "--cap-add syslog" + "--cap-add SYSLOG" echo "Final dockerArgs: ${dockerArgs}" @@ -516,9 +517,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL ].join(" ") def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode) - scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh") - // TODO: check if the tee always returns 0 + def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh") def scriptContent = """#!/bin/bash + set -o pipefail export jobWorkspace=$jobWorkspace export tarName=$tarName export llmTarfile=$llmTarfile @@ -536,6 +537,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL export NVIDIA_IMEX_CHANNELS=0 chmod +x ${scriptRunNode} ${srunCmd} 2>&1 | tee ${slurmOutputFile} + exit ${PIPESTATUS[0]} """.stripIndent() pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent) Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true) From d2cea417bf61adf5c58c3b65ea1f2ed7b0171caf Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Wed, 3 Sep 2025 00:24:42 +0800 Subject: [PATCH 2/4] [None][fix] Fix a typo in the Slurm CI codes Signed-off-by: Yanchao Lu --- jenkins/BuildDockerImage.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index 6d0cdef7cad..96f0bf7fbc4 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -684,7 +684,7 @@ pipeline { } cmd += imageKeyToTag.values().join(" ") withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) { - trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)) + trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200) } } } From 2efba0369bb208545067be3ff02bd6d4b87c423b Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Wed, 3 Sep 2025 00:27:02 +0800 Subject: [PATCH 3/4] [None][fix] Fix a typo in the Slurm CI codes Signed-off-by: Yanchao Lu --- jenkins/L0_Test.groovy | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 63851651b89..19a72887cd9 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -387,7 +387,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " + "-v /tmp/ccache:${CCACHE_DIR}:rw " + "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " + - "--cap-add SYSLOG" + "--cap-add=SYSLOG" echo "Final dockerArgs: ${dockerArgs}" From d1cf2e6a3cf2ffe25017d28a06eb59806409f3b0 Mon Sep 17 00:00:00 2001 From: Yanchao Lu Date: Wed, 3 Sep 2025 12:56:12 +0800 Subject: [PATCH 4/4] [None][fix] Fix a typo in the Slurm CI codes Signed-off-by: Yanchao Lu --- jenkins/L0_Test.groovy | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 19a72887cd9..5feccc076d1 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -163,7 +163,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}") - Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30") + Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30") Utils.exec( pipeline, @@ -173,6 +173,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo ) ) + Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30") + Utils.exec( pipeline, script: Utils.sshUserCmd( @@ -222,6 +224,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St ) ) + Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30") + Utils.exec( pipeline, script: Utils.sshUserCmd( @@ -537,7 +541,6 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL export NVIDIA_IMEX_CHANNELS=0 chmod +x ${scriptRunNode} ${srunCmd} 2>&1 | tee ${slurmOutputFile} - exit ${PIPESTATUS[0]} """.stripIndent() pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent) Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)