
Commit 426d490

Test new Slurm script

Signed-off-by: Yanchao Lu <[email protected]>
Parent: 1207d9e

3 files changed: +110, -46 lines

jenkins/L0_Test.groovy (58 additions, 16 deletions)
@@ -1,4 +1,4 @@
-@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _
+@Library(['bloom-jenkins-shared-lib@dev-yanchaol-slurm', 'trtllm-jenkins-shared-lib@main']) _
 
 import java.lang.InterruptedException
 import groovy.transform.Field
@@ -7,7 +7,6 @@ import groovy.json.JsonOutput
 import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.Constants
 import com.nvidia.bloom.CloudManager
-import com.nvidia.bloom.KubernetesManager
 import com.nvidia.bloom.SlurmConfig
 import com.nvidia.bloom.SlurmCluster
 import com.nvidia.bloom.SlurmPartition
@@ -211,6 +210,13 @@ def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
             sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
             sh "ls ${stageName}/ -all"
         })
+
+        // Clean up the workspace
+        sh """
+        env | sort
+        pwd && ls -alh
+        rm -rf ./*
+        """
     }
 }
 
@@ -219,8 +225,11 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
     SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
 
-    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
-    def nodeSecret = CloudManager.createNode(nodeName)
+    // Create a unique suffix for the node name and workspace
+    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
+    def nodeName = "${cluster.host}-test-${customSuffix}"
+    def customWorkspace = "/tmp/${nodeName}"
+    def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
@@ -263,12 +272,31 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
+        node(nodeName) {
+            sh """
+            env | sort
+            pwd && ls -alh
+            ls -alh ${env.WORKSPACE}
+            ls -alh ${env.WORKSPACE_TMP}
+            ls -alh ${env.PWD}
+            """
+        }
+
+        def dockerArgs = "--gpus ${gpuCount} " +
+            "--cap-add=SYS_ADMIN " +
+            "--ipc=host " +
+            "--security-opt seccomp=unconfined " +
+            "-u root:root " +
+            "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
+            "-v /tmp/ccache:${CCACHE_DIR}:rw " +
+            "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
+            "--cap-add syslog"
 
         if (partition.clusterName == "dlcluster") {
             dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
         }
-        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
+
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
         executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
     } else {
         echo "The node does not come online in 2 hours, terminating the job"
@@ -796,7 +824,7 @@ def echoNodeAndGpuInfo(pipeline, stageName)
 
 def runLLMDocBuild(pipeline, config)
 {
-    // Step 1: cloning tekit source code
+    // Step 1: cloning source code
     sh "pwd && ls -alh"
    sh "env | sort"
     // allow to checkout from forked repo, svc_tensorrt needs to have access to the repo, otherwise clone will fail
@@ -1241,13 +1269,17 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
 
 def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
 {
-    // Step 1: create LLM_ROOT dir
-    sh "pwd && ls -alh"
-    // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
-    // So that it can work with multiple job running in same node
-    sh "rm -rf ./*"
+    // Step 1: create LLM_ROOT dir and clean up the workspace
     def llmRootConfig = "${LLM_ROOT}${config}"
-    sh "mkdir ${llmRootConfig}"
+    sh """
+    env | sort
+    pwd && ls -alh
+    rm -rf ./*
+    mkdir ${llmRootConfig}
+    ls -alh ${env.WORKSPACE}
+    ls -alh ${env.WORKSPACE_TMP}
+    ls -alh ${env.PWD}
+    """
 
     def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
     def llmSrc = "${llmPath}/TensorRT-LLM/src"
@@ -1562,6 +1594,13 @@ def runLLMTestlistOnPlatform(pipeline, platform, testList, config=VANILLA_CONFIG
         sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
         sh "ls ${stageName}/ -all"
     })
+
+    // Clean up the workspace
+    sh """
+    env | sort
+    pwd && ls -alh
+    rm -rf ./*
+    """
 }
 
 
@@ -1893,8 +1932,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     "GB200-PyTorch-1": ["gb200-unrestricted", "l0_gb200", 1, 3],
     "GB200-PyTorch-2": ["gb200-unrestricted", "l0_gb200", 2, 3],
     "GB200-PyTorch-3": ["gb200-unrestricted", "l0_gb200", 3, 3],
-    "GB200-TensorRT-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
-    "GB200-TensorRT-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
+    "GB200-PyTorch-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
+    "GB200-TensorRT-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 2],
+    "GB200-TensorRT-Post-Merge-2": ["gb200-unrestricted", "l0_gb200", 2, 2],
     "GB200-Triton-Post-Merge-1": ["gb200-unrestricted", "l0_gb200", 1, 1],
     "GB200-4_GPUs-PyTorch-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
     "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4", "l0_gb200_multi_gpus", 1, 1, 4],
@@ -2129,7 +2169,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     echo "###### Check pip install Start ######"
     withEnv(libEnv) {
         sh "env | sort"
-        checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+        timeout(time: 1, unit: 'HOURS') {
+            checkPipInstall(pipeline, "${cpu_arch}/${wheelPath}")
+        }
     }
     echo "###### Run LLMAPI tests Start ######"
     def config = VANILLA_CONFIG
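Note on the workspace change above: test nodes previously got a bare UUID name and the default workspace; they now get a per-build suffix and a dedicated workspace under /tmp (the removed TODO explains why: multiple jobs can land on the same node), and runInDockerOnNodeMultiStage has its last argument flipped to true. A minimal standalone sketch of the new naming scheme; env.BUILD_TAG and the cluster host are stubbed here (an assumption, so the script runs outside Jenkins):

    // Stub of the standard Jenkins BUILD_TAG ("jenkins-<job>-<build#>") for local runs.
    def env = [BUILD_TAG: 'jenkins-LLM-L0_Test-1234']
    // Same expression as the commit: tag plus a 6-char UUID fragment, lowercased.
    String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll('-', '').substring(0, 6)}".toLowerCase()
    def nodeName = "cluster-host-test-${customSuffix}"   // cluster.host stubbed
    def customWorkspace = "/tmp/${nodeName}"
    println nodeName         // e.g. cluster-host-test-jenkins-llm-l0_test-1234-8f3a1c
    println customWorkspace  // unique per build, so concurrent jobs on one node do not collide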

tests/integration/test_lists/test-db/l0_gb200.yml (51 additions, 30 deletions)
@@ -21,22 +21,10 @@ l0_gb200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
@@ -67,15 +55,21 @@ l0_gb200:
   - test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]
   - test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
-  - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (120)
-  - unittest/_torch -k "modeling_llama"
+  - unittest/_torch/attention
+  - unittest/_torch/compilation
+  - unittest/_torch/debugger
+  - unittest/_torch/executor
+  - unittest/_torch/misc
+  - unittest/_torch/modules
+  - unittest/_torch/multimodal
+  - unittest/_torch/sampler
+  - unittest/_torch/speculative
+  - unittest/_torch/thop
+  - unittest/_torch/modeling -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_deepseek"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
   - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
-  - unittest/_torch/speculative/test_eagle3.py
-  - unittest/_torch/speculative/test_kv_cache_reuse.py
-  - unittest/_torch/speculative/test_dynamic_spec_decode.py
 - condition:
     ranges:
       system_gpu_count:
@@ -87,7 +81,7 @@ l0_gb200:
       linux_distribution_name: ubuntu*
       cpu: aarch64
   terms:
-    stage: pre_merge
+    stage: post_merge
     backend: tensorrt
   tests:
   # ------------- TRT tests ---------------
@@ -103,20 +97,47 @@ l0_gb200:
   - unittest/llmapi/test_llm_quant.py
   - unittest/trt/functional/test_fp4_gemm.py
 - condition:
-    ranges:
-      system_gpu_count:
-        gte: 1
-        lte: 1
-    wildcards:
-      gpu:
-      - '*gb200*'
-      linux_distribution_name: ubuntu*
-      cpu: aarch64
-  terms:
-    stage: post_merge
-    backend: triton
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb200*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+  terms:
+    stage: post_merge
+    backend: triton
   tests:
   # ------------- Triton tests ---------------
   - triton_server/test_triton.py::test_llava[llava]
   - triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning]
   - triton_server/test_triton.py::test_gpt_2b_ib_lora[gpt-2b-ib-lora]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb200*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+  terms:
+    stage: post_merge
+    backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
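For context, every block in these test-db files pairs a condition (GPU-count range, hardware wildcards, stage, backend) with a test list; the nvfp4 variants moved above now run only when the post_merge/pytorch condition matches. A rough Groovy sketch of that matching, written as an assumption about the semantics rather than the repo's actual matcher:

    // Hedged sketch, not the real test-db matcher: evaluate one condition block
    // against a runtime context like the GB200 single-GPU post-merge PyTorch run.
    boolean conditionMatches(Map cond, Map ctx) {
        def r = cond.ranges.system_gpu_count
        boolean inRange = ctx.system_gpu_count >= r.gte && ctx.system_gpu_count <= r.lte
        // wildcards use shell-style globs; translate '*' into the regex '.*'
        boolean gpuOk   = cond.wildcards.gpu.any { ctx.gpu ==~ it.replace('*', '.*') }
        boolean termsOk = ctx.stage == cond.terms.stage && ctx.backend == cond.terms.backend
        return inRange && gpuOk && termsOk
    }

    def cond = [ranges: [system_gpu_count: [gte: 1, lte: 1]],
                wildcards: [gpu: ['*gb200*']],
                terms: [stage: 'post_merge', backend: 'pytorch']]
    def ctx  = [system_gpu_count: 1, gpu: 'gb200-unrestricted', stage: 'post_merge', backend: 'pytorch']
    assert conditionMatches(cond, ctx)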

tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml (1 addition, 0 deletions)
@@ -66,3 +66,4 @@ l0_gb200_multi_gpus:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=TRTLLM-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)
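The new Qwen3 entry carries the same TIMEOUT (N) suffix the pre-merge list used before this commit (the dropped "unittest/_torch ... TIMEOUT (120)" entry above), which reads as a per-entry limit in minutes. A hypothetical parser for splitting the annotation from the test name, purely illustrative and not the repo's actual implementation:

    // Hypothetical parser (assumption) for the "TIMEOUT (N)" suffix in test-db entries.
    def parseEntry(String line) {
        def m = (line =~ /^(.*?)\s+TIMEOUT \((\d+)\)\s*$/)
        return m ? [name: m[0][1], timeoutMinutes: m[0][2] as int]
                 : [name: line.trim(), timeoutMinutes: null]
    }

    def e = parseEntry('accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4_4gpus[latency_moe_trtllm_eagle3] TIMEOUT (90)')
    assert e.timeoutMinutes == 90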
