
Commit 03632a6

test: organize perf cases and add missing perflab cases in qa test list (#6283)

Signed-off-by: ruodil <[email protected]>

1 parent 971be1f, commit 03632a6

File tree: 5 files changed (+196, -54 lines)


tests/integration/defs/perf/pytorch_model_config.py

Lines changed: 8 additions & 5 deletions

@@ -56,8 +56,8 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 models with MTP speculative decoding
         {
             'patterns': [
-                'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:fp8-reqs:10-ep:4-gpus:8',
-                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-quant:nvfp4-reqs:10-ep:4-tp:8-gpus:8'
+                'deepseek_r1-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-reqs:10-ep:4-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:1-maxnt:8192-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8'
             ],
             'config': {
                 'enable_attention_dp': True,
@@ -71,8 +71,8 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 models with large batch sizes and cuda graph padding
         {
             'patterns': [
-                'deepseek_r1-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8',
-                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-quant:nvfp4-reqs:49152-con:3072-ep:8-gpus:8'
+                'deepseek_r1_fp8-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-gpus:8',
+                'deepseek_r1_nvfp4-bench-pytorch-float16-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-gpus:8'
             ],
             'config': {
                 'enable_attention_dp': True,
@@ -85,7 +85,7 @@ def get_model_yaml_config(model_label: str,
         # DeepSeek R1 model with specific batch size 128
         {
             'patterns':
-                'deepseek_r1-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-quant:fp8-reqs:5120-con:1024-ep:8-gpus:8',
+                'deepseek_r1_fp8-bench-pytorch-float16-maxbs:128-maxnt:1127-input_output_len:1000,2000-reqs:5120-con:1024-ep:8-gpus:8',
             'config': {
                 'enable_attention_dp': True,
                 'cuda_graph_config': {
@@ -154,6 +154,9 @@ def get_model_yaml_config(model_label: str,
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
                 'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
+                'llama_v3.1_405b_instruct_fp4',
+                'llama_v4_scout_17b_16e_instruct_fp4',
+                'llama_v4_maverick_17b_128e_instruct_fp8'
             ],
             'config': {
                 'use_cuda_graph':
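Each entry above pairs a 'patterns' key (a single benchmark label or a list of labels) with a 'config' dict of extra LLM API options; this commit drops the stale quant: fields from the DeepSeek R1 labels, switches the FP8 variants to the deepseek_r1_fp8 key, and registers three new FP4/FP8 Llama labels. As a rough illustration of how such entries could be applied, here is a minimal sketch that assumes substring matching of patterns against the benchmark label and a simple merge of matching configs; the helper name and the matching rule are assumptions, not the actual get_model_yaml_config code.

# Illustrative sketch only: assumes each entry's 'patterns' value (a string or
# a list of strings) is matched by substring against the full benchmark label,
# and that the 'config' dicts of all matching entries are merged.
def select_extra_config(model_label, config_entries):
    merged = {}
    for entry in config_entries:
        patterns = entry['patterns']
        if isinstance(patterns, str):
            patterns = [patterns]
        if any(pattern in model_label for pattern in patterns):
            merged.update(entry['config'])
    return merged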

tests/integration/defs/perf/test_perf.py

Lines changed: 13 additions & 6 deletions

@@ -55,6 +55,8 @@
     "llama_v3.3_70b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
     "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
+    "llama_v3.1_405b_instruct_fp8":
+    "llama-3.1-model/Llama-3.1-405B-Instruct-FP8",
     "llama_v3.1_405b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
@@ -71,11 +73,14 @@
     "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8",
     "llama_v4_scout_17b_16e_instruct":
     "llama4-models/Llama-4-Scout-17B-16E-Instruct",
+    "llama_v4_scout_17b_16e_instruct_fp8":
+    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8",
+    "llama_v4_scout_17b_16e_instruct_fp4":
+    "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4",
     "llama_v4_maverick_17b_128e_instruct":
     "llama4-models/Llama-4-Maverick-17B-128E-Instruct",
     "llama_v4_maverick_17b_128e_instruct_fp8":
-    "llama4-models/Llama-4-Maverick-17B-128E-Instruct-FP8",
-    # "llama_30b": "llama-models/llama-30b-hf",
+    "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8",
     "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
     "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
     "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8",
@@ -1257,14 +1262,16 @@ def get_trtllm_bench_command(self, engine_dir):
         #use default yaml config
         if self._config.backend == "pytorch":
             import yaml
+            pytorch_config_path = os.path.join(engine_dir,
+                                               "extra-llm-api-config.yml")
+            if not os.path.exists(pytorch_config_path):
+                os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
             config = get_model_yaml_config(self._config.to_string(),
                                            lora_dirs=self.lora_dirs)
             print_info(f"pytorch model config: {config}")
-            with open('extra-llm-api-config.yml', 'w') as f:
+            with open(pytorch_config_path, 'w') as f:
                 yaml.dump(config, f, default_flow_style=False)
-            benchmark_cmd += [
-                f"--extra_llm_api_options=extra-llm-api-config.yml"
-            ]
+            benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
         return benchmark_cmd

     def get_gpt_manager_runtime_benchmark_command(self, engine_dir, bs,
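The last hunk moves the generated PyTorch config out of the current working directory: extra-llm-api-config.yml is now written under engine_dir and its full path is passed via --extra_llm_api_options. A self-contained sketch of that flow is below; the standalone function name is mine, since in the repository this logic lives inline in get_trtllm_bench_command and logs through print_info.

import os
import yaml

def write_extra_llm_api_config(engine_dir, config, benchmark_cmd):
    # Write the YAML under engine_dir instead of the current working directory.
    pytorch_config_path = os.path.join(engine_dir, "extra-llm-api-config.yml")
    if not os.path.exists(pytorch_config_path):
        os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True)
    with open(pytorch_config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)
    # Pass the path under engine_dir rather than a bare filename so the
    # benchmark resolves the same file no matter where it is launched from.
    benchmark_cmd += [f"--extra_llm_api_options={pytorch_config_path}"]
    return benchmark_cmd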

tests/integration/test_lists/qa/trt_llm_release_perf_cluster_test.yml

Lines changed: 19 additions & 2 deletions

@@ -39,8 +39,6 @@ trt_llm_release_perf_cluster_test:
         gte: 4
   tests:
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:4]
-  - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:128,128-gpus:4]
-  - perf/test_perf.py::test_perf[qwen_14b_chat-bench-float16-input_output_len:512,32-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15b-bench-float16-input_output_len:512,200-gpus:4]
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:512-input_output_len:128,128-ep:4-tp:4-gpus:4]
@@ -55,14 +53,33 @@ trt_llm_release_perf_cluster_test:
   tests:
   #- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-gpus:8]
   #- perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-gpus:8]
+  #llama_v3.3_nemotron_super_49b
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-bfloat16-input_output_len:500,2000-quant:fp8-con:250-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-bfloat16-input_output_len:500,2000-con:250-gpus:8]
+  #llama_v3.3_70b_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #llama_v3.1_405b_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.1_405b_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #llama_v4_scout_17b_16e_instruct_fp4
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:128,128-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-input_output_len:512,32-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:500,2000-tp:8-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-float4-maxbs:1024-maxnt:4096-input_output_len:1000,1000-tp:8-gpus:8]
+  #mixtral_8x22b_v0.1
   - perf/test_perf.py::test_perf[mixtral_8x22b_v0.1-bench-float16-input_output_len:512,512-quant:fp8-tp:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
+  #deepseek_r1_fp8
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
   - perf/test_perf.py::test_perf[deepseek_r1_fp8-bench-pytorch-float8-maxbs:384-maxnt:1536-input_output_len:1000,2000-reqs:49152-con:3072-ep:8-tp:8-gpus:8] #max throughput test
+  #deepseek_r1_nvfp4
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8]
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
   - perf/test_perf.py::test_perf[deepseek_r1_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:1000,2000-reqs:10-ep:4-tp:8-gpus:8] #min latency test
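Every entry in this list encodes its benchmark parameters in the test id itself: the model key comes first, followed by dash-separated flags (bench, pytorch, streaming, a dtype) and key:value options such as maxbs, maxnt, input_output_len, reqs, con, ep, tp and gpus. The snippet below is a rough way to split such a label into its parts, shown only to illustrate the naming scheme; it is not the parsing code actually used by test_perf.py.

# Illustrative label splitter for ids like
# 'deepseek_r1_fp8-bench-pytorch-float8-maxbs:512-input_output_len:128,128-ep:8-tp:8-gpus:8'
def parse_perf_label(label):
    parts = label.split('-')
    parsed = {'model': parts[0], 'flags': [], 'options': {}}
    for part in parts[1:]:
        if ':' in part:
            key, value = part.split(':', 1)
            parsed['options'][key] = value
        else:
            parsed['flags'].append(part)
    return parsed

# parse_perf_label(...) on the id above yields model 'deepseek_r1_fp8',
# flags ['bench', 'pytorch', 'float8'], and options
# {'maxbs': '512', 'input_output_len': '128,128', 'ep': '8', 'tp': '8', 'gpus': '8'}.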

tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml

Lines changed: 25 additions & 2 deletions

@@ -32,8 +32,11 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20]
   - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20]
+  #llama_v3.1_8b_instruct
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]

   # Test list validation
@@ -58,7 +61,10 @@ trt_llm_release_perf_sanity_test:
   # E2E gptManagerBenchmark IFB
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32]
+  #llama_v3.1_8b
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128]
@@ -77,8 +83,11 @@ trt_llm_release_perf_sanity_test:
       - '*l20*'
       - '*h20*'
   tests:
+  #llama_v3.1_8b_instruct_fp8
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:fp8]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32-quant:fp8]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32]
   - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
@@ -101,9 +110,12 @@ trt_llm_release_perf_sanity_test:
   tests:
   - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
   - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
+  #llama_v3.1_8b_instruct
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
@@ -128,7 +140,7 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
   - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
-  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-quant:fp8-gpus:2]
+  - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

   # Tests for systems with 2+ GPUs and high memory
   - condition:
@@ -161,7 +173,10 @@ trt_llm_release_perf_sanity_test:
       - '*l40s*'
       - '*h20*'
   tests:
+  #llama_v3.1_70b
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
   - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
@@ -198,9 +213,12 @@ trt_llm_release_perf_sanity_test:
       - '*l40s*'
       - '*h20*'
   tests:
+  #llama_v3.1_70b
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
-  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+  #pytorch backend
+  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
@@ -222,8 +240,13 @@ trt_llm_release_perf_sanity_test:
       - '*h20*'

   tests:
+  #llama_v3.1_70b
+  #trt backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+  #llama_v3.3_70b_instruct_fp8
+  #pytorch backend
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]

   - condition:
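Both QA lists gate each group of tests behind a condition block, for example a minimum system_gpu_count (gte: 4) or GPU-name wildcards such as '*l20*' and '*h20*'. The sketch below shows one way such a gate could be evaluated; the schema fields and the fnmatch-style matching are assumptions about the test-list format, not code from the repository.

from fnmatch import fnmatch

def condition_matches(condition, system_gpu_count, gpu_name):
    # Hypothetical evaluator: 'gte' is a lower bound on the GPU count and
    # 'wildcards' holds glob patterns matched against the GPU name.
    min_gpus = condition.get('ranges', {}).get('system_gpu_count', {}).get('gte', 0)
    if system_gpu_count < min_gpus:
        return False
    patterns = condition.get('wildcards', {}).get('gpu', [])
    return not patterns or any(fnmatch(gpu_name.lower(), p) for p in patterns)

# e.g. condition_matches({'wildcards': {'gpu': ['*l20*', '*h20*']}}, 8, 'NVIDIA H20')
# returns True, because 'nvidia h20' matches '*h20*'.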
