Commit 2a147c4

test: add llama_v3.3_70b_cases in perf test (#6035)

Signed-off-by: ruodil <[email protected]>
Parent: 2504aa5

4 files changed, +30 -0 lines changed

tests/integration/defs/perf/pytorch_model_config.py
Lines changed: 20 additions & 0 deletions

@@ -143,6 +143,26 @@ def get_model_yaml_config(model_label: str,
                     'backend': 'TRTLLM'
                 }
             }
+        },
+        # Llama-v3.3 models with fp8 quantization
+        {
+            'patterns': [
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
+            ],
+            'config': {
+                'use_cuda_graph':
+                True,
+                'cuda_graph_padding_enabled':
+                True,
+                'cuda_graph_batch_sizes': [
+                    1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048,
+                    4096, 8192
+                ]
+            }
         }
     ]

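Note on how such an entry is consumed: a minimal sketch of a patterns-to-config lookup, assuming get_model_yaml_config matches the test label against each entry's 'patterns' by substring and merges the matching 'config'. The helper name lookup_config and the matching rule are assumptions for illustration, not the file's actual code.

    # Minimal sketch, not the real get_model_yaml_config: resolve a perf-test
    # label to its extra config by substring match (assumed matching rule).
    from typing import Any, Dict, List

    PATTERN_CONFIGS: List[Dict[str, Any]] = [
        {
            'patterns': [
                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
            ],
            'config': {
                'use_cuda_graph': True,
                'cuda_graph_padding_enabled': True,
                'cuda_graph_batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128],
            },
        },
    ]

    def lookup_config(model_label: str) -> Dict[str, Any]:
        merged: Dict[str, Any] = {}
        for entry in PATTERN_CONFIGS:
            # A label matches if it contains any of the entry's pattern strings.
            if any(pattern in model_label for pattern in entry['patterns']):
                merged.update(entry['config'])
        return merged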
tests/integration/defs/perf/test_perf.py
Lines changed: 1 addition & 0 deletions

@@ -54,6 +54,7 @@
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
         "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
+    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp4":
         "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",

tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml
Lines changed: 2 additions & 0 deletions

@@ -202,6 +202,8 @@ trt_llm_release_perf_sanity_test:
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
 - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:8]
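Each list entry is a pytest node ID whose bracketed parameter encodes the case as hyphen-separated fields. A rough decoder, with field meanings inferred from the names; this parser is an illustration, not test-harness code.

    # Hypothetical sketch: decode the hyphen-separated perf-test label used in
    # these lists (field meanings are inferred from the names).
    def parse_label(label: str) -> dict:
        fields = {'flags': []}
        for part in label.split('-'):
            if ':' in part:
                key, value = part.split(':', 1)
                fields[key] = value
            else:
                fields['flags'].append(part)  # e.g. model name, 'bench', dtype
        return fields

    print(parse_label('llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8'))
    # {'flags': ['llama_v3.3_70b', 'bench', 'pytorch', 'bfloat16'],
    #  'input_output_len': '500,2000', 'gpus': '8'}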

tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
Lines changed: 7 additions & 0 deletions

@@ -295,6 +295,11 @@ trt_llm_release_perf_test:
 - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4]
 - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
 - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:4]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4]
+- perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4]

 # FP8 specific tests
 - condition:

@@ -357,6 +362,8 @@ trt_llm_release_perf_test:
 - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
+- perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
 - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
 - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
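Because each entry is a pytest node ID, a single new case can presumably be run in isolation. A sketch; the working directory under tests/integration/defs is an assumption.

    # Minimal sketch: run one of the new cases by its pytest node ID
    # (assumes the current directory is the tests/integration/defs tree).
    import pytest

    pytest.main([
        'perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]',
    ])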
