@@ -32,8 +32,11 @@ trt_llm_release_perf_sanity_test:
 - perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20]
 - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20]
 - perf/test_perf.py::test_perf[whisper_large_v3-bench-float16-input_output_len:128,20]
+ # llama_v3.1_8b_instruct
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]

 # Test list validation
@@ -58,7 +61,10 @@ trt_llm_release_perf_sanity_test:
 # E2E gptManagerBenchmark IFB
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-static_batching-plugin_ifb-float16-bs:8+64-input_output_len:128,128+512,32]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-cppmanager-exe-plugin_ifb-bfloat16-gwp:0.0-input_output_len:128,128+512,32]
+ # llama_v3.1_8b
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:128,128]
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-input_output_len:512,32]
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-streaming-bfloat16-input_output_len:128,128]
@@ -77,8 +83,11 @@ trt_llm_release_perf_sanity_test:
 - '*l20*'
 - '*h20*'
 tests:
+ # llama_v3.1_8b_instruct_fp8
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:fp8]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,32-quant:fp8]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32]
 - perf/test_perf.py::test_perf[llama_v3.1_nemotron_nano_8b_fp8-bench-pytorch-float8-maxbs:512-maxnt:5000-input_output_len:5000,500-reqs:8-con:1]
@@ -101,9 +110,12 @@ trt_llm_release_perf_sanity_test:
 tests:
 - perf/test_perf.py::test_perf[t5-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
 - perf/test_perf.py::test_perf[flan_t5_large-bench-float16-maxbs:1-input_output_len:128,20-gpus:2]
+ # llama_v3.1_8b_instruct
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128-quant:int8-gpus:2]
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128-gpus:2]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
 - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
 - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:2]
@@ -128,7 +140,7 @@ trt_llm_release_perf_sanity_test:
 - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
 - perf/test_perf.py::test_perf[llama_v3.2_1b-bench-bfloat16-input_output_len:128,128-quant:fp8-gpus:2]
 - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-float16-input_output_len:128,128-quant:fp8-gpus:2]
- - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1-bench-pytorch-float16-input_output_len:128,128-quant:fp8-gpus:2]
+ - perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:2]

 # Tests for systems with 2+ GPUs and high memory
 - condition:
@@ -161,7 +173,10 @@ trt_llm_release_perf_sanity_test:
 - '*l40s*'
 - '*h20*'
 tests:
+ # llama_v3.1_70b
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:128,128-reqs:10-gpus:4]
 - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128-gpus:4]
 - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
@@ -198,9 +213,12 @@ trt_llm_release_perf_sanity_test:
 - '*l40s*'
 - '*h20*'
 tests:
+ # llama_v3.1_70b
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
- - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+ # pytorch backend
+ - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
 - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
@@ -222,8 +240,13 @@ trt_llm_release_perf_sanity_test:
 - '*h20*'

 tests:
+ # llama_v3.1_70b
+ # trt backend
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:128,128-quant:fp8-gpus:8]
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:512,32-quant:fp8-gpus:8]
+ # llama_v3.3_70b_instruct_fp8
+ # pytorch backend
 - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:8]

 - condition:
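
For readers unfamiliar with this file's layout: each top-level entry in the test list pairs a condition block (the hardware a group of tests may run on, expressed as ranges plus GPU-name wildcards) with a tests block listing test_perf cases, and the comments added by this diff group those cases by model and backend. Below is a minimal sketch of one such block; the gte threshold and wildcard value are illustrative assumptions, not copied from the file.

# Sketch of one test-list block (illustrative values, not from the file)
- condition:
    ranges:
      system_gpu_count:
        gte: 8            # assumed threshold: run only with at least 8 GPUs
    wildcards:
      gpu:
      - '*h20*'           # run only when the GPU name matches this pattern
  tests:
  # llama_v3.1_70b
  # pytorch backend
  - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]

Reading the bracketed IDs in this diff, each appears to encode its configuration as dash-separated fields: model name, harness (bench or cppmanager), an optional backend tag (pytorch; the TRT backend seems to be implied when the tag is absent, which is what the "trt backend" comments mark), dtype, and then options such as maxbs (max batch size), input_output_len, reqs (request count), con (concurrency), quant, and gpus. That reading is inferred from the names in the diff rather than documented here.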