@@ -79,6 +79,11 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
+    # Request accuracy rate metrics
+    mean_request_ar: float
+    median_request_ar: float
+    std_request_ar: float
+    percentiles_request_ar: list[tuple[float, float]]


 async def get_request(
@@ -131,7 +136,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
+) -> tuple[BenchmarkMetrics, list[int], list[float]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -142,6 +147,7 @@ def calculate_metrics(
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
+    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -167,9 +173,24 @@ def calculate_metrics(
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
+
+            # Request accuracy rate:
+            #   num_generated_tokens / (decode_iteration + 1)
+            decode_iter = outputs[i].decode_iteration
+            if decode_iter >= 0:
+                # The first token comes from the context phase, so count
+                # output_len - 1 generated tokens whenever the request
+                # produced more than one token.
+                num_generated_tokens = (output_len - 1
+                                        if output_len > 1 else output_len)
+                request_ars.append(num_generated_tokens / (decode_iter + 1))
+            else:
+                request_ars.append(0.0)
+
             completed += 1
         else:
             actual_output_lens.append(0)
+            request_ars.append(0.0)

     if goodput_config_dict:
         valid_metrics = []
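A worked instance of the formula above (numbers are hypothetical; `decode_iteration` is assumed to be the backend's 0-based count of decode steps for the request, which the `>= 0` guard implies). An AR above 1.0 means a decode iteration emitted more than one token on average, as happens with speculative decoding:

```python
# Hypothetical request: 120 output tokens over 50 decode iterations
# (decode_iteration == 49, 0-based). The first token is attributed to
# the context phase, leaving 119 generated tokens.
output_len = 120
decode_iteration = 49

num_generated_tokens = output_len - 1 if output_len > 1 else output_len
request_ar = num_generated_tokens / (decode_iteration + 1)
print(request_ar)  # 2.38, i.e. roughly 2.4 tokens accepted per iteration
```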
@@ -228,8 +249,13 @@ def calculate_metrics(
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
+        mean_request_ar=np.mean(request_ars or 0),
+        median_request_ar=np.median(request_ars or 0),
+        std_request_ar=np.std(request_ars or 0),
+        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
+                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens
+    return metrics, actual_output_lens, request_ars


 async def benchmark(
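Note that the new aggregates reuse the `or 0` fallback of the neighboring fields, so a run with zero successful requests aggregates over the scalar `0` instead of raising on an empty list. A self-contained sketch with made-up values:

```python
import numpy as np

request_ars = [2.38, 2.05, 1.76, 2.61]  # hypothetical per-request ARs
selected_percentiles = [50.0, 99.0]

mean_request_ar = np.mean(request_ars or 0)
percentiles_request_ar = [(p, np.percentile(request_ars or 0, p))
                          for p in selected_percentiles]
print(mean_request_ar)  # 2.2
for p, v in percentiles_request_ar:
    print(f"P{p:g} request AR: {v:.4f}")  # P50 2.2150, P99 2.6031
```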
@@ -403,7 +429,7 @@ async def limited_request_func(request_func_input, streaming, pbar,
     # Close the session
     await session.close()

-    metrics, actual_output_lens = calculate_metrics(
+    metrics, actual_output_lens, request_ars = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -431,6 +457,10 @@ async def limited_request_func(request_func_input, streaming, pbar,
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
+    print("{:<40} {:<10.4f}".format("Mean Request AR:",
+                                    metrics.mean_request_ar))
+    print("{:<40} {:<10.4f}".format("Median Request AR:",
+                                    metrics.median_request_ar))

     result = {
         "duration": benchmark_duration,
@@ -443,12 +473,16 @@ async def limited_request_func(request_func_input, streaming, pbar,
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
+        "mean_request_ar": metrics.mean_request_ar,
+        "median_request_ar": metrics.median_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "request_ars": request_ars,
+        "decode_iterations": [output.decode_iteration for output in outputs],
     }

     def process_one_metric(
@@ -534,11 +568,15 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
+        "mean_request_ar", "median_request_ar", "std_request_ar"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    ignored_metrics = [
+        "ttfts", "itls", "generated_texts", "errors", "request_ars",
+        "decode_iterations"
+    ]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
@@ -762,7 +800,8 @@ def main(args: argparse.Namespace):
         # Remove fields with too many data points
         for field in [
                 "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
+                "generated_texts", "errors", "request_ars",
+                "decode_iterations"
         ]:
             if field in result_json:
                 del result_json[field]
@@ -963,11 +1002,11 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default="ttft,tpot,itl,request_ar",
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
-        "Default value is \"ttft,tpot,itl\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
+        "Default value is \"ttft,tpot,itl,request_ar\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
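Finally, a stand-in parser showing how the updated default behaves; only the flag names and the `--percentile-metrics` default are taken from this diff, the rest (including the `--metric-percentiles` default) is illustrative:

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--percentile-metrics", type=str,
                    default="ttft,tpot,itl,request_ar")
parser.add_argument("--metric-percentiles", type=str, default="99")

args = parser.parse_args([])
print(args.percentile_metrics.split(","))
# ['ttft', 'tpot', 'itl', 'request_ar'] -- request_ar is now reported
# alongside the latency metrics by default.
```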