@@ -77,6 +77,11 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
+    # Request accuracy rate metrics
+    mean_request_ar: float
+    median_request_ar: float
+    std_request_ar: float
+    percentiles_request_ar: list[tuple[float, float]]


 async def get_request(
@@ -129,7 +134,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
+) -> tuple[BenchmarkMetrics, list[int], list[float]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -140,6 +145,7 @@ def calculate_metrics(
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
+    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -165,9 +171,22 @@ def calculate_metrics(
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
+
+            # Request accuracy rate:
+            #   num_generated_tokens / (decode_iteration + 1)
+            # The first token comes from prefill rather than a decode
+            # iteration, so it is excluded unless it is the only token.
+            decode_iter = outputs[i].decode_iteration
+            if decode_iter >= 0:
+                num_generated_tokens = (output_len - 1
+                                        if output_len > 1 else output_len)
+                request_ars.append(num_generated_tokens / (decode_iter + 1))
+            else:
+                request_ars.append(0.0)
+
             completed += 1
         else:
             actual_output_lens.append(0)
+            request_ars.append(0.0)

     if goodput_config_dict:
         valid_metrics = []
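In effect, the request accuracy rate is tokens produced per decode iteration: it stays at 1.0 for plain autoregressive decoding and rises above 1.0 when speculative decoding accepts draft tokens. A minimal standalone sketch of the same formula (the helper name is ours, and it assumes `decode_iteration` counts decode steps from 0, matching the `+ 1` above):

    def request_accuracy_rate(output_len: int, decode_iteration: int) -> float:
        """Tokens generated per decode iteration (mirrors the diff's logic)."""
        if decode_iteration < 0:
            return 0.0  # backend reported no decode-iteration info
        # The first token comes from prefill, so drop it when more than
        # one token was generated.
        num_generated = output_len - 1 if output_len > 1 else output_len
        return num_generated / (decode_iteration + 1)

    assert request_accuracy_rate(10, 2) == 3.0  # 9 tokens in 3 decode steps
    assert request_accuracy_rate(5, 4) == 0.8   # fewer tokens than steps
    assert request_accuracy_rate(0, -1) == 0.0  # no decode info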
@@ -226,8 +245,13 @@ def calculate_metrics(
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
+        mean_request_ar=np.mean(request_ars or 0),
+        median_request_ar=np.median(request_ars or 0),
+        std_request_ar=np.std(request_ars or 0),
+        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
+                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens
+    return metrics, actual_output_lens, request_ars


 async def benchmark(
@@ -401,7 +425,7 @@ async def limited_request_func(request_func_input, streaming, pbar,
     # Close the session
     await session.close()

-    metrics, actual_output_lens = calculate_metrics(
+    metrics, actual_output_lens, request_ars = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -429,6 +453,10 @@ async def limited_request_func(request_func_input, streaming, pbar,
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
+    print("{:<40} {:<10.4f}".format("Mean Request AR:",
+                                    metrics.mean_request_ar))
+    print("{:<40} {:<10.4f}".format("Median Request AR:",
+                                    metrics.median_request_ar))

     result = {
         "duration": benchmark_duration,
@@ -441,12 +469,16 @@ async def limited_request_func(request_func_input, streaming, pbar,
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
+        "mean_request_ar": metrics.mean_request_ar,
+        "median_request_ar": metrics.median_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "request_ars": request_ars,
+        "decode_iterations": [output.decode_iteration for output in outputs],
     }

     def process_one_metric(
@@ -532,11 +564,12 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
+        "mean_request_ar", "median_request_ar", "std_request_ar"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors", "request_ars", "decode_iterations"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
@@ -760,7 +793,7 @@ def main(args: argparse.Namespace):
         # Remove fields with too many data points
         for field in [
                 "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
+                "generated_texts", "errors", "request_ars", "decode_iterations"
         ]:
             if field in result_json:
                 del result_json[field]
@@ -961,11 +994,11 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default="ttft,tpot,itl,request_ar",
         help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
-        "Default value is \"ttft,tpot,itl\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
+        "Default value is \"ttft,tpot,itl,request_ar\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
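Assuming this diff targets vLLM's serving benchmark script (script path and model name below are illustrative), the new metric is exercised like any other percentile metric:

    python benchmarks/benchmark_serving.py \
        --backend vllm --model meta-llama/Llama-3.1-8B-Instruct \
        --percentile-metrics ttft,tpot,itl,request_ar \
        --metric-percentiles 50,90,99

Since request_ar is now part of the default --percentile-metrics list, the flag can also be omitted; passing it explicitly just documents intent.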