@@ -79,6 +79,11 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
+    # Request accuracy rate metrics
+    mean_request_ar: float
+    median_request_ar: float
+    std_request_ar: float
+    percentiles_request_ar: list[tuple[float, float]]
 
 
 async def get_request(
@@ -131,7 +136,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
+) -> tuple[BenchmarkMetrics, list[int], list[float]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -142,6 +147,7 @@ def calculate_metrics(
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
+    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -167,9 +173,22 @@ def calculate_metrics(
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
+
+            # Request accuracy rate: num_generated_tokens / (decode_iteration + 1)
+            decode_iter = outputs[i].decode_iteration
+            if decode_iter >= 0:
+                # The first output token comes from the prefill step, so only
+                # the remaining tokens are credited to decode iterations.
+                num_generated_tokens = output_len - 1 if output_len > 1 else output_len
+                request_ar = num_generated_tokens / (decode_iter + 1)
+                request_ars.append(request_ar)
+            else:
+                request_ars.append(0.0)
+
             completed += 1
         else:
             actual_output_lens.append(0)
+            request_ars.append(0.0)
 
     if goodput_config_dict:
         valid_metrics = []
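
For intuition, here is a minimal standalone sketch of the per-request metric computed in the hunk above. The helper name is hypothetical, and it assumes what the diff suggests: `decode_iteration` counts completed decode steps (so `decode_iteration + 1` is the iteration count) and the first output token is attributed to prefill. Under speculative decoding, a rate above 1.0 would indicate that accepted draft tokens landed alongside the target token each step.

```python
def request_accuracy_rate(output_len: int, decode_iteration: int) -> float:
    """Tokens generated per decode iteration for one request (sketch)."""
    if decode_iteration < 0:
        return 0.0  # no decode-iteration info reported for this request
    # The first output token comes from the prefill step, so it is not
    # credited to any decode iteration.
    num_generated = output_len - 1 if output_len > 1 else output_len
    return num_generated / (decode_iteration + 1)

# 129 output tokens after 64 decode iterations (decode_iteration == 63):
# 128 generated tokens / 64 iterations = 2.0 tokens per step.
assert request_accuracy_rate(129, 63) == 2.0
```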
@@ -228,8 +247,13 @@ def calculate_metrics(
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
+        mean_request_ar=np.mean(request_ars or 0),
+        median_request_ar=np.median(request_ars or 0),
+        std_request_ar=np.std(request_ars or 0),
+        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
+                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens
+    return metrics, actual_output_lens, request_ars
 
 
 async def benchmark(
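
One subtlety in the aggregation above: the `request_ars or 0` guard, copied from the existing metrics, makes numpy reduce the scalar `0` when the list is empty (e.g. when every request failed) instead of warning or raising on an empty array. A quick demonstration of that behavior:

```python
import numpy as np

failed_run: list[float] = []               # no successful requests
print(np.mean(failed_run or 0))            # 0.0 (np.mean([]) would warn and yield nan)
print(np.percentile(failed_run or 0, 99))  # 0.0 (np.percentile([], 99) would raise)

normal_run = [0.8, 1.0, 2.5]
print(np.median(normal_run or 0))          # 1.0
```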
@@ -403,7 +427,7 @@ async def limited_request_func(request_func_input, streaming, pbar,
     # Close the session
     await session.close()
 
-    metrics, actual_output_lens = calculate_metrics(
+    metrics, actual_output_lens, request_ars = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -431,6 +455,10 @@ async def limited_request_func(request_func_input, streaming, pbar,
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
+    print("{:<40} {:<10.4f}".format("Mean Request AR:",
+                                    metrics.mean_request_ar))
+    print("{:<40} {:<10.4f}".format("Median Request AR:",
+                                    metrics.median_request_ar))
 
     result = {
         "duration": benchmark_duration,
@@ -443,12 +471,16 @@ async def limited_request_func(request_func_input, streaming, pbar,
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
+        "mean_request_ar": metrics.mean_request_ar,
+        "median_request_ar": metrics.median_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "request_ars": request_ars,
+        "decode_iterations": [output.decode_iteration for output in outputs],
     }
 
     def process_one_metric(
@@ -534,11 +566,12 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
+        "mean_request_ar", "median_request_ar", "std_request_ar"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors", "request_ars", "decode_iterations"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
@@ -762,7 +795,7 @@ def main(args: argparse.Namespace):
         # Remove fields with too many data points
         for field in [
                 "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
+                "generated_texts", "errors", "request_ars", "decode_iterations"
         ]:
             if field in result_json:
                 del result_json[field]
@@ -963,11 +996,11 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default="ttft,tpot,itl,request_ar",
        help="Comma-separated list of selected metrics to report percentils. "
        "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
-        "Default value is \"ttft,tpot,itl\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
+        "Default value is \"ttft,tpot,itl,request_ar\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
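
A closing note on how the two flags interact: `--percentile-metrics` selects which metrics get percentile rows, while `--metric-percentiles` (defined just below in the source; its default is assumed here to be "99") selects the percentiles themselves. A hypothetical sketch of the row names the new default would produce, assuming the `p<percentile>_<metric>` naming used for the existing metrics:

```python
# Hypothetical illustration of how the two CLI defaults combine.
percentile_metrics = "ttft,tpot,itl,request_ar".split(",")  # --percentile-metrics
metric_percentiles = [float(p) for p in "99".split(",")]    # --metric-percentiles (assumed)

for metric in percentile_metrics:
    for p in metric_percentiles:
        label = str(int(p)) if p.is_integer() else str(p)
        print(f"p{label}_{metric}")  # p99_ttft, p99_tpot, p99_itl, p99_request_ar
```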