@@ -72,7 +72,7 @@ def register_request_perf_item(self, request_perf_item: PerfItemTuple):
         if request_perf_item.response_is_final:
             self.num_complete = self.num_complete + 1
 
-    def generate_statistics_summary(self) -> None:
+    def generate_statistics_summary(self, max_draft_tokens: int) -> None:
         """Generate summary statistics from internally stored statistics.
 
         Returns:
@@ -90,42 +90,62 @@ def generate_statistics_summary(self) -> None:
         intertoken_avg_latencies = []
         output_tokens = []
-        request_acceptance = []
         total_decoding_iterations = 0
         ttft_times = []
         last_queue_time = 0.0
         queue_time_total = 0.0
 
+        num_draft_tokens = []
+        num_accepted_draft_tokens = []
+        draft_acceptance_rate = []
+        acceptance_length = []
+
         for entry in self.requests.values():
             start_time = min(entry.start_timestamp, start_time)
             end_time = max(entry.end_timestamp, end_time)
             last_queue_time = max(entry.start_timestamp, last_queue_time)
-            request_ar = entry.num_generated_tokens / (entry.decode_iteration +
-                                                       1)
 
             request_latencies.append(entry.end_to_end_latency)
             generation_latencies.append(entry.generation_time)
             generation_throughputs.append(entry.generation_token_throughput)
             ttft_times.append(entry.time_to_first_token)
             intertoken_avg_latencies.append(entry.intertoken_latency)
-            request_acceptance.append(request_ar)
             output_throughput_per_user.append(entry.output_token_throughput)
             total_decoding_iterations += entry.decode_iteration + 1
 
             output_tokens.append(entry.num_total_output_tokens)
             total_input_tokens += entry.num_input_tokens
 
-        global_acceptance_rate = sum(output_tokens) / total_decoding_iterations
+            # For speculative decoding, we need to track the number of draft tokens per request and the number of accepted draft tokens per request
+            if max_draft_tokens > 0:
+                num_draft_tokens.append(max_draft_tokens *
+                                        (entry.decode_iteration + 1))
+                num_accepted_draft_tokens.append(entry.num_total_output_tokens -
+                                                 entry.decode_iteration - 1)
+                draft_acceptance_rate.append(
+                    float(num_accepted_draft_tokens[-1]) /
+                    float(num_draft_tokens[-1]))
+                acceptance_length.append(entry.num_total_output_tokens /
+                                         (entry.decode_iteration + 1))
+
+        global_acceptance_length = sum(
+            output_tokens) / total_decoding_iterations
         queue_time_total = last_queue_time - start_time
-        percentile_request_accept = PercentileStats.from_iterable(
-            request_acceptance) if request_acceptance else None
+
+        num_draft_tokens_percentiles = PercentileStats.from_iterable(
+            num_draft_tokens) if num_draft_tokens else None
+        num_accepted_draft_tokens_percentiles = PercentileStats.from_iterable(
+            num_accepted_draft_tokens) if num_accepted_draft_tokens else None
+        draft_acceptance_rate_percentiles = PercentileStats.from_iterable(
+            draft_acceptance_rate) if draft_acceptance_rate else None
+        acceptance_length_percentiles = PercentileStats.from_iterable(
+            acceptance_length) if acceptance_length else None
 
         stats = BenchmarkStatistics(
             num_requests=num_requests,
             total_latency_ns=end_time - start_time,
             total_output_tokens=sum(output_tokens),
             total_input_tokens=total_input_tokens,
-            acceptance_rate=global_acceptance_rate,
             request_latency_percentiles=PercentileStats.from_iterable(
                 request_latencies),
             tpot_percentiles=PercentileStats.from_iterable(
@@ -139,7 +159,12 @@ def generate_statistics_summary(self) -> None:
                 generation_latencies),
             token_percentiles=PercentileStats.from_iterable(output_tokens),
             issue_rate_ns=queue_time_total / num_requests,
-            acceptance_percentiles=percentile_request_accept,
+            acceptance_length=global_acceptance_length,
+            num_draft_tokens_percentiles=num_draft_tokens_percentiles,
+            num_accepted_draft_tokens_percentiles=
+            num_accepted_draft_tokens_percentiles,
+            draft_acceptance_rate_percentiles=draft_acceptance_rate_percentiles,
+            acceptance_length_percentiles=acceptance_length_percentiles,
         )
 
         return stats
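To make the per-request bookkeeping above concrete: each request runs `entry.decode_iteration + 1` target-model steps, every step proposes up to `max_draft_tokens` draft tokens and always yields one non-draft token, so the accepted drafts are the total output tokens minus the number of steps. A minimal sketch with made-up numbers (illustrative only, not taken from a benchmark run):

```python
# Hypothetical request: 3 decode iterations, 4 draft tokens proposed per
# iteration, 10 output tokens in total (illustrative values only).
max_draft_tokens = 4
iterations = 3                 # entry.decode_iteration + 1
total_output_tokens = 10       # entry.num_total_output_tokens

num_draft = max_draft_tokens * iterations             # 12 draft tokens proposed
num_accepted = total_output_tokens - iterations       # 7 drafts accepted
draft_acceptance_rate = num_accepted / num_draft      # ~0.58
acceptance_length = total_output_tokens / iterations  # ~3.33 tokens per step

assert num_accepted == 7
assert round(acceptance_length, 2) == 3.33
```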
@@ -164,12 +189,13 @@ def __init__(self,
             logger (Logger): A logger for logging.
             streaming (bool, optional): Streaming benchmark used. Defaults to False.
         """
-        self.raw_statistics = statistics
-        self.statistics = statistics.generate_statistics_summary()
         self.dataset_metadata = dataset_metadata
         self.rt_cfg = rt_cfg
         self.logger = logger
         self.kwargs = kwargs
+        self.raw_statistics = statistics
+        self.statistics = statistics.generate_statistics_summary(
+            self.get_max_draft_len())
         self.streaming = streaming
 
     @staticmethod
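Note the ordering change in `__init__`: `self.kwargs` is now assigned before `generate_statistics_summary()` is called, because the new `get_max_draft_len()` helper (added at the end of this diff) reads `self.kwargs["speculative_config"]`.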
@@ -415,9 +441,22 @@ def get_statistics_dict(self) -> Dict[str, Any]:
             stats_dict["decoding_stats"] = {
                 "mode":
                 decoding_mode,
-                "acceptance_percentiles":
-                self.statistics.acceptance_percentiles.model_dump(
+                "num_draft_tokens_percentiles":
+                self.statistics.num_draft_tokens_percentiles.model_dump(
+                    exclude_none=True, by_alias=True, mode='json')
+                if self.statistics.num_draft_tokens_percentiles else None,
+                "num_accepted_draft_tokens_percentiles":
+                self.statistics.num_accepted_draft_tokens_percentiles.
+                model_dump(exclude_none=True, by_alias=True, mode='json') if
+                self.statistics.num_accepted_draft_tokens_percentiles else None,
+                "draft_acceptance_rate_percentiles":
+                self.statistics.draft_acceptance_rate_percentiles.model_dump(
+                    exclude_none=True, by_alias=True, mode='json')
+                if self.statistics.draft_acceptance_rate_percentiles else None,
+                "acceptance_length_percentiles":
+                self.statistics.acceptance_length_percentiles.model_dump(
                     exclude_none=True, by_alias=True, mode='json')
+                if self.statistics.acceptance_length_percentiles else None
             }
             # Dataset metadata
             stats_dict["dataset"] = self.dataset_metadata.model_dump(by_alias=True,
@@ -557,21 +596,61 @@ def report_statistics(self) -> None:
         decoding_stats = ""
         if decoding is not None:
             decoding = stats_dict["decoding_stats"]
-            acc = decoding["acceptance_percentiles"]
-            acc_stats = "\n".join(
-                f"[AR] {key.upper():<7}: {acc[key]:.2f}" for key in
-                ["minimum", "maximum", "average", "p50", "p90", "p95", "p99"])
-
-            decoding_stats = (
-                "===========================================================\n"
-                f"= DECODING STATISTICS ({decoding['mode']})\n"
-                "===========================================================\n"
-                "\n"
-                "-- Acceptance Rate Details --------------------------------\n\n"
-                "\n"
-                f"{acc_stats}"
-                f"\n"
-                "===========================================================\n")
+            if self.get_max_draft_len() > 0:
+                num_draft_tokens = decoding["num_draft_tokens_percentiles"]
+                num_draft_tokens_stats = "\n".join(
+                    f"[DT] {key.upper():<7}: {num_draft_tokens[key]:.2f}"
+                    for key in [
+                        "minimum", "maximum", "average", "p50", "p90", "p95",
+                        "p99"
+                    ])
+
+                num_accepted_draft_tokens = decoding[
+                    "num_accepted_draft_tokens_percentiles"]
+                num_accepted_draft_tokens_stats = "\n".join(
+                    f"[ADT] {key.upper():<7}: {num_accepted_draft_tokens[key]:.2f}"
+                    for key in [
+                        "minimum", "maximum", "average", "p50", "p90", "p95",
+                        "p99"
+                    ])
+
+                draft_acceptance_rate = decoding[
+                    "draft_acceptance_rate_percentiles"]
+                draft_acceptance_rate_stats = "\n".join(
+                    f"[DAR] {key.upper():<7}: {draft_acceptance_rate[key]:.2f}"
+                    for key in [
+                        "minimum", "maximum", "average", "p50", "p90", "p95",
+                        "p99"
+                    ])
+
+                acceptance_length = decoding["acceptance_length_percentiles"]
+                acceptance_length_stats = "\n".join(
+                    f"[AL] {key.upper():<7}: {acceptance_length[key]:.2f}"
+                    for key in [
+                        "minimum", "maximum", "average", "p50", "p90", "p95",
+                        "p99"
+                    ])
+
+                decoding_stats = (
+                    "===========================================================\n"
+                    f"= DECODING STATISTICS ({decoding['mode']})\n"
+                    "===========================================================\n"
+                    "\n"
+                    "-- Number of Draft Tokens Details --------------------------------\n\n"
+                    "\n"
+                    f"{num_draft_tokens_stats}"
+                    f"\n"
+                    "-- Number of Accepted Draft Tokens Details --------------------------------\n\n"
+                    f"{num_accepted_draft_tokens_stats}"
+                    f"\n"
+                    "-- Draft Acceptance Rate Details --------------------------------\n\n"
+                    f"{draft_acceptance_rate_stats}"
+                    f"\n"
+                    "-- Acceptance Length Details --------------------------------\n\n"
+                    f"{acceptance_length_stats}"
+                    f"\n"
+                    "===========================================================\n"
+                )
 
         logging_info = (f"{backend_info}"
                         f"{request_info}"
@@ -582,3 +661,12 @@ def report_statistics(self) -> None:
                         f"{self.dataset_metadata.get_summary_for_print()}")
         self.logger.info(logging_info)
         return self.statistics
+
+    def get_max_draft_len(self) -> int:
+        """Get max_draft_len from speculative_config."""
+        # Try to get from speculative_config
+        if ("speculative_config" in self.kwargs
+                and self.kwargs["speculative_config"] is not None):
+            return self.kwargs["speculative_config"].max_draft_len or 0
+
+        return 0
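For reference, a minimal illustration of what `get_max_draft_len()` relies on: a `speculative_config` entry in `kwargs` whose value exposes a `max_draft_len` attribute (the `SimpleNamespace` below is a stand-in, not the real config class):

```python
from types import SimpleNamespace

kwargs = {"speculative_config": SimpleNamespace(max_draft_len=4)}

spec = kwargs.get("speculative_config")
max_draft = (spec.max_draft_len or 0) if spec is not None else 0
assert max_draft == 4  # falls back to 0 when the config is missing, None, or has max_draft_len=None
```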