Skip to content

Commit 5095693

Browse files
AryanBagade authored and rmccorm4 committed
feat: Add output token counter to frontend metrics (#4202)
Signed-off-by: Aryan Bagade <[email protected]>
1 parent 696647c commit 5095693

File tree

3 files changed

+186
-7
lines changed

3 files changed

+186
-7
lines changed

lib/bindings/python/src/dynamo/prometheus_names.py

Lines changed: 17 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -55,6 +55,8 @@ class frontend_service:
5555
INPUT_SEQUENCE_TOKENS = "input_sequence_tokens"
5656
# Output sequence length in tokens
5757
OUTPUT_SEQUENCE_TOKENS = "output_sequence_tokens"
58+
# Total number of output tokens generated (counter that updates in real-time)
59+
OUTPUT_TOKENS_TOTAL = "output_tokens_total"
5860
# Time to first token in seconds
5961
TIME_TO_FIRST_TOKEN_SECONDS = "time_to_first_token_seconds"
6062
# Inter-token latency in seconds
@@ -76,13 +78,21 @@ class frontend_service:
7678
MODEL_MIGRATION_LIMIT = "model_migration_limit"
7779

7880

79-
class kvbm_connector:
80-
"""KVBM connector"""
81-
82-
# KVBM connector leader
83-
KVBM_CONNECTOR_LEADER = "kvbm_connector_leader"
84-
# KVBM connector worker
85-
KVBM_CONNECTOR_WORKER = "kvbm_connector_worker"
81+
class kvbm:
82+
"""KVBM"""
83+
84+
# The number of offload blocks from device to host
85+
OFFLOAD_BLOCKS_D2H = "offload_blocks_d2h"
86+
# The number of offload blocks from host to disk
87+
OFFLOAD_BLOCKS_H2D = "offload_blocks_h2d"
88+
# The number of offload blocks from device to disk (bypassing host memory)
89+
OFFLOAD_BLOCKS_D2D = "offload_blocks_d2d"
90+
# The number of onboard blocks from host to device
91+
ONBOARD_BLOCKS_H2D = "onboard_blocks_h2d"
92+
# The number of onboard blocks from disk to device
93+
ONBOARD_BLOCKS_D2D = "onboard_blocks_d2d"
94+
# The number of matched tokens
95+
MATCHED_TOKENS = "matched_tokens"
8696

8797

8898
class kvrouter:

lib/llm/src/http/service/metrics.rs

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ pub struct Metrics {
161161
request_duration: HistogramVec,
162162
input_sequence_length: HistogramVec,
163163
output_sequence_length: HistogramVec,
164+
output_tokens_counter: IntCounterVec,
164165
time_to_first_token: HistogramVec,
165166
inter_token_latency: HistogramVec,
166167

@@ -266,6 +267,7 @@ impl Metrics {
266267
/// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
267268
/// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
268269
/// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
270+
/// - `{prefix}_output_tokens_total` - IntCounterVec for total output tokens generated (real-time updates)
269271
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
270272
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
271273
///
@@ -392,6 +394,15 @@ impl Metrics {
392394
)
393395
.unwrap();
394396

397+
let output_tokens_counter = IntCounterVec::new(
398+
Opts::new(
399+
frontend_metric_name(frontend_service::OUTPUT_TOKENS_TOTAL),
400+
"Total number of output tokens generated (updates in real-time)",
401+
),
402+
&["model"],
403+
)
404+
.unwrap();
405+
395406
// Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
396407
let (ttft_min, ttft_max, ttft_count) =
397408
parse_bucket_config("DYN_METRICS_TTFT", 0.001, 480.0, 18);
@@ -487,6 +498,7 @@ impl Metrics {
487498
request_duration,
488499
input_sequence_length,
489500
output_sequence_length,
501+
output_tokens_counter,
490502
time_to_first_token,
491503
inter_token_latency,
492504
model_total_kv_blocks,
@@ -581,6 +593,7 @@ impl Metrics {
581593
registry.register(Box::new(self.request_duration.clone()))?;
582594
registry.register(Box::new(self.input_sequence_length.clone()))?;
583595
registry.register(Box::new(self.output_sequence_length.clone()))?;
596+
registry.register(Box::new(self.output_tokens_counter.clone()))?;
584597
registry.register(Box::new(self.time_to_first_token.clone()))?;
585598
registry.register(Box::new(self.inter_token_latency.clone()))?;
586599

@@ -832,6 +845,12 @@ impl ResponseMetricCollector {
832845
return;
833846
}
834847

848+
// Increment the real-time output tokens counter
849+
self.metrics
850+
.output_tokens_counter
851+
.with_label_values(&[&self.model])
852+
.inc_by(num_tokens as u64);
853+
835854
if self.is_first_token {
836855
// NOTE: when there are multiple tokens in the first response,
837856
// we use the full response time as TTFT and ignore the ITL
@@ -1187,4 +1206,151 @@ mod tests {
11871206
);
11881207
}
11891208
}
1209+
1210+
#[test]
1211+
fn test_output_tokens_counter_increments() {
1212+
let metrics = Arc::new(Metrics::new());
1213+
let registry = prometheus::Registry::new();
1214+
metrics.register(&registry).unwrap();
1215+
1216+
let model = "test-model";
1217+
1218+
// Create response collector
1219+
let mut collector = metrics.clone().create_response_collector(model);
1220+
1221+
// Simulate first chunk (5 tokens)
1222+
collector.observe_response(100, 5);
1223+
1224+
// Verify counter incremented by 5
1225+
let counter_value = metrics
1226+
.output_tokens_counter
1227+
.with_label_values(&[model])
1228+
.get();
1229+
assert_eq!(counter_value, 5);
1230+
1231+
// Simulate second chunk (10 tokens)
1232+
collector.observe_response(100, 10);
1233+
1234+
// Verify counter incremented to 15
1235+
let counter_value = metrics
1236+
.output_tokens_counter
1237+
.with_label_values(&[model])
1238+
.get();
1239+
assert_eq!(counter_value, 15);
1240+
1241+
// Simulate third chunk (7 tokens)
1242+
collector.observe_response(100, 7);
1243+
1244+
// Verify counter incremented to 22
1245+
let counter_value = metrics
1246+
.output_tokens_counter
1247+
.with_label_values(&[model])
1248+
.get();
1249+
assert_eq!(counter_value, 22);
1250+
}
1251+
1252+
#[test]
1253+
fn test_output_tokens_counter_zero_tokens() {
1254+
let metrics = Arc::new(Metrics::new());
1255+
let registry = prometheus::Registry::new();
1256+
metrics.register(&registry).unwrap();
1257+
1258+
let model = "test-model";
1259+
let mut collector = metrics.clone().create_response_collector(model);
1260+
1261+
// Simulate chunk with zero tokens (should not increment)
1262+
collector.observe_response(100, 0);
1263+
1264+
// Verify counter remains 0
1265+
let counter_value = metrics
1266+
.output_tokens_counter
1267+
.with_label_values(&[model])
1268+
.get();
1269+
assert_eq!(counter_value, 0);
1270+
1271+
// Add some tokens
1272+
collector.observe_response(100, 5);
1273+
assert_eq!(
1274+
metrics
1275+
.output_tokens_counter
1276+
.with_label_values(&[model])
1277+
.get(),
1278+
5
1279+
);
1280+
1281+
// Try zero tokens again (should not change counter)
1282+
collector.observe_response(100, 0);
1283+
assert_eq!(
1284+
metrics
1285+
.output_tokens_counter
1286+
.with_label_values(&[model])
1287+
.get(),
1288+
5
1289+
);
1290+
}
1291+
1292+
#[test]
1293+
fn test_output_tokens_counter_multiple_models() {
1294+
let metrics = Arc::new(Metrics::new());
1295+
let registry = prometheus::Registry::new();
1296+
metrics.register(&registry).unwrap();
1297+
1298+
let model1 = "model-1";
1299+
let model2 = "model-2";
1300+
1301+
// Create collectors for different models
1302+
let mut collector1 = metrics.clone().create_response_collector(model1);
1303+
let mut collector2 = metrics.clone().create_response_collector(model2);
1304+
1305+
// Increment model1
1306+
collector1.observe_response(100, 10);
1307+
assert_eq!(
1308+
metrics
1309+
.output_tokens_counter
1310+
.with_label_values(&[model1])
1311+
.get(),
1312+
10
1313+
);
1314+
assert_eq!(
1315+
metrics
1316+
.output_tokens_counter
1317+
.with_label_values(&[model2])
1318+
.get(),
1319+
0
1320+
);
1321+
1322+
// Increment model2
1323+
collector2.observe_response(200, 20);
1324+
assert_eq!(
1325+
metrics
1326+
.output_tokens_counter
1327+
.with_label_values(&[model1])
1328+
.get(),
1329+
10
1330+
);
1331+
assert_eq!(
1332+
metrics
1333+
.output_tokens_counter
1334+
.with_label_values(&[model2])
1335+
.get(),
1336+
20
1337+
);
1338+
1339+
// Increment model1 again
1340+
collector1.observe_response(100, 5);
1341+
assert_eq!(
1342+
metrics
1343+
.output_tokens_counter
1344+
.with_label_values(&[model1])
1345+
.get(),
1346+
15
1347+
);
1348+
assert_eq!(
1349+
metrics
1350+
.output_tokens_counter
1351+
.with_label_values(&[model2])
1352+
.get(),
1353+
20
1354+
);
1355+
}
11901356
}

lib/runtime/src/metrics/prometheus_names.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,9 @@ pub mod frontend_service {
113113
/// Output sequence length in tokens
114114
pub const OUTPUT_SEQUENCE_TOKENS: &str = "output_sequence_tokens";
115115

116+
/// Total number of output tokens generated (counter that updates in real-time)
117+
pub const OUTPUT_TOKENS_TOTAL: &str = "output_tokens_total";
118+
116119
/// Time to first token in seconds
117120
pub const TIME_TO_FIRST_TOKEN_SECONDS: &str = "time_to_first_token_seconds";
118121

0 commit comments

Comments
 (0)