Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 17 additions & 7 deletions lib/bindings/python/src/dynamo/prometheus_names.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ class frontend_service:
INPUT_SEQUENCE_TOKENS = "input_sequence_tokens"
# Output sequence length in tokens
OUTPUT_SEQUENCE_TOKENS = "output_sequence_tokens"
# Total number of output tokens generated (counter that updates in real-time)
OUTPUT_TOKENS_TOTAL = "output_tokens_total"
# Time to first token in seconds
TIME_TO_FIRST_TOKEN_SECONDS = "time_to_first_token_seconds"
# Inter-token latency in seconds
Expand All @@ -76,13 +78,21 @@ class frontend_service:
MODEL_MIGRATION_LIMIT = "model_migration_limit"


class kvbm_connector:
"""KVBM connector"""

# KVBM connector leader
KVBM_CONNECTOR_LEADER = "kvbm_connector_leader"
# KVBM connector worker
KVBM_CONNECTOR_WORKER = "kvbm_connector_worker"
class kvbm:
"""KVBM"""

# The number of offload blocks from device to host
OFFLOAD_BLOCKS_D2H = "offload_blocks_d2h"
# The number of offload blocks from host to disk
OFFLOAD_BLOCKS_H2D = "offload_blocks_h2d"
# The number of offload blocks from device to disk (bypassing host memory)
OFFLOAD_BLOCKS_D2D = "offload_blocks_d2d"
# The number of onboard blocks from host to device
ONBOARD_BLOCKS_H2D = "onboard_blocks_h2d"
# The number of onboard blocks from disk to device
ONBOARD_BLOCKS_D2D = "onboard_blocks_d2d"
# The number of matched tokens
MATCHED_TOKENS = "matched_tokens"


class kvrouter:
Expand Down
166 changes: 166 additions & 0 deletions lib/llm/src/http/service/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ pub struct Metrics {
request_duration: HistogramVec,
input_sequence_length: HistogramVec,
output_sequence_length: HistogramVec,
output_tokens_counter: IntCounterVec,
time_to_first_token: HistogramVec,
inter_token_latency: HistogramVec,

Expand Down Expand Up @@ -266,6 +267,7 @@ impl Metrics {
/// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
/// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
/// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
/// - `{prefix}_output_tokens_total` - IntCounterVec for total output tokens generated (real-time updates)
/// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
/// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
///
Expand Down Expand Up @@ -392,6 +394,15 @@ impl Metrics {
)
.unwrap();

let output_tokens_counter = IntCounterVec::new(
Opts::new(
frontend_metric_name(frontend_service::OUTPUT_TOKENS_TOTAL),
"Total number of output tokens generated (updates in real-time)",
),
&["model"],
)
.unwrap();

// Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
let (ttft_min, ttft_max, ttft_count) =
parse_bucket_config("DYN_METRICS_TTFT", 0.001, 480.0, 18);
Expand Down Expand Up @@ -487,6 +498,7 @@ impl Metrics {
request_duration,
input_sequence_length,
output_sequence_length,
output_tokens_counter,
time_to_first_token,
inter_token_latency,
model_total_kv_blocks,
Expand Down Expand Up @@ -581,6 +593,7 @@ impl Metrics {
registry.register(Box::new(self.request_duration.clone()))?;
registry.register(Box::new(self.input_sequence_length.clone()))?;
registry.register(Box::new(self.output_sequence_length.clone()))?;
registry.register(Box::new(self.output_tokens_counter.clone()))?;
registry.register(Box::new(self.time_to_first_token.clone()))?;
registry.register(Box::new(self.inter_token_latency.clone()))?;

Expand Down Expand Up @@ -832,6 +845,12 @@ impl ResponseMetricCollector {
return;
}

// Increment the real-time output tokens counter
self.metrics
.output_tokens_counter
.with_label_values(&[&self.model])
.inc_by(num_tokens as u64);

if self.is_first_token {
// NOTE: when there are multiple tokens in the first response,
// we use the full response time as TTFT and ignore the ITL
Expand Down Expand Up @@ -1187,4 +1206,151 @@ mod tests {
);
}
}

#[test]
fn test_output_tokens_counter_increments() {
let metrics = Arc::new(Metrics::new());
let registry = prometheus::Registry::new();
metrics.register(&registry).unwrap();

let model = "test-model";

// Create response collector
let mut collector = metrics.clone().create_response_collector(model);

// Simulate first chunk (5 tokens)
collector.observe_response(100, 5);

// Verify counter incremented by 5
let counter_value = metrics
.output_tokens_counter
.with_label_values(&[model])
.get();
assert_eq!(counter_value, 5);

// Simulate second chunk (10 tokens)
collector.observe_response(100, 10);

// Verify counter incremented to 15
let counter_value = metrics
.output_tokens_counter
.with_label_values(&[model])
.get();
assert_eq!(counter_value, 15);

// Simulate third chunk (7 tokens)
collector.observe_response(100, 7);

// Verify counter incremented to 22
let counter_value = metrics
.output_tokens_counter
.with_label_values(&[model])
.get();
assert_eq!(counter_value, 22);
}

#[test]
fn test_output_tokens_counter_zero_tokens() {
let metrics = Arc::new(Metrics::new());
let registry = prometheus::Registry::new();
metrics.register(&registry).unwrap();

let model = "test-model";
let mut collector = metrics.clone().create_response_collector(model);

// Simulate chunk with zero tokens (should not increment)
collector.observe_response(100, 0);

// Verify counter remains 0
let counter_value = metrics
.output_tokens_counter
.with_label_values(&[model])
.get();
assert_eq!(counter_value, 0);

// Add some tokens
collector.observe_response(100, 5);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model])
.get(),
5
);

// Try zero tokens again (should not change counter)
collector.observe_response(100, 0);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model])
.get(),
5
);
}

#[test]
fn test_output_tokens_counter_multiple_models() {
let metrics = Arc::new(Metrics::new());
let registry = prometheus::Registry::new();
metrics.register(&registry).unwrap();

let model1 = "model-1";
let model2 = "model-2";

// Create collectors for different models
let mut collector1 = metrics.clone().create_response_collector(model1);
let mut collector2 = metrics.clone().create_response_collector(model2);

// Increment model1
collector1.observe_response(100, 10);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model1])
.get(),
10
);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model2])
.get(),
0
);

// Increment model2
collector2.observe_response(200, 20);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model1])
.get(),
10
);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model2])
.get(),
20
);

// Increment model1 again
collector1.observe_response(100, 5);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model1])
.get(),
15
);
assert_eq!(
metrics
.output_tokens_counter
.with_label_values(&[model2])
.get(),
20
);
}
}
3 changes: 3 additions & 0 deletions lib/runtime/src/metrics/prometheus_names.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,9 @@ pub mod frontend_service {
/// Output sequence length in tokens
pub const OUTPUT_SEQUENCE_TOKENS: &str = "output_sequence_tokens";

/// Total number of output tokens generated (counter that updates in real-time)
pub const OUTPUT_TOKENS_TOTAL: &str = "output_tokens_total";

/// Time to first token in seconds
pub const TIME_TO_FIRST_TOKEN_SECONDS: &str = "time_to_first_token_seconds";

Expand Down
Loading