Skip to content

Commit 57e8844

Browse files
committed
docs: improve metrics documentation and fix naming
- Fix metric name from model_workers_total to model_workers
- Document model name deduplication behavior in README.md
- Add comments explaining gauge vs counter usage for runtime config metrics
- Clarify that some metrics use gauges because they're synchronized from upstream

Signed-off-by: Keiven Chang <[email protected]>
1 parent fe553a0 commit 57e8844

File tree

2 files changed

+18
-11
lines changed

2 files changed

+18
-11
lines changed

deploy/metrics/README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,9 +98,11 @@ These metrics come from the Model Deployment Card information provided by worker
9898
- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge)
9999

100100
**Worker Management Metrics:**
101-
- `dynamo_frontend_model_workers_total`: Number of worker instances currently serving the model (gauge)
101+
- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge)
102102

103-
**Note**: The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
103+
**Important Notes:**
104+
- The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
105+
- **Model Name Deduplication**: When multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and Model Deployment Card (MDC) metrics) are populated. Configuration metric updates are skipped for subsequent instances that register a duplicate model name, but the worker count metric still reflects all instances.
104106

105107
#### Request Processing Flow
106108

lib/llm/src/http/service/metrics.rs

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,14 +40,16 @@ pub struct Metrics {
4040
time_to_first_token: HistogramVec,
4141
inter_token_latency: HistogramVec,
4242

43-
// Runtime configuration metrics
43+
// Runtime configuration metrics. Note: Some of these metrics represent counter-like values from
44+
// source systems, but are implemented as gauges because they are copied/synchronized from upstream
45+
// counter values rather than being directly incremented.
4446
model_total_kv_blocks: IntGaugeVec,
4547
model_max_num_seqs: IntGaugeVec,
4648
model_max_num_batched_tokens: IntGaugeVec,
4749
model_context_length: IntGaugeVec,
4850
model_kv_cache_block_size: IntGaugeVec,
4951
model_migration_limit: IntGaugeVec,
50-
model_workers_total: IntGaugeVec,
52+
model_workers: IntGaugeVec, // this is an actual gauge, not a counter
5153
}
5254

5355
// Inflight tracks requests from HTTP handler start until complete response is finished.
@@ -151,7 +153,7 @@ impl Metrics {
151153
/// - `{prefix}_model_context_length` - IntGaugeVec for maximum context length for a worker serving the model
152154
/// - `{prefix}_model_kv_cache_block_size` - IntGaugeVec for KV cache block size for a worker serving the model
153155
/// - `{prefix}_model_migration_limit` - IntGaugeVec for request migration limit for a worker serving the model
154-
/// - `{prefix}_model_workers_total` - IntGaugeVec for number of worker instances serving each model
156+
/// - `{prefix}_model_workers` - IntGaugeVec for number of worker instances serving each model
155157
///
156158
/// ## Runtime Config Polling Configuration
157159
///
@@ -270,6 +272,9 @@ impl Metrics {
270272
.unwrap();
271273

272274
// Runtime configuration metrics
275+
// Note: Some of these metrics represent counter-like values from source systems,
276+
// but are implemented as gauges because they are copied/synchronized from upstream
277+
// counter values rather than being directly incremented.
273278
let model_total_kv_blocks = IntGaugeVec::new(
274279
Opts::new(
275280
frontend_metric_name(frontend_service::MODEL_TOTAL_KV_BLOCKS),
@@ -324,9 +329,9 @@ impl Metrics {
324329
)
325330
.unwrap();
326331

327-
let model_workers_total = IntGaugeVec::new(
332+
let model_workers = IntGaugeVec::new(
328333
Opts::new(
329-
frontend_metric_name(frontend_service::MODEL_WORKERS_TOTAL),
334+
frontend_metric_name(frontend_service::MODEL_WORKERS),
330335
"Number of worker instances currently serving the model",
331336
),
332337
&["model"],
@@ -349,7 +354,7 @@ impl Metrics {
349354
model_context_length,
350355
model_kv_cache_block_size,
351356
model_migration_limit,
352-
model_workers_total,
357+
model_workers,
353358
}
354359
}
355360

@@ -446,7 +451,7 @@ impl Metrics {
446451
registry.register(Box::new(self.model_context_length.clone()))?;
447452
registry.register(Box::new(self.model_kv_cache_block_size.clone()))?;
448453
registry.register(Box::new(self.model_migration_limit.clone()))?;
449-
registry.register(Box::new(self.model_workers_total.clone()))?;
454+
registry.register(Box::new(self.model_workers.clone()))?;
450455

451456
Ok(())
452457
}
@@ -604,7 +609,7 @@ impl Metrics {
604609

605610
// Update worker count metrics for all models
606611
for (model_name, count) in &model_worker_counts {
607-
metrics.model_workers_total
612+
metrics.model_workers
608613
.with_label_values(&[model_name])
609614
.set(*count);
610615
}
@@ -613,7 +618,7 @@ impl Metrics {
613618
let current_models_with_workers: std::collections::HashSet<String> =
614619
model_worker_counts.keys().cloned().collect();
615620
for model_name in known_models.difference(&current_models_with_workers) {
616-
metrics.model_workers_total
621+
metrics.model_workers
617622
.with_label_values(&[model_name])
618623
.set(0);
619624
}

0 commit comments

Comments
 (0)