Skip to content

Commit 5e2a2ba

Browse files
keivenchangjasonqinzhou
authored andcommitted
feat: add NIM FE (num_request_max) + runtime config metrics with periodic polling (#3107)
Signed-off-by: Keiven Chang <[email protected]> Signed-off-by: Jason Zhou <[email protected]>
1 parent eeedcae commit 5e2a2ba

File tree

7 files changed

+915
-8
lines changed

7 files changed

+915
-8
lines changed

components/backends/sglang/src/dynamo/sglang/register.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ async def register_llm_with_runtime_config(
2323
Returns:
2424
bool: True if registration succeeded, False if it failed
2525
"""
26-
runtime_config = await _get_runtime_config(engine, dynamo_args)
26+
runtime_config = await _get_runtime_config(engine, server_args, dynamo_args)
2727
input_type = ModelInput.Tokens
2828
output_type = ModelType.Chat | ModelType.Completions
2929
if not server_args.skip_tokenizer_init:
@@ -51,13 +51,25 @@ async def register_llm_with_runtime_config(
5151

5252

5353
async def _get_runtime_config(
54-
engine: sgl.Engine, dynamo_args: DynamoArgs
54+
engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoArgs
5555
) -> Optional[ModelRuntimeConfig]:
5656
"""Get runtime config from SGLang engine"""
5757
runtime_config = ModelRuntimeConfig()
5858
# set reasoning parser and tool call parser
5959
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
6060
runtime_config.tool_call_parser = dynamo_args.tool_call_parser
61+
62+
# In SGLang, these are server_args, not scheduler_info (unlike vLLM)
63+
# Note: If --max-running-requests is not specified, SGLang falls back to an undocumented
64+
# internal default. The value here will be None if not explicitly set by the user.
65+
max_running_requests = getattr(server_args, "max_running_requests", None)
66+
if max_running_requests:
67+
runtime_config.max_num_seqs = max_running_requests
68+
69+
max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
70+
if max_prefill_tokens:
71+
runtime_config.max_num_batched_tokens = max_prefill_tokens
72+
6173
try:
6274
# Try to check if the engine has a scheduler attribute with the computed values
6375
if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
@@ -77,7 +89,10 @@ async def _get_runtime_config(
7789
f"(max_total_tokens={max_total_tokens}, page_size={page_size})"
7890
)
7991

80-
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info
92+
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
93+
# SGLang separates configuration (server_args) from runtime stats (scheduler_info).
94+
# In contrast, vLLM exposes both config and runtime values through engine config.
95+
# These are config parameters, so they must be retrieved from server_args only.
8196

8297
return runtime_config
8398

components/backends/trtllm/src/dynamo/trtllm/main.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -281,9 +281,29 @@ async def init(runtime: DistributedRuntime, config: Config):
281281
# TODO: fix this once we have a better way to get total_kv_blocks
282282
runtime_config = ModelRuntimeConfig()
283283

284+
# Set values from config that are available immediately
285+
# Note: We populate max_num_seqs and max_num_batched_tokens from config
286+
# to ensure Prometheus metrics are available even without engine stats
287+
288+
# Naming clarification:
289+
# - In vLLM: max_num_seqs = maximum concurrent requests (an unusual name that vLLM keeps for historical reasons)
290+
# - In TensorRT-LLM: max_batch_size = maximum concurrent requests (clearer name)
291+
# Both parameters control the same thing: how many requests can be processed simultaneously
292+
runtime_config.max_num_seqs = config.max_batch_size
293+
runtime_config.max_num_batched_tokens = config.max_num_tokens
284294
runtime_config.reasoning_parser = config.reasoning_parser
285295
runtime_config.tool_call_parser = config.tool_call_parser
286296

297+
logging.info(f"Set runtime config max_num_seqs: {runtime_config.max_num_seqs}")
298+
logging.info(
299+
f"Set runtime config max_num_batched_tokens: {runtime_config.max_num_batched_tokens}"
300+
)
301+
302+
# The get_engine_runtime_config function exists but is not called here because:
303+
# 1. get_stats_async requires active requests to work properly
304+
# 2. We need runtime config during registration, before any requests are made
305+
# 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation
306+
287307
# publisher will be set later if publishing is enabled.
288308
handler_config = RequestHandlerConfig(
289309
component=component,

deploy/metrics/README.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,30 @@ When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), th
7979
- `dynamo_frontend_requests_total`: Total LLM requests (counter)
8080
- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram)
8181

82-
**Note**: The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
82+
##### Model Configuration Metrics
83+
84+
The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system:
85+
86+
**Runtime Config Metrics (from ModelRuntimeConfig):**
87+
These metrics come from the runtime configuration provided by worker backends during registration.
88+
89+
- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge)
90+
- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences (concurrent requests) for a worker serving the model (gauge)
91+
- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge)
92+
93+
**MDC Metrics (from ModelDeploymentCard):**
94+
These metrics come from the Model Deployment Card information provided by worker backends during registration.
95+
96+
- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge)
97+
- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge)
98+
- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge)
99+
100+
**Worker Management Metrics:**
101+
- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge)
102+
103+
**Important Notes:**
104+
- The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
105+
- **Model Name Deduplication**: When multiple worker instances register with the same model name, the configuration metrics (runtime config and MDC metrics) are populated only from the first instance to register. Later instances with the same model name do not update the configuration metrics, although the worker count metric (`dynamo_frontend_model_workers`) still reflects all instances.
83106

84107
#### Request Processing Flow
85108

0 commit comments

Comments
 (0)