21 changes: 18 additions & 3 deletions components/backends/sglang/src/dynamo/sglang/register.py
@@ -23,7 +23,7 @@ async def register_llm_with_runtime_config(
Returns:
bool: True if registration succeeded, False if it failed
"""
runtime_config = await _get_runtime_config(engine, dynamo_args)
runtime_config = await _get_runtime_config(engine, server_args, dynamo_args)
input_type = ModelInput.Tokens
output_type = ModelType.Chat | ModelType.Completions
if not server_args.skip_tokenizer_init:
@@ -51,13 +51,25 @@ async def register_llm_with_runtime_config(


async def _get_runtime_config(
engine: sgl.Engine, dynamo_args: DynamoArgs
engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoArgs
) -> Optional[ModelRuntimeConfig]:
"""Get runtime config from SGLang engine"""
runtime_config = ModelRuntimeConfig()
# set reasoning parser and tool call parser
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
runtime_config.tool_call_parser = dynamo_args.tool_call_parser

# In SGLang these values live on server_args, not scheduler_info (unlike vLLM).
# Note: If --max-running-requests is not specified, SGLang falls back to an internal,
# undocumented default; the value here will be None unless the user set it explicitly.
max_running_requests = getattr(server_args, "max_running_requests", None)
if max_running_requests:
runtime_config.max_num_seqs = max_running_requests

max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
if max_prefill_tokens:
runtime_config.max_num_batched_tokens = max_prefill_tokens

try:
# Try to check if the engine has a scheduler attribute with the computed values
if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
@@ -77,7 +89,10 @@ async def _get_runtime_config(
f"(max_total_tokens={max_total_tokens}, page_size={page_size})"
)

# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
# SGLang separates configuration (server_args) from runtime stats (scheduler_info).
# In contrast, vLLM exposes both config and runtime values through engine config.
# These are config parameters, so they must be retrieved from server_args only.

return runtime_config

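The change above copies user-set limits from SGLang's `ServerArgs` into the runtime config only when they are explicitly provided. A minimal sketch of that guarded-copy pattern is shown below; the stand-in `ModelRuntimeConfig` class is a simplification, and the `apply_server_args` helper name is hypothetical, but the field names mirror the ones used in this PR.

```python
# Sketch of populating runtime config fields from SGLang server_args.
# ModelRuntimeConfig here is a simplified stand-in for the real dynamo type.
from typing import Optional


class ModelRuntimeConfig:
    max_num_seqs: Optional[int] = None
    max_num_batched_tokens: Optional[int] = None


def apply_server_args(runtime_config: ModelRuntimeConfig, server_args) -> ModelRuntimeConfig:
    # Copy only values the user set explicitly; SGLang leaves
    # max_running_requests as None when --max-running-requests is omitted,
    # so this guard avoids publishing SGLang's internal default.
    max_running_requests = getattr(server_args, "max_running_requests", None)
    if max_running_requests:
        runtime_config.max_num_seqs = max_running_requests

    max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
    if max_prefill_tokens:
        runtime_config.max_num_batched_tokens = max_prefill_tokens

    return runtime_config
```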
20 changes: 20 additions & 0 deletions components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -281,9 +281,29 @@ async def init(runtime: DistributedRuntime, config: Config):
# TODO: fix this once we have a better way to get total_kv_blocks
runtime_config = ModelRuntimeConfig()

# Set values from config that are available immediately
# Note: We populate max_num_seqs and max_num_batched_tokens from config
# to ensure Prometheus metrics are available even without engine stats

# Naming clarification:
# - In vLLM: max_num_seqs = maximum concurrent requests (an unusual name kept for historical reasons)
# - In TensorRT-LLM: max_batch_size = maximum concurrent requests (clearer name)
# Both parameters control the same thing: how many requests can be processed simultaneously
runtime_config.max_num_seqs = config.max_batch_size
runtime_config.max_num_batched_tokens = config.max_num_tokens
runtime_config.reasoning_parser = config.reasoning_parser
runtime_config.tool_call_parser = config.tool_call_parser

logging.info(f"Set runtime config max_num_seqs: {runtime_config.max_num_seqs}")
logging.info(
f"Set runtime config max_num_batched_tokens: {runtime_config.max_num_batched_tokens}"
)

# The get_engine_runtime_config function exists but is not called here because:
# 1. get_stats_async requires active requests to work properly
# 2. We need runtime config during registration, before any requests are made
# 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation

# publisher will be set later if publishing is enabled.
handler_config = RequestHandlerConfig(
component=component,
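The comments above describe the same concept under two names: vLLM's `max_num_seqs` and TensorRT-LLM's `max_batch_size` both cap concurrent requests. A small sketch of that mapping follows; `RuntimeConfigView` and `from_trtllm_config` are hypothetical illustration names, while `max_batch_size` and `max_num_tokens` are the TensorRT-LLM config fields used in this PR.

```python
# Sketch: map TensorRT-LLM config names onto the vLLM-style field names
# that the frontend's ModelRuntimeConfig expects.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RuntimeConfigView:
    max_num_seqs: Optional[int] = None            # vLLM name: max concurrent requests
    max_num_batched_tokens: Optional[int] = None  # vLLM name: max tokens per batch


def from_trtllm_config(
    max_batch_size: Optional[int], max_num_tokens: Optional[int]
) -> RuntimeConfigView:
    # Both engines cap concurrent requests; only the parameter name differs.
    return RuntimeConfigView(
        max_num_seqs=max_batch_size,
        max_num_batched_tokens=max_num_tokens,
    )
```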
25 changes: 24 additions & 1 deletion deploy/metrics/README.md
@@ -79,7 +79,30 @@ When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), th
- `dynamo_frontend_requests_total`: Total LLM requests (counter)
- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram)

**Note**: The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
##### Model Configuration Metrics

The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system:

**Runtime Config Metrics (from ModelRuntimeConfig):**
These metrics come from the runtime configuration provided by worker backends during registration.

- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge)
- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge)
- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge)

**MDC Metrics (from ModelDeploymentCard):**
These metrics come from the Model Deployment Card information provided by worker backends during registration.

- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge)
- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge)
- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge)

**Worker Management Metrics:**
- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge)

**Important Notes:**
- The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
- **Model Name Deduplication**: When multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances.
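
To confirm that these gauges are populated after a worker registers, you can scrape the frontend's Prometheus endpoint and filter for the `dynamo_frontend_model_*` prefix. The sketch below assumes the frontend serves metrics at `http://localhost:8000/metrics`; adjust the URL to your deployment.

```python
# Scrape the frontend metrics endpoint and print only the per-model gauges
# described above. The URL is an assumption about the local deployment.
import urllib.request

METRICS_URL = "http://localhost:8000/metrics"  # assumed frontend endpoint

with urllib.request.urlopen(METRICS_URL) as resp:
    body = resp.read().decode("utf-8")

for line in body.splitlines():
    if line.startswith("dynamo_frontend_model_"):
        print(line)
```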

#### Request Processing Flow
