21 changes: 18 additions & 3 deletions components/backends/sglang/src/dynamo/sglang/register.py
@@ -23,7 +23,7 @@ async def register_llm_with_runtime_config(
Returns:
bool: True if registration succeeded, False if it failed
"""
runtime_config = await _get_runtime_config(engine, dynamo_args)
runtime_config = await _get_runtime_config(engine, server_args, dynamo_args)
input_type = ModelInput.Tokens
output_type = ModelType.Chat | ModelType.Completions
if not server_args.skip_tokenizer_init:
@@ -51,13 +51,25 @@ async def register_llm_with_runtime_config(


async def _get_runtime_config(
engine: sgl.Engine, dynamo_args: DynamoArgs
engine: sgl.Engine, server_args: ServerArgs, dynamo_args: DynamoArgs
) -> Optional[ModelRuntimeConfig]:
"""Get runtime config from SGLang engine"""
runtime_config = ModelRuntimeConfig()
# set reasoning parser and tool call parser
runtime_config.reasoning_parser = dynamo_args.reasoning_parser
runtime_config.tool_call_parser = dynamo_args.tool_call_parser

# In SGLang these values live on server_args, not scheduler_info (unlike vLLM).
# Note: If --max-running-requests is not specified, SGLang falls back to an internal,
# undocumented default; the value here will be None unless the user set it explicitly.
max_running_requests = getattr(server_args, "max_running_requests", None)
if max_running_requests:
runtime_config.max_num_seqs = max_running_requests

max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
if max_prefill_tokens:
runtime_config.max_num_batched_tokens = max_prefill_tokens

try:
# Try to check if the engine has a scheduler attribute with the computed values
if hasattr(engine, "scheduler_info") and engine.scheduler_info is not None:
@@ -77,7 +89,10 @@ async def _get_runtime_config(
f"(max_total_tokens={max_total_tokens}, page_size={page_size})"
)

# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info
# Note: max_running_requests and max_prefill_tokens are NOT available in scheduler_info.
# SGLang separates configuration (server_args) from runtime stats (scheduler_info).
# In contrast, vLLM exposes both config and runtime values through engine config.
# These are config parameters, so they must be retrieved from server_args only.

return runtime_config

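The change above copies user-set limits from SGLang's `ServerArgs` into the runtime config only when they are explicitly provided. A minimal sketch of that guarded-copy pattern is shown below; the stand-in `ModelRuntimeConfig` class is a simplification, and the `apply_server_args` helper name is hypothetical, but the field names mirror the ones used in this PR.

```python
# Sketch of populating runtime config fields from SGLang server_args.
# ModelRuntimeConfig here is a simplified stand-in for the real dynamo type.
from typing import Optional


class ModelRuntimeConfig:
    max_num_seqs: Optional[int] = None
    max_num_batched_tokens: Optional[int] = None


def apply_server_args(runtime_config: ModelRuntimeConfig, server_args) -> ModelRuntimeConfig:
    # Copy only values the user set explicitly; SGLang leaves
    # max_running_requests as None when --max-running-requests is omitted,
    # so this guard avoids publishing SGLang's internal default.
    max_running_requests = getattr(server_args, "max_running_requests", None)
    if max_running_requests:
        runtime_config.max_num_seqs = max_running_requests

    max_prefill_tokens = getattr(server_args, "max_prefill_tokens", None)
    if max_prefill_tokens:
        runtime_config.max_num_batched_tokens = max_prefill_tokens

    return runtime_config
```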
20 changes: 20 additions & 0 deletions components/backends/trtllm/src/dynamo/trtllm/main.py
@@ -281,9 +281,29 @@ async def init(runtime: DistributedRuntime, config: Config):
# TODO: fix this once we have a better way to get total_kv_blocks
runtime_config = ModelRuntimeConfig()

# Set values from config that are available immediately
# Note: We populate max_num_seqs and max_num_batched_tokens from config
# to ensure Prometheus metrics are available even without engine stats

# Naming clarification:
# - In vLLM: max_num_seqs = maximum concurrent requests (an unusual name kept for historical reasons)
# - In TensorRT-LLM: max_batch_size = maximum concurrent requests (clearer name)
# Both parameters control the same thing: how many requests can be processed simultaneously
runtime_config.max_num_seqs = config.max_batch_size
runtime_config.max_num_batched_tokens = config.max_num_tokens
runtime_config.reasoning_parser = config.reasoning_parser
runtime_config.tool_call_parser = config.tool_call_parser

logging.info(f"Set runtime config max_num_seqs: {runtime_config.max_num_seqs}")
logging.info(
f"Set runtime config max_num_batched_tokens: {runtime_config.max_num_batched_tokens}"
)

# The get_engine_runtime_config function exists but is not called here because:
# 1. get_stats_async requires active requests to work properly
# 2. We need runtime config during registration, before any requests are made
# 3. total_kv_blocks would ideally come from engine stats but is not critical for basic operation

# publisher will be set later if publishing is enabled.
handler_config = RequestHandlerConfig(
component=component,
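The comments above describe the same concept under two names: vLLM's `max_num_seqs` and TensorRT-LLM's `max_batch_size` both cap concurrent requests. A small sketch of that mapping follows; `RuntimeConfigView` and `from_trtllm_config` are hypothetical illustration names, while `max_batch_size` and `max_num_tokens` are the TensorRT-LLM config fields used in this PR.

```python
# Sketch: map TensorRT-LLM config names onto the vLLM-style field names
# that the frontend's ModelRuntimeConfig expects.
from dataclasses import dataclass
from typing import Optional


@dataclass
class RuntimeConfigView:
    max_num_seqs: Optional[int] = None            # vLLM name: max concurrent requests
    max_num_batched_tokens: Optional[int] = None  # vLLM name: max tokens per batch


def from_trtllm_config(
    max_batch_size: Optional[int], max_num_tokens: Optional[int]
) -> RuntimeConfigView:
    # Both engines cap concurrent requests; only the parameter name differs.
    return RuntimeConfigView(
        max_num_seqs=max_batch_size,
        max_num_batched_tokens=max_num_tokens,
    )
```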
25 changes: 24 additions & 1 deletion deploy/metrics/README.md
@@ -79,7 +79,30 @@ When using Dynamo HTTP Frontend (`--framework VLLM` or `--framework TRTLLM`), th
- `dynamo_frontend_requests_total`: Total LLM requests (counter)
- `dynamo_frontend_time_to_first_token_seconds`: Time to first token (histogram)

**Note**: The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
##### Model Configuration Metrics

The frontend also exposes model configuration metrics with the `dynamo_frontend_model_*` prefix. These metrics are populated from the worker backend registration service when workers register with the system:

**Runtime Config Metrics (from ModelRuntimeConfig):**
These metrics come from the runtime configuration provided by worker backends during registration.

- `dynamo_frontend_model_total_kv_blocks`: Total KV blocks available for a worker serving the model (gauge)
- `dynamo_frontend_model_max_num_seqs`: Maximum number of sequences for a worker serving the model (gauge)
- `dynamo_frontend_model_max_num_batched_tokens`: Maximum number of batched tokens for a worker serving the model (gauge)

**MDC Metrics (from ModelDeploymentCard):**
These metrics come from the Model Deployment Card information provided by worker backends during registration.

- `dynamo_frontend_model_context_length`: Maximum context length for a worker serving the model (gauge)
- `dynamo_frontend_model_kv_cache_block_size`: KV cache block size for a worker serving the model (gauge)
- `dynamo_frontend_model_migration_limit`: Request migration limit for a worker serving the model (gauge)

**Worker Management Metrics:**
- `dynamo_frontend_model_workers`: Number of worker instances currently serving the model (gauge)

**Important Notes:**
- The `dynamo_frontend_inflight_requests_total` metric tracks requests from HTTP handler start until the complete response is finished, while `dynamo_frontend_queued_requests_total` tracks requests from HTTP handler start until first token generation begins (including prefill time). HTTP queue time is a subset of inflight time.
- **Model Name Deduplication**: When multiple worker instances register with the same model name, only the first instance's configuration metrics (runtime config and MDC metrics) will be populated. Subsequent instances with duplicate model names will be skipped for configuration metric updates, though the worker count metric will reflect all instances.
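
To confirm that these gauges are populated after a worker registers, you can scrape the frontend's Prometheus endpoint and filter for the `dynamo_frontend_model_*` prefix. The sketch below assumes the frontend serves metrics at `http://localhost:8000/metrics`; adjust the URL to your deployment.

```python
# Scrape the frontend metrics endpoint and print only the per-model gauges
# described above. The URL is an assumption about the local deployment.
import urllib.request

METRICS_URL = "http://localhost:8000/metrics"  # assumed frontend endpoint

with urllib.request.urlopen(METRICS_URL) as resp:
    body = resp.read().decode("utf-8")

for line in body.splitlines():
    if line.startswith("dynamo_frontend_model_"):
        print(line)
```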

#### Request Processing Flow
