4 files changed, +18 −8 lines, all under tensorrt_llm/_torch/modules/fused_moe.
File 1 of 4:

@@ -111,10 +111,10 @@ def __init__(
             self.initial_local_expert_ids) == self.expert_size_per_partition

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or model_config.max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
         # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
-        if self.moe_max_num_tokens < max_num_tokens:
+        if self.moe_max_num_tokens < moe_max_num_tokens:
             self.aux_stream = aux_stream_dict[
                 AuxStreamType.
                 MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
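The substantive change in this file (and in file 4 below) is the fallback: previously, when `model_config.moe_max_num_tokens` was unset, `self.moe_max_num_tokens` fell back to the un-scaled `model_config.max_num_tokens`, so with attention DP (`dp_size > 1`) it was always smaller than the DP-scaled total and the chunking branch was entered unconditionally. With the fix, the default is the DP-scaled value itself, so the auxiliary stream is only set up when a smaller `moe_max_num_tokens` is actually configured. (In file 3 below the old code already fell back to the scaled local, so that hunk is just a rename.) A minimal sketch of the two behaviors, using a hypothetical stand-in config rather than the real ModelConfig class:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeConfig:
    # Hypothetical stand-in for the fields used in the hunk above, not the real ModelConfig.
    max_num_tokens: int = 8192
    dp_size: int = 4
    moe_max_num_tokens: Optional[int] = None


def needs_chunking_old(cfg: FakeConfig) -> bool:
    # Old behavior: the fallback ignores the DP scaling.
    dp_scaled = cfg.max_num_tokens * cfg.dp_size
    moe_max = cfg.moe_max_num_tokens or cfg.max_num_tokens
    return moe_max < dp_scaled


def needs_chunking_new(cfg: FakeConfig) -> bool:
    # New behavior: the fallback is the DP-scaled value, so no chunking by default.
    dp_scaled = cfg.max_num_tokens * cfg.dp_size
    moe_max = cfg.moe_max_num_tokens or dp_scaled
    return moe_max < dp_scaled


cfg = FakeConfig()
print(needs_chunking_old(cfg))  # True: chunking path taken even though the user never opted in
print(needs_chunking_new(cfg))  # False: chunking only when moe_max_num_tokens is set lower
```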
File 2 of 4:

@@ -327,6 +327,16 @@ def __init__(
         apply_router_weight_on_input: bool = False,
         layer_idx: Optional[int] = None,
     ):
+        if model_config.moe_max_num_tokens is None:
+            moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+            # The default moe_max_num_tokens is calculated from the following formula:
+            # max_isl = 8196, max_batch_size = 1024, mtp = 0
+            # max_num_tokens = ((mtp+1)*max_batch_size+max_isl+128+63)//64*64 = 9344
+            # moe_max_num_tokens = max_num_tokens * 2 = 18688
+            # It can avoid OOM for 8k/1k cases.
+            default_moe_max_num_tokens = 18688
+            if moe_max_num_tokens > default_moe_max_num_tokens:
+                model_config.moe_max_num_tokens = default_moe_max_num_tokens

         super().__init__(
             routing_method=routing_method,
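This added block caps the default `moe_max_num_tokens` at 18688 whenever the DP-scaled value would exceed it, which bounds the MoE workspace and avoids OOM for the 8k-ISL / 1k-batch case. Checking the arithmetic in the comment: with max_isl = 8192, max_batch_size = 1024, mtp = 0, the rounding formula gives ((0 + 1) * 1024 + 8192 + 128 + 63) // 64 * 64 = 9344 and 9344 * 2 = 18688; the 8196 written in the comment would round to 9408 instead, so an 8k ISL of 8192 appears to be the intended input. A quick sketch of the same calculation in plain Python, not the library code path:

```python
def padded_max_num_tokens(max_isl: int, max_batch_size: int, mtp: int = 0) -> int:
    # Mirrors the formula in the comment: add a 128-token pad, then round up to a multiple of 64.
    return ((mtp + 1) * max_batch_size + max_isl + 128 + 63) // 64 * 64


DEFAULT_MOE_MAX_NUM_TOKENS = 18688  # = padded_max_num_tokens(8192, 1024) * 2

assert padded_max_num_tokens(8192, 1024) == 9344
assert padded_max_num_tokens(8192, 1024) * 2 == DEFAULT_MOE_MAX_NUM_TOKENS


def default_moe_max_num_tokens(max_num_tokens: int, dp_size: int) -> int:
    # Effective default after this change: the DP-scaled value, but never above the cap.
    return min(max_num_tokens * dp_size, DEFAULT_MOE_MAX_NUM_TOKENS)


print(default_moe_max_num_tokens(9344, 4))  # 18688, capped
print(default_moe_max_num_tokens(9344, 1))  # 9344, below the cap
```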
File 3 of 4:

@@ -82,8 +82,8 @@ def __init__(
         self.expert_size_per_partition = self.expert_end - self.expert_start

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens

         self._weights_created = False
         if not model_config.skip_create_weights_in_init:
File 4 of 4:

@@ -151,10 +151,10 @@ def __init__(
             self.initial_local_expert_ids) == self.expert_size_per_partition

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or model_config.max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
         # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
-        if self.moe_max_num_tokens < max_num_tokens:
+        if self.moe_max_num_tokens < moe_max_num_tokens:
            self.aux_stream = aux_stream_dict[
                AuxStreamType.
                MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
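Putting the pieces together: files 1, 3, and 4 make the DP-scaled value the default for `moe_max_num_tokens`, and file 2 caps that default at 18688, so the chunking branch (and the auxiliary CUDA stream plus events) is taken exactly when the configured or capped value ends up smaller than the DP-scaled total. An illustrative walk-through with assumed numbers: max_num_tokens = 9344 comes from the comment in file 2, while dp_size = 4 and the ceil-division chunk count are assumptions for illustration, not something stated in the PR.

```python
import math

max_num_tokens = 9344   # from the comment in file 2
dp_size = 4             # assumed deployment, not from the PR
default_cap = 18688

dp_scaled = max_num_tokens * dp_size                 # 37376 worst-case tokens with attention DP
effective_moe_max = min(dp_scaled, default_cap)      # 18688 after the cap in file 2

# Corresponds to the `if self.moe_max_num_tokens < moe_max_num_tokens:` check above.
chunking_enabled = effective_moe_max < dp_scaled
print(chunking_enabled)  # True: aux stream and CUDA events are created

# Illustrative only: if chunking splits the work into effective_moe_max-sized pieces,
# the worst case here is ceil(37376 / 18688) = 2 chunks.
print(math.ceil(dp_scaled / effective_moe_max))
```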