4 files changed, +18 −8 lines, all under tensorrt_llm/_torch/modules/fused_moe.
File 1 of 4:

@@ -111,10 +111,10 @@ def __init__(
             self.initial_local_expert_ids) == self.expert_size_per_partition

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or model_config.max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
         # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
-        if self.moe_max_num_tokens < max_num_tokens:
+        if self.moe_max_num_tokens < moe_max_num_tokens:
             self.aux_stream = aux_stream_dict[
                 AuxStreamType.
                 MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
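The substantive change in this file (and in file 4 below) is the fallback: previously, when `model_config.moe_max_num_tokens` was unset, `self.moe_max_num_tokens` fell back to the un-scaled `model_config.max_num_tokens`, so with attention DP (`dp_size > 1`) it was always smaller than the DP-scaled total and the chunking branch was entered unconditionally. With the fix, the default is the DP-scaled value itself, so the auxiliary stream is only set up when a smaller `moe_max_num_tokens` is actually configured. (In file 3 below the old code already fell back to the scaled local, so that hunk is just a rename.) A minimal sketch of the two behaviors, using a hypothetical stand-in config rather than the real ModelConfig class:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeConfig:
    # Hypothetical stand-in for the fields used in the hunk above, not the real ModelConfig.
    max_num_tokens: int = 8192
    dp_size: int = 4
    moe_max_num_tokens: Optional[int] = None


def needs_chunking_old(cfg: FakeConfig) -> bool:
    # Old behavior: the fallback ignores the DP scaling.
    dp_scaled = cfg.max_num_tokens * cfg.dp_size
    moe_max = cfg.moe_max_num_tokens or cfg.max_num_tokens
    return moe_max < dp_scaled


def needs_chunking_new(cfg: FakeConfig) -> bool:
    # New behavior: the fallback is the DP-scaled value, so no chunking by default.
    dp_scaled = cfg.max_num_tokens * cfg.dp_size
    moe_max = cfg.moe_max_num_tokens or dp_scaled
    return moe_max < dp_scaled


cfg = FakeConfig()
print(needs_chunking_old(cfg))  # True: chunking path taken even though the user never opted in
print(needs_chunking_new(cfg))  # False: chunking only when moe_max_num_tokens is set lower
```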
File 2 of 4:

@@ -327,6 +327,16 @@ def __init__(
         apply_router_weight_on_input: bool = False,
         layer_idx: Optional[int] = None,
     ):
+        if model_config.moe_max_num_tokens is None:
+            moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+            # The default moe_max_num_tokens is calculated from the following formula:
+            # max_isl = 8196, max_batch_size = 1024, mtp = 0
+            # max_num_tokens = ((mtp+1)*max_batch_size+max_isl+128+63)//64*64 = 9344
+            # moe_max_num_tokens = max_num_tokens * 2 = 18688
+            # It can avoid OOM for 8k/1k cases.
+            default_moe_max_num_tokens = 18688
+            if moe_max_num_tokens > default_moe_max_num_tokens:
+                model_config.moe_max_num_tokens = default_moe_max_num_tokens

         super().__init__(
             routing_method=routing_method,
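This added block caps the default `moe_max_num_tokens` at 18688 whenever the DP-scaled value would exceed it, which bounds the MoE workspace and avoids OOM for the 8k-ISL / 1k-batch case. Checking the arithmetic in the comment: with max_isl = 8192, max_batch_size = 1024, mtp = 0, the rounding formula gives ((0 + 1) * 1024 + 8192 + 128 + 63) // 64 * 64 = 9344 and 9344 * 2 = 18688; the 8196 written in the comment would round to 9408 instead, so an 8k ISL of 8192 appears to be the intended input. A quick sketch of the same calculation in plain Python, not the library code path:

```python
def padded_max_num_tokens(max_isl: int, max_batch_size: int, mtp: int = 0) -> int:
    # Mirrors the formula in the comment: add a 128-token pad, then round up to a multiple of 64.
    return ((mtp + 1) * max_batch_size + max_isl + 128 + 63) // 64 * 64


DEFAULT_MOE_MAX_NUM_TOKENS = 18688  # = padded_max_num_tokens(8192, 1024) * 2

assert padded_max_num_tokens(8192, 1024) == 9344
assert padded_max_num_tokens(8192, 1024) * 2 == DEFAULT_MOE_MAX_NUM_TOKENS


def default_moe_max_num_tokens(max_num_tokens: int, dp_size: int) -> int:
    # Effective default after this change: the DP-scaled value, but never above the cap.
    return min(max_num_tokens * dp_size, DEFAULT_MOE_MAX_NUM_TOKENS)


print(default_moe_max_num_tokens(9344, 4))  # 18688, capped
print(default_moe_max_num_tokens(9344, 1))  # 9344, below the cap
```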
File 3 of 4:

@@ -82,8 +82,8 @@ def __init__(
         self.expert_size_per_partition = self.expert_end - self.expert_start

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens

         self._weights_created = False
         if not model_config.skip_create_weights_in_init:
File 4 of 4:

@@ -151,10 +151,10 @@ def __init__(
             self.initial_local_expert_ids) == self.expert_size_per_partition

         # The maximum number of tokens in MoE are multiplied by DP size when attention DP is enabled
-        max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
-        self.moe_max_num_tokens = model_config.moe_max_num_tokens or model_config.max_num_tokens
+        moe_max_num_tokens = model_config.max_num_tokens * model_config.mapping.dp_size
+        self.moe_max_num_tokens = model_config.moe_max_num_tokens or moe_max_num_tokens
         # The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
-        if self.moe_max_num_tokens < max_num_tokens:
+        if self.moe_max_num_tokens < moe_max_num_tokens:
            self.aux_stream = aux_stream_dict[
                AuxStreamType.
                MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
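Putting the pieces together: files 1, 3, and 4 make the DP-scaled value the default for `moe_max_num_tokens`, and file 2 caps that default at 18688, so the chunking branch (and the auxiliary CUDA stream plus events) is taken exactly when the configured or capped value ends up smaller than the DP-scaled total. An illustrative walk-through with assumed numbers: max_num_tokens = 9344 comes from the comment in file 2, while dp_size = 4 and the ceil-division chunk count are assumptions for illustration, not something stated in the PR.

```python
import math

max_num_tokens = 9344   # from the comment in file 2
dp_size = 4             # assumed deployment, not from the PR
default_cap = 18688

dp_scaled = max_num_tokens * dp_size                 # 37376 worst-case tokens with attention DP
effective_moe_max = min(dp_scaled, default_cap)      # 18688 after the cap in file 2

# Corresponds to the `if self.moe_max_num_tokens < moe_max_num_tokens:` check above.
chunking_enabled = effective_moe_max < dp_scaled
print(chunking_enabled)  # True: aux stream and CUDA events are created

# Illustrative only: if chunking splits the work into effective_moe_max-sized pieces,
# the worst case here is ceil(37376 / 18688) = 2 chunks.
print(math.ceil(dp_scaled / effective_moe_max))
```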