
Commit f46e41d

Added support for Qwen3-MoE
Signed-off-by: Kinjal Patel <[email protected]>
1 parent b0f78c8 commit f46e41d

3 files changed: +87 additions, −49 deletions

examples/vllm_serve/fakequant_worker.py

Lines changed: 35 additions & 4 deletions
@@ -33,7 +33,7 @@


 def convert_amax_hf2vllm(
-    hf_state_dict: dict[str, torch.Tensor],
+    hf_state_dict: dict[str, torch.Tensor], fuse_experts: bool = False
 ) -> dict[str, torch.Tensor]:
     """
     Convert amax values from HuggingFace format to vLLM format.
@@ -66,13 +66,44 @@ def convert_amax_hf2vllm(
             merge_groups[base_pattern].append((key, value))
             continue

-        # Check if this is a gate/up projection that needs merging
-        gate_up_match = "mixer" not in key and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key)
+        # Check if this is an expert gate/up projection
+        # Pattern: model.layers.0.mlp.experts.*.gate_proj.input_quantizer._amax and
+        # model.layers.0.mlp.experts.*.up_proj.input_quantizer._amax
+        # Maps to: model.layers.0.mlp.experts.w13_input_quantizer._amax
+        expert_gate_up_match = (
+            "mixer" not in key
+            and fuse_experts
+            and re.search(r"(.*\.experts)\.\d+\.(gate|up)_proj\.([^.]+_quantizer\._amax)$", key)
+        )
+        if expert_gate_up_match:
+            base_pattern = expert_gate_up_match.group(1) + ".w13_" + expert_gate_up_match.group(3)
+            merge_groups[base_pattern].append((key, value))
+            continue
+
+        # Check if this is a non-expert gate/up projection that needs merging
+        gate_up_match = (
+            "mixer" not in key
+            and "experts" not in key
+            and re.search(r"(.*\.)(gate|up)_proj(\..+_amax)$", key)
+        )
         if gate_up_match:
             base_pattern = gate_up_match.group(1) + "gate_up_proj" + gate_up_match.group(3)
             merge_groups[base_pattern].append((key, value))
             continue

+        # Check if this is an expert down_proj
+        # Pattern: model.layers.0.mlp.experts.*.down_proj.input_quantizer._amax
+        # Maps to: model.layers.0.mlp.experts.w2_input_quantizer._amax
+        expert_down_match = (
+            "mixer" not in key
+            and fuse_experts
+            and re.search(r"(.*\.experts)\.\d+\.down_proj\.([^.]+_quantizer\._amax)$", key)
+        )
+        if expert_down_match:
+            base_pattern = expert_down_match.group(1) + ".w2_" + expert_down_match.group(2)
+            merge_groups[base_pattern].append((key, value))
+            continue
+
         # Copy other amax keys as-is (like o_proj, down_proj)
         vllm_state_dict[key] = value

@@ -226,7 +257,7 @@ def calibrate_loop(model: Any = None) -> None:
        for key, value in saved_amax_dict.items()
        if key.endswith("quantizer_amax")
    }
-   saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict)
+   saved_amax_dict = convert_amax_hf2vllm(saved_amax_dict, fuse_experts=True)

    current_state_dict = model.state_dict()
    # Count amax keys in checkpoint and model
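
For reference, a minimal standalone sketch (not part of the commit) of what the new expert-key handling computes: the two regexes and the `w13_`/`w2_` target names are copied from the diff above, while the helper function and the sample keys are made up for illustration.

```python
import re

# Regexes taken verbatim from convert_amax_hf2vllm above; this helper and the
# sample keys below are illustrative only.
_EXPERT_GATE_UP = re.compile(r"(.*\.experts)\.\d+\.(gate|up)_proj\.([^.]+_quantizer\._amax)$")
_EXPERT_DOWN = re.compile(r"(.*\.experts)\.\d+\.down_proj\.([^.]+_quantizer\._amax)$")


def fused_expert_key(hf_key: str):
    """Return the fused vLLM amax key for a per-expert HF key, or None if no match."""
    if m := _EXPERT_GATE_UP.search(hf_key):
        return m.group(1) + ".w13_" + m.group(3)
    if m := _EXPERT_DOWN.search(hf_key):
        return m.group(1) + ".w2_" + m.group(2)
    return None


# Per-expert gate/up keys collapse onto a single w13 entry per layer,
# down_proj keys onto a single w2 entry:
assert (
    fused_expert_key("model.layers.0.mlp.experts.3.gate_proj.input_quantizer._amax")
    == "model.layers.0.mlp.experts.w13_input_quantizer._amax"
)
assert (
    fused_expert_key("model.layers.0.mlp.experts.7.down_proj.input_quantizer._amax")
    == "model.layers.0.mlp.experts.w2_input_quantizer._amax"
)
```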

modelopt/torch/quantization/plugins/vllm.py

Lines changed: 12 additions & 5 deletions
@@ -21,14 +21,21 @@
 import vllm.model_executor.layers.fused_moe.layer as vllm_fused_moe_layer
 import vllm.model_executor.layers.linear as vllm_linear

-try:
-    import vllm.model_executor.layers.fused_moe.shared_fused_moe as vllm_shared_fused_moe_layer
-except ImportError:
-    vllm_shared_fused_moe_layer = None
-
 from ...utils.distributed import ParallelState
 from ..nn import QuantLinearConvBase, QuantModule, QuantModuleRegistry, TensorQuantizer

+# Try multiple import paths for vLLM compatibility across versions
+vllm_shared_fused_moe_layer = None
+for module_path in [
+    "vllm.model_executor.layers.fused_moe.shared_fused_moe",  # 0.11.0+
+    "vllm.model_executor.layers.shared_fused_moe.shared_fused_moe",  # 0.10.2
+]:
+    try:
+        vllm_shared_fused_moe_layer = importlib.import_module(module_path)
+        break
+    except ImportError:
+        continue
+
 vllm_fused_moe_package = importlib.import_module("vllm.model_executor.layers.fused_moe.fused_moe")

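The loop replaces the earlier single try/except because, per the comments in the diff, the shared fused MoE module lives at a different path in vLLM 0.10.2 than in 0.11.0+; probing both paths keeps the plugin importable across those versions. A small, purely illustrative sketch (not part of the commit) for checking which path an installed vLLM exposes, using only the candidate paths from the diff:

```python
import importlib.util

# Illustrative only: report which shared-fused-MoE module path the installed
# vLLM provides. Candidate paths come from the diff above; everything else is
# an assumption for this sketch.
CANDIDATES = [
    "vllm.model_executor.layers.fused_moe.shared_fused_moe",  # vLLM 0.11.0+
    "vllm.model_executor.layers.shared_fused_moe.shared_fused_moe",  # vLLM 0.10.2
]

resolved = None
for path in CANDIDATES:
    try:
        if importlib.util.find_spec(path) is not None:
            resolved = path
            break
    except ModuleNotFoundError:  # parent package missing, e.g. vLLM not installed
        continue

print(f"shared fused MoE module: {resolved or 'not found'}")
```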
tests/gpu/torch/export/test_vllm_fakequant_export.py

Lines changed: 40 additions & 40 deletions
@@ -13,94 +13,97 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import pytest
-import torch
+import json
 from copy import deepcopy
 from functools import partial
-import modelopt.torch.quantization as mtq
-from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
-from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf
-from _test_utils.torch.transformers_models import create_tiny_llama_dir
+
+import pytest
+import torch
+from _test_utils.import_helper import skip_if_no_megatron
 from _test_utils.torch.distributed.utils import spawn_multiprocess_job
 from _test_utils.torch.megatron.models import get_mcore_gpt_model
-from _test_utils.import_helper import skip_if_no_megatron
+from _test_utils.torch.transformers_models import create_tiny_llama_dir
 from transformers import AutoModelForCausalLM

-import os
-import json
+import modelopt.torch.quantization as mtq
+from modelopt.torch.export.unified_export_hf import export_hf_checkpoint
+from modelopt.torch.export.unified_export_megatron import export_mcore_gpt_to_hf

 skip_if_no_megatron(apex_or_te_required=True)

+
 @pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG])
 def test_hf_vllm_export(tmp_path, quant_cfg):
     """Test HuggingFace model export for vLLM with fake quantization.
-
+
     This test verifies:
     1. Model weights match before and after export
     2. quant_amax.pth file is created, huggingface config file does not exist
     3. Amax values are correctly extracted and saved in quant_amax.pth file
     """
-
+
     # Create a tiny LLaMA model for testing
     tiny_model_dir = create_tiny_llama_dir(tmp_path, with_tokenizer=True, num_hidden_layers=2)
-
+
     # Load the model
     model = AutoModelForCausalLM.from_pretrained(tiny_model_dir)
     model = model.cuda()
     model.eval()
-
+
     # Quantize the model
     def forward_loop(model):
         input_ids = torch.randint(0, model.config.vocab_size, (1, 128)).cuda()
         with torch.no_grad():
             model(input_ids)
-
+
     model = mtq.quantize(model, quant_cfg, forward_loop)
-
+
     model_state_dict = deepcopy(model.state_dict())

     # Export directory
     export_dir = tmp_path / "vllm_export"
     export_dir.mkdir(exist_ok=True)
-
+
     # Export for vLLM
     export_hf_checkpoint(model, export_dir=export_dir, export_vllm_fq_weights_qstate=True)

     # check if quant_amax.pth file exists
     quant_amax_file = export_dir / "quant_amax.pth"
     assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}"
-
+
     # make sure hf_quant_config.json file does not exist
     hf_quant_config_file = export_dir / "hf_quant_config.json"
-    assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}"
+    assert not hf_quant_config_file.exists(), (
+        f"hf_quant_config.json file should not be created in {export_dir}"
+    )

     # check weights match before and after export
     model_after = AutoModelForCausalLM.from_pretrained(export_dir)
     model_after = model_after.cuda()
     model_after.eval()
     model_after_state_dict = model_after.state_dict()
     amax_state_dict = {}
-    for key in model_state_dict.keys():
+    for key, param in model_state_dict.items():
         if key.endswith("_amax"):
-            amax_state_dict[key] = model_state_dict[key]
+            amax_state_dict[key] = param
             continue
-
-        assert torch.allclose(model_state_dict[key], model_after_state_dict[key], atol=1e-6), (
+
+        assert torch.allclose(param, model_after_state_dict[key], atol=1e-6), (
             f"Weight mismatch for {key}: "
-            f"before shape={model_state_dict[key].shape}, after shape={model_after_state_dict[key].shape}, "
-            f"max diff={torch.abs(model_state_dict[key] - model_after_state_dict[key]).max()}"
+            f"before shape={param.shape}, after shape={model_after_state_dict[key].shape}, "
+            f"max diff={torch.abs(param - model_after_state_dict[key]).max()}"
         )

     # Verify amax values are correct
     amax_dict = torch.load(quant_amax_file)
     assert len(amax_dict) > 0, "amax_dict should not be empty"
-    assert amax_dict.keys() == amax_state_dict.keys(), f"amax keys mismatch between before and after export"
+    assert amax_dict.keys() == amax_state_dict.keys(), (
+        "amax keys mismatch between before and after export"
+    )


 def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
-    """Test megatron-core model export for vLLM with fake quantization.
-
-    """
+    """Test megatron-core model export for vLLM with fake quantization."""
     # Create a tiny mcore GPT model
     num_layers = 2
     hidden_size = 64
@@ -109,7 +112,7 @@ def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
     ffn_hidden_size = 128
     max_sequence_length = 32
     vocab_size = 64
-
+
     model = get_mcore_gpt_model(
         tensor_model_parallel_size=size,
         pipeline_model_parallel_size=1,
@@ -126,7 +129,7 @@ def _test_mcore_vllm_export(tmp_path, quant_cfg, rank, size):
         transformer_impl="modelopt",
     ).cuda()
     model.eval()
-
+
     # Quantize the model
     def forward_loop(model):
         batch_size = 1
@@ -138,11 +141,8 @@ def forward_loop(model):
         attention_mask = attention_mask < 0.5 # Convert to boolean mask
         with torch.no_grad():
             model(input_ids, position_ids, attention_mask)
-
-    model = mtq.quantize(model, quant_cfg, forward_loop)
-
-    model_state_dict = deepcopy(model.state_dict())

+    model = mtq.quantize(model, quant_cfg, forward_loop)
     # Create HF config for export
     pretrained_config = {
         "architectures": ["LlamaForCausalLM"],
@@ -156,14 +156,14 @@ def forward_loop(model):
         "num_key_value_heads": num_query_groups,
         "torch_dtype": "bfloat16",
     }
-
+
     with open(tmp_path / "config.json", "w") as f:
         json.dump(pretrained_config, f)

     # Export directory
     export_dir = tmp_path / "vllm_export"
     export_dir.mkdir(exist_ok=True)
-
+
     # Export for vLLM
     export_mcore_gpt_to_hf(
         model,
@@ -176,10 +176,12 @@ def forward_loop(model):
     # check if quant_amax.pth file exists
     quant_amax_file = export_dir / "quant_amax.pth"
     assert quant_amax_file.exists(), f"quant_amax.pth file should be created in {export_dir}"
-
+
     # make sure hf_quant_config.json file does not exist
     hf_quant_config_file = export_dir / "hf_quant_config.json"
-    assert not hf_quant_config_file.exists(), f"hf_quant_config.json file should not be created in {export_dir}"
+    assert not hf_quant_config_file.exists(), (
+        f"hf_quant_config.json file should not be created in {export_dir}"
+    )


 @pytest.mark.parametrize("quant_cfg", [mtq.FP8_DEFAULT_CFG])
@@ -190,5 +192,3 @@ def test_mcore_vllm_export(tmp_path, quant_cfg):
         job=partial(_test_mcore_vllm_export, tmp_path, quant_cfg),
         backend="nccl",
     )
-
-
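
One possible way to exercise the updated tests (not part of the commit): run them from the repository root in an environment with a CUDA GPU and the Megatron/TE dependencies that the file checks for via `skip_if_no_megatron`.

```python
# Illustrative only: invoke the updated export tests programmatically.
# Assumes the working directory is the repository root and GPU deps are present.
import pytest

pytest.main(
    [
        "tests/gpu/torch/export/test_vllm_fakequant_export.py",
        "-k",
        "test_hf_vllm_export or test_mcore_vllm_export",
        "-v",
    ]
)
```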
