
Commit 1d84542

Align Int4Tensor implementation details with the design of Float8Tensor
Summary: Int4Tensor is the non-preshuffled version of the int4 quantized Tensor; data has shape [N, K/2], and scale/zero_point have shape [K/group_size, N]. Multiple fixes for Int4Tensor to align with the design of Float8Tensor (only calling fbgemm ops):
* Added VERSION 2 for Int4WeightOnlyConfig
* Migrated op implementation and tests from #2387

Test Plan: python test/quantization/quantize_/workflows/int4/test_int4_tensor.py

Reviewers:
Subscribers:
Tasks:
Tags:
1 parent 09c1ec3 commit 1d84542
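For context on the layout described in the summary, here is a minimal sketch (not part of this commit) that quantizes a single Linear layer and inspects the resulting Int4Tensor attributes. The group_size and version keyword arguments and the printed shapes are assumptions based on the summary; the qdata/scale/zero_point attribute names come from the test diff below.

import torch
from torchao.quantization import Int4WeightOnlyConfig, quantize_

# Sketch only: assumes a CUDA device with the fbgemm int4 kernels available,
# and that Int4WeightOnlyConfig accepts group_size/version as shown (the
# summary's "VERSION 2" note is the basis for version=2 here).
N, K, group_size = 256, 128, 128
linear = torch.nn.Linear(K, N, dtype=torch.bfloat16, device="cuda")
quantize_(linear, Int4WeightOnlyConfig(group_size=group_size, version=2))

w = linear.weight
print(w.qdata.shape)       # packed int4 data, [N, K/2] -> expected torch.Size([256, 64])
print(w.scale.shape)       # [K/group_size, N]         -> expected torch.Size([1, 256])
print(w.zero_point.shape)  # [K/group_size, N]         -> expected torch.Size([1, 256])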

File tree

3 files changed: +591 −143 lines changed

test/quantization/quantize_/workflows/int4/test_int4_tensor.py

Lines changed: 312 additions & 29 deletions
@@ -7,20 +7,79 @@
 import unittest
 
 import torch
+import torch.nn as nn
+import torch.nn.functional as F
 from torch.testing._internal.common_utils import (
     TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )
 
-from torchao.quantization import (
-    Int4WeightOnlyConfig,
-    quantize_,
-)
+from torchao.prototype.moe_quant.utils import MoEQuantConfig
+from torchao.quantization import Int4WeightOnlyConfig, quantize_
 from torchao.quantization.utils import compute_error
-from torchao.utils import (
-    TORCH_VERSION_AT_LEAST_2_8,
-    is_sm_at_least_90,
-)
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_8, is_sm_at_least_90
+
+
+class Experts(nn.Module):
+    def __init__(
+        self,
+        num_local_experts: int,
+        dim: int,
+        hidden_dim: int,
+        dtype: torch.dtype,
+        device: torch.device,
+    ) -> None:
+        super().__init__()
+
+        self.num_local_experts = num_local_experts
+        self.dim = dim
+
+        self.w1: nn.Parameter = nn.Parameter(
+            torch.randn(
+                num_local_experts,
+                dim,
+                hidden_dim,
+                dtype=dtype,
+                device=device,
+            )
+        )
+
+        self.w2: nn.Parameter = nn.Parameter(
+            torch.randn(
+                num_local_experts,
+                hidden_dim,
+                dim,
+                dtype=dtype,
+                device=device,
+            )
+        )
+
+        self.w3: nn.Parameter = nn.Parameter(
+            torch.randn(
+                num_local_experts,
+                dim,
+                hidden_dim,
+                dtype=dtype,
+                device=device,
+            )
+        )
+
+    def forward(
+        self,
+        routed_in_egD: torch.Tensor,  # noqa: N803
+    ) -> torch.Tensor:
+        e = self.num_local_experts
+        D = self.dim
+
+        x_egD = routed_in_egD.view(e, -1, D)
+
+        middle_out_egF = F.silu(torch.bmm(x_egD, self.w1)) * torch.bmm(x_egD, self.w3)
+        out_egD = torch.bmm(middle_out_egF, self.w2)
+        out_egD = out_egD.view(-1, D)
+
+        return out_egD
 
 
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
@@ -61,9 +120,9 @@ def test_slice(self):
         quantize_(dummy, self.config)
         weight1 = dummy.weight.narrow(0, 0, 64)
         weight2 = dummy.weight.narrow(1, 0, 128)
-        self.assertEqual(weight1._data, dummy.weight._data.narrow(0, 0, 64))
+        self.assertEqual(weight1.qdata, dummy.weight.qdata.narrow(0, 0, 64))
         self.assertEqual(weight1.scale, dummy.weight.scale.narrow(1, 0, 64))
-        self.assertEqual(weight2._data, dummy.weight._data.narrow(1, 0, 64))
+        self.assertEqual(weight2.qdata, dummy.weight.qdata.narrow(1, 0, 64))
         self.assertEqual(weight2.scale, dummy.weight.scale.narrow(0, 0, 1))
 
         # check for sliced weight, before and after float8 quantization
@@ -80,31 +139,62 @@ def test_slice(self):
         res = dummy(input)
         assert compute_error(res, res_ref) > 15
 
-    def test_slice_and_copy_(self):
+    def test_slice_preserves_aliasing(self):
+        config = self.config
         l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
         )
-        quantize_(l, self.config)
+        quantize_(l, config)
         param = l.weight
         param_data = param.data
         param_data = param_data.narrow(0, 0, 512)
-        assert param.data._data.data_ptr() == param_data._data.data_ptr()
+        # Making sure the aliasing is preserved in sliced quantized Tensor
+        assert param.data.qdata.data_ptr() == param_data.qdata.data_ptr()
         assert param.data.scale.data_ptr() == param_data.scale.data_ptr()
-        assert param.data.zero_point.data_ptr() == param_data.zero_point.data_ptr()
-        orig_value = param.data._data[0][0].item()
 
-        # dummy_l has random input (shouldn't be 0)
+    def test_slice_and_copy_similar_to_vllm(self):
+        # making sure https://github.com/vllm-project/vllm/blob/90bd2ab6e3eb7e83d3f40d99fc23e6e43834743a/vllm/model_executor/layers/linear.py#L483-L495 works properly
+        # the test is similar to the linked code, but with some hardcoded arguments
+        # and does not use tensor parallelism
+
+        dtype = torch.bfloat16
+        device = "cuda"
+        config = self.config
+        l = torch.nn.Linear(1024, 1024, device="cuda", dtype=dtype)
+        quantize_(l, config)
+
+        # high level, we do a narrow for both param.data and the loaded_weights
+        # and do inplace copy_ to copy from the loaded_weights into param.data
+
+        # simulate loaded_weight
         dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
-        quantize_(dummy_l, self.config)
-        quantized = dummy_l.weight
-        quantized = quantized.narrow(0, 0, 512)
+        # making the weight different
+        dummy_l.weight = torch.nn.Parameter(
+            dummy_l.weight + 2 * torch.randn(1024, 1024, device=device, dtype=dtype),
+            requires_grad=False,
+        )
+        quantize_(dummy_l, config)
 
-        param_data.copy_(quantized)
+        output_dim = 0
+        shard_size = 512
+        for tp_rank in [0, 1]:
+            start_idx = tp_rank * shard_size
+            param = l.weight
+            param_data = param.data
+            param_data = param_data.narrow(output_dim, start_idx, shard_size)
+            orig_value = param_data.qdata[0][0].item()
+            loaded_weight = dummy_l.weight
+            loaded_weight = loaded_weight.narrow(output_dim, start_idx, shard_size)
 
-        # making sure param.data is updated
-        assert param.data._data[0][0] != orig_value
+            # making sure param.data.qdata[0][0] is not the same as loaded_weight.qdata[0][0]
+            assert orig_value != loaded_weight.qdata[0][0]
+            param_data.copy_(loaded_weight)
+            # making sure param.data is updated to loaded_weight
+            assert param_data.qdata[0][0] == loaded_weight.qdata[0][0]
+            assert torch.equal(param_data.scale, loaded_weight.scale)
 
+    @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
     def test_bmm(self):
         class M(torch.nn.Module):
             def __init__(self, weight):
@@ -126,20 +216,213 @@ def forward(self, x):
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)
 
-    def test_to_device(self):
+    @parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),
+            ((32, 128), 64, 256),
+            ((2, 32, 128), 64, 256),
+        ],
+    )
+    def test_to_device(self, sizes):
+        config = self.config
+        M, N, K = sizes
+        dtype = torch.bfloat16
         for device in self.GPU_DEVICES:
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            input_tensor = torch.randn(*M, K, dtype=dtype, device=device)
+            linear = torch.nn.Linear(K, N, dtype=dtype)
+            quantize_(linear, config)
             linear.to(device)
+            linear(input_tensor)
 
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            linear = torch.nn.Linear(K, N, dtype=dtype)
+            quantize_(linear, config)
             linear.to(device=device)
+            linear(input_tensor)
 
-            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            linear = torch.nn.Linear(K, N, dtype=dtype)
+            quantize_(linear, config)
             linear.to(device)
+            linear(input_tensor)
+
+    @parametrize(
+        "sizes",
+        [
+            ((128,), 256, 128),
+            ((32, 128), 64, 256),
+            ((2, 32, 128), 64, 256),
+        ],
+    )
+    def test_cat(self, sizes):
+        config = self.config
+        dtype = torch.bfloat16
+        device = "cuda"
+        M, N, K = sizes
+        linear1 = torch.nn.Linear(K, N, dtype=dtype, device=device)
+        linear2 = torch.nn.Linear(K, N, dtype=dtype, device=device)
+        input_cat1 = torch.randn(*M, K, dtype=dtype, device=device)
+
+        cat_weight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        dummy_linear1 = torch.nn.Linear(K, N, bias=False, dtype=dtype, device=device)
+
+        dummy_linear1.weight = torch.nn.Parameter(cat_weight1)
+        quantize_(dummy_linear1, config)
+
+        quantize_(linear1, config)
+        quantize_(linear2, config)
+
+        cat_qweight1 = torch.cat([linear1.weight, linear2.weight], dim=0)
+        self.assertTrue(cat_qweight1.shape, (2 * N, K))
+        self.assertEqual(
+            dummy_linear1.weight.qdata,
+            cat_qweight1.qdata,
+        )
+        self.assertEqual(
+            dummy_linear1.weight.scale,
+            cat_qweight1.scale,
+        )
+        self.assertEqual(
+            dummy_linear1.weight.zero_point,
+            cat_qweight1.zero_point,
+        )
+
+        # making sure cat_qweight1 can be used for inference
+        dummy_linear1.weight = torch.nn.Parameter(cat_qweight1, requires_grad=False)
+        dummy_linear1(input_cat1)
+
+        # align the scale and zero_point before concatenation
+        linear2.weight.scale = linear1.weight.scale
+        linear2.weight.zero_point = linear1.weight.zero_point
+        cat_qweight2 = torch.cat([linear1.weight, linear2.weight], dim=1)
+        self.assertTrue(cat_qweight2.shape, (N, 2 * K))
+        ref_data = torch.cat(
+            [
+                linear1.weight.qdata,
+                linear2.weight.qdata,
+            ],
+            dim=1,
+        )
+        ref_scale = linear1.weight.scale
+        self.assertEqual(cat_qweight2.qdata, ref_data)
+        self.assertEqual(cat_qweight2.scale, ref_scale)
+
+    def test_moe_weight_reshape_ops(self):
+        """This is testing the op call sequence in saving and loading quantization
+        checkpoints in llama-models for llama4
+        (https://github.com/meta-llama/llama-models/tree/main/models/llama4)
+        """
+        # only per row quantization is supported for bmm
+        dtype = torch.bfloat16
+        device = "cuda"
+
+        bmm_config = self.config
+        moe_config = MoEQuantConfig(bmm_config)
+
+        batch_size = 4
+        num_experts = 2
+        input_dim = 64
+        dim = 128
+        hidden_dim = 256
+
+        moe1 = Experts(num_experts, dim, hidden_dim, dtype, device)
+        moe2 = Experts(num_experts, dim, hidden_dim, dtype, device)
+        moe_combined = Experts(num_experts, dim, 2 * hidden_dim, dtype, device)
+        input = torch.randn(batch_size, input_dim, dim, dtype=dtype, device=device)
+
+        moes = [moe1, moe2]
+
+        for moe in moes:
+            moe(input)
+
+            def filter_fn(module, fqn):
+                return isinstance(module, Experts)
+
+            # need to transpose before quantizing
+            moe.w1 = torch.nn.Parameter(
+                moe.w1.transpose(1, 2).contiguous(), requires_grad=False
+            )
+            moe.w2 = torch.nn.Parameter(
+                moe.w2.transpose(1, 2).contiguous(), requires_grad=False
+            )
+            moe.w3 = torch.nn.Parameter(
+                moe.w3.transpose(1, 2).contiguous(), requires_grad=False
+            )
+
+            quantize_(moe, moe_config, filter_fn=filter_fn)
+
+            before = moe(input)
+
+            # transposing for resharding support since only 2D resharding is supported
+            new_last_dim = moe.w1.shape[-2]
+            moe.w1 = torch.nn.Parameter(
+                moe.w1.transpose(1, 2).reshape(-1, new_last_dim), requires_grad=False
+            )
+            new_last_dim = moe.w2.shape[-2]
+            moe.w2 = torch.nn.Parameter(
+                moe.w2.transpose(1, 2).reshape(-1, new_last_dim), requires_grad=False
+            )
+            new_last_dim = moe.w3.shape[-2]
+            moe.w3 = torch.nn.Parameter(
+                moe.w3.transpose(1, 2).reshape(-1, new_last_dim), requires_grad=False
+            )
+
+            moe.w1 = torch.nn.Parameter(
+                moe.w1.unflatten(0, (num_experts, -1)).squeeze(dim=0),
+                requires_grad=False,
+            )
+            moe.w2 = torch.nn.Parameter(
+                moe.w2.unflatten(0, (num_experts, -1)).squeeze(dim=0),
+                requires_grad=False,
+            )
+            moe.w3 = torch.nn.Parameter(
+                moe.w3.unflatten(0, (num_experts, -1)).squeeze(dim=0),
+                requires_grad=False,
+            )
+
+            # transpose again to recover the original weights
+            moe.w1 = torch.nn.Parameter(moe.w1.transpose(1, 2), requires_grad=False)
+            moe.w2 = torch.nn.Parameter(moe.w2.transpose(1, 2), requires_grad=False)
+            moe.w3 = torch.nn.Parameter(moe.w3.transpose(1, 2), requires_grad=False)
+
+            after = moe(input)
+            self.assertEqual(before, after)
+
+        state_dicts = [moe1.state_dict(), moe2.state_dict()]
+        # align the scale parameter so they can be concatenated
+        for key in ["w1", "w2", "w3"]:
+            weights = [st[key] for st in state_dicts]
+            for i in range(1, len(weights)):
+                weights[i].scale = weights[0].scale
+                weights[i].zero_point = weights[0].zero_point
+
+        def process_key(key: str) -> torch.Tensor:
+            tensors = [s[key] for s in state_dicts]
+            # Note: we have a hacky implementation for cat in user codebase
+            # since it is not implemented correctly before
+            if key == "w2":
+                return torch.cat(tensors, dim=-1)
+            else:
+                return torch.cat(tensors, dim=-2)
+
+        new_state_dict = {}
+        for key in ["w1", "w2", "w3"]:
+            new_state_dict[key] = process_key(key)
+
+        moe_combined.w1 = torch.nn.Parameter(
+            moe_combined.w1.transpose(1, 2), requires_grad=False
+        )
+        moe_combined.w2 = torch.nn.Parameter(
+            moe_combined.w2.transpose(1, 2), requires_grad=False
+        )
+        moe_combined.w3 = torch.nn.Parameter(
+            moe_combined.w3.transpose(1, 2), requires_grad=False
+        )
+        moe_combined.load_state_dict(new_state_dict, assign=True)
+        # make sure it runs
+        moe_combined(input)
+
 
+instantiate_parametrized_tests(TestInt4Tensor)
 
 if __name__ == "__main__":
     run_tests()
