
Commit b6ef500

Integrate PARQ with lowbit Arm CPU kernels (#2622)
* Integrate PARQ with lowbit Arm CPU kernels
* up
1 parent 3515cb6 commit b6ef500

File tree

7 files changed: +493 -4 lines changed
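
This commit connects PARQ-style quantization (StretchedAffineQuantizedTensor weights) to torchao's lowbit dynamic-activation LUT kernels for Arm CPUs. Below is a minimal sketch of the end-to-end flow, distilled from the new test added in this commit; the Sequential model, bit width, and group size are illustrative placeholders rather than part of the commit itself.

# Minimal sketch of the PARQ -> lowbit LUT conversion flow exercised by the new
# test below; module shape, bit width, and group size are illustrative only.
import torch
from torchao.prototype.parq.quant import StretchedUnifTorchaoQuantizer
from torchao.prototype.parq.quant.quant_api import StretchedIntxWeightOnlyConfig
from torchao.prototype.quantization.dynamic_activation_lut import (
    StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig,
)
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerGroup

bit_width = 4
quantizer = StretchedUnifTorchaoQuantizer(bit_width)

model = torch.nn.Sequential(torch.nn.Linear(128, 256, bias=False))

# 1) PARQ-style weight-only quantization (produces StretchedAffineQuantizedTensor weights)
quantize_(
    model,
    StretchedIntxWeightOnlyConfig(
        b=bit_width,
        quant_min=quantizer.quant_min,
        quant_max=quantizer.quant_max,
        granularity=PerGroup(32),
    ),
)

# 2) Convert to the int8-dynamic-activation LUT representation backed by the
#    lowbit Arm CPU kernels
conversion_config = (
    StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig(
        bit_width, PerGroup(32)
    )
)
quantize_(model, conversion_config, filter_fn=conversion_config.get_filter_fn())

After conversion, the linear layers dispatch to the torch.ops.torchao._linear_8bit_act_{b}bit_weight operators on arm64 macOS, which the export test below asserts.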

.github/workflows/torchao_experimental_test.yml

Lines changed: 2 additions & 1 deletion

@@ -53,6 +53,7 @@ jobs:
           pytest torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py
           python torchao/experimental/tests/test_embedding_xbit_quantizer.py
           python torchao/experimental/tests/test_quant_passes.py
+          pytest -s test/prototype/test_dynamic_activation_lut.py
       - name: Run kernels/cpu/aarch64/tests
         if: runner.os == 'macOS'
         run: |

@@ -106,7 +107,7 @@ jobs:
       #     conda run -n test-mps-ops-env pip install torch --index-url "https://download.pytorch.org/whl/nightly/cpu"
       # - name: Print torch version
       #   run: |
-
+
       #     conda run -n test-mps-ops-env python -c "import torch; print(torch.__version__)"
       # - name: Install requirements
       #   run: |
test/prototype/test_dynamic_activation_lut.py

Lines changed: 161 additions & 0 deletions (new file)

@@ -0,0 +1,161 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import platform
import sys
from copy import deepcopy
from dataclasses import dataclass

import pytest
import torch
import torch.nn as nn

from torchao.core.config import AOBaseConfig
from torchao.prototype.parq.quant import StretchedUnifTorchaoQuantizer
from torchao.prototype.parq.quant.quant_api import StretchedIntxWeightOnlyConfig
from torchao.prototype.quantization.dynamic_activation_lut import (
    StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig,
)
from torchao.quantization import quantize_
from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.linear_activation_quantized_tensor import (
    to_linear_activation_quantized,
)
from torchao.quantization.quant_api import (
    _int8_asymm_per_token_quant,
)
from torchao.quantization.transform_module import register_quantize_module_handler

is_arm64_mac = sys.platform == "darwin" and platform.machine() == "arm64"


@dataclass
class Int8DynamicActivationConfig(AOBaseConfig):
    pass


@register_quantize_module_handler(Int8DynamicActivationConfig)
def _int8_dynamic_activation_transform(
    module: nn.Module, config: Int8DynamicActivationConfig
) -> nn.Module:
    weight = module.weight
    weight = to_linear_activation_quantized(weight, _int8_asymm_per_token_quant)
    module.weight = torch.nn.Parameter(weight, requires_grad=False)
    return module


class ToyLinearModel(torch.nn.Module):
    def __init__(self, d1=512, d2=256, d3=128, d4=8):
        super().__init__()
        self.linear1 = torch.nn.Linear(d1, d2, bias=False)
        self.linear2 = torch.nn.Linear(d2, d3, bias=True)
        self.linear3 = torch.nn.Linear(d3, d4, bias=False)

    def example_inputs(
        self,
        lead_dim=(1,),
        dtype=torch.bfloat16,
    ):
        return torch.randn(
            *lead_dim, self.linear1.in_features, dtype=dtype, device="cpu"
        )

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        x = self.linear3(x)
        return x


@pytest.fixture(autouse=True)
def run_before_and_after_tests():
    yield
    torch._dynamo.reset()  # reset cache between tests


@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
@pytest.mark.parametrize("granularity", [PerGroup(32), PerAxis(0)])
@pytest.mark.parametrize("bit_width", [1, 2, 3, 4])
@pytest.mark.parametrize("lead_dim", [(5,), (2, 3)])
@pytest.mark.skipif(not is_arm64_mac, reason="requires arm64 mac")
def test_parq_conversion(dtype, granularity, bit_width, lead_dim):
    quantizer = StretchedUnifTorchaoQuantizer(bit_width)
    config = StretchedIntxWeightOnlyConfig(
        b=bit_width,
        quant_min=quantizer.quant_min,
        quant_max=quantizer.quant_max,
        granularity=granularity,
    )

    parq_model = ToyLinearModel(128, 256, 128, 1).to(dtype)
    activations = parq_model.example_inputs(lead_dim=lead_dim, dtype=dtype)
    quantize_(parq_model, config)

    # Apply int8 dynamic activation quantization to the PARQ model.
    # This will serve as the LUT reference.
    parq_model_with_dyn_quant = deepcopy(parq_model)
    quantize_(
        parq_model_with_dyn_quant,
        Int8DynamicActivationConfig(),
        # We have to explicitly provide filter_fn because the default linear filter
        # excludes modules with AffineQuantizedTensor weights
        filter_fn=lambda m, fqn: isinstance(m, torch.nn.Linear),
    )

    # Convert the PARQ model to a lowbit LUT model
    lut_model = deepcopy(parq_model)
    conversion_config = (
        StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig(
            config.b, config.granularity
        )
    )
    quantize_(lut_model, conversion_config, filter_fn=conversion_config.get_filter_fn())

    # Run both models and compare
    parq_out = parq_model(activations)
    parq_with_dyn_quant_out = parq_model_with_dyn_quant(activations)
    lut_out = lut_model(activations)

    assert torch.allclose(parq_out, parq_with_dyn_quant_out, atol=1e-1, rtol=1e-1)
    if dtype == torch.float32:
        assert torch.allclose(lut_out, parq_with_dyn_quant_out, atol=1e-4, rtol=1e-4)
    elif dtype == torch.bfloat16:
        assert torch.allclose(lut_out, parq_with_dyn_quant_out, atol=1e-2, rtol=1e-2)
    else:
        raise ValueError(f"Unsupported dtype {dtype}")


@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16])
@pytest.mark.parametrize("granularity", [PerGroup(32), PerAxis(0)])
@pytest.mark.parametrize("bit_width", [1, 2, 3, 4])
@pytest.mark.parametrize("lead_dim", [(5,), (2, 3)])
@pytest.mark.skipif(not is_arm64_mac, reason="requires arm64 mac")
def test_export(dtype, granularity, bit_width, lead_dim):
    quantizer = StretchedUnifTorchaoQuantizer(bit_width)
    config = StretchedIntxWeightOnlyConfig(
        b=bit_width,
        quant_min=quantizer.quant_min,
        quant_max=quantizer.quant_max,
        granularity=granularity,
    )

    parq_model = ToyLinearModel(128, 256, 128, 8).to(dtype)
    activations = parq_model.example_inputs(lead_dim=lead_dim)
    quantize_(parq_model, config)

    conversion_config = (
        StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig(
            config.b, config.granularity
        )
    )
    quantize_(
        parq_model, conversion_config, filter_fn=conversion_config.get_filter_fn()
    )

    ep = torch.export.export(parq_model, (activations,))
    assert (
        f"torch.ops.torchao._linear_8bit_act_{bit_width}bit_weight.default"
        in ep.graph_module.code
    )

torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_selector.h

Lines changed: 1 addition & 1 deletion

@@ -184,7 +184,7 @@ void register_ukernel_config_lut(
   namespace kernel = torchao::kernels::cpu::aarch64::linear::
       channelwise_8bit_activation_groupwise_lowbit_weight;

-  if (cpuinfo_has_arm_neon_dot()) {
+  if (!cpuinfo_has_arm_neon_dot()) {
     return;
   }
   if (format.has_weight_zeros) {

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h

Lines changed: 3 additions & 2 deletions

@@ -251,6 +251,7 @@ Tensor pack_weights_with_lut_cpu(
       "weight_scales must be float32");
   TORCHAO_CHECK(weight_scales.dim() == 1, "weight_scales must be 1D");
   TORCHAO_CHECK(group_size >= 1, "group_size must be >= 1");
+  TORCHAO_CHECK(group_size % 16 == 0, "group_size must be a multiple of 16");
   TORCHAO_CHECK(
       weight_scales.size(0) == ((n * k) / group_size),
       "expected 1 scale per group");

@@ -285,8 +286,8 @@ Tensor pack_weights_with_lut_cpu(
       weight_nbit>(target, has_weight_zeros, has_bias);
   TORCHAO_CHECK(packed_weights_format.nr == 8, "nr must be 8");
   TORCHAO_CHECK(
-      lut_channel_group_size % 8 == 0,
-      "the lut_channel_group_size must be a multiple of nr (8)");
+      lut_channel_group_size == n || lut_channel_group_size % 8 == 0,
+      "the lut_channel_group_size must be n or a multiple of nr (8)");

   auto packed_weights_header = packed_weights_format.to_packed_weights_header();
   auto uk = torchao::ops::linear_8bit_act_xbit_weight::select_ukernel_config<
torchao/prototype/quantization/dynamic_activation_lut/__init__.py

Lines changed: 7 additions & 0 deletions (new file)

@@ -0,0 +1,7 @@
from .api import StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig
from .int8_dynamic_activation_lut_tensor import Int8DynamicActivationLutTensor

__all__ = [
    "StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig",
    "Int8DynamicActivationLutTensor",
]
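
Both public entry points can be imported straight from the new subpackage; a minimal import line (the test above pulls in the config this way) would look like:

from torchao.prototype.quantization.dynamic_activation_lut import (
    Int8DynamicActivationLutTensor,
    StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig,
)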
torchao/prototype/quantization/dynamic_activation_lut/api.py

Lines changed: 83 additions & 0 deletions (new file)

@@ -0,0 +1,83 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass
from typing import Callable

import torch
import torch.nn as nn

from torchao.core.config import AOBaseConfig
from torchao.prototype.parq.quant.quant_api import StretchedAffineQuantizedTensor
from torchao.prototype.quantization.dynamic_activation_lut.int8_dynamic_activation_lut_tensor import (
    Int8DynamicActivationLutTensor,
)
from torchao.quantization.granularity import Granularity, PerAxis, PerGroup
from torchao.quantization.quant_primitives import _DTYPE_TO_QVALUE_BOUNDS
from torchao.quantization.transform_module import register_quantize_module_handler


@dataclass
class StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig(
    AOBaseConfig
):
    bit_width: int
    granularity: Granularity

    def get_filter_fn(self) -> Callable[[nn.Module, str], bool]:
        return lambda m, fqn: isinstance(m, torch.nn.Linear) and isinstance(
            m.weight, StretchedAffineQuantizedTensor
        )


@register_quantize_module_handler(
    StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig
)
def _(
    module: nn.Module,
    config: StretchedAffineQuantizedTensor_to_Int8DynamicActivationLutTensorConfig,
) -> nn.Module:
    weight = module.weight
    bias = module.bias
    assert isinstance(weight, StretchedAffineQuantizedTensor)

    b = config.bit_width
    granularity = config.granularity
    if isinstance(granularity, PerGroup):
        group_size = granularity.group_size
    elif isinstance(granularity, PerAxis):
        assert granularity.axis == 0, (
            f"axis must be 0 with PerAxis, but got {granularity.axis}"
        )
        group_size = weight.shape[-1]
    else:
        raise ValueError(f"granularity must be PerGroup or PerAxis, got {granularity}")

    int_data, scale, zero_point = weight.tensor_impl.get_plain()
    q_min, q_max = _DTYPE_TO_QVALUE_BOUNDS[getattr(torch, f"int{b}")]

    # Construct LUT as 2 * ([q_min, q_max] - zero_point), i.e. 2 * q + 1 since zero_point == -0.5
    assert torch.all(zero_point == -0.5)
    lut = torch.arange(q_min, q_max + 1)
    lut = 2 * lut + 1

    # Construct idx values
    qval_idx = int_data - q_min

    # Construct scale
    scale = scale.reshape(-1).to(torch.float32)
    scale = 0.5 * scale  # since we multiply LUT values by 2

    weight_tensor = Int8DynamicActivationLutTensor.from_plain(
        qval_idx,
        lut,
        scale,
        group_size,
        bias.to(torch.float32) if bias is not None else None,
    )
    module.weight = torch.nn.Parameter(weight_tensor, requires_grad=False)
    module.bias = None
    return module
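
The transform above folds the stretched affine dequantization (q - zero_point) * s, with zero_point fixed at -0.5, into an integer LUT of odd values 2 * q + 1 plus a halved per-group scale. Below is a quick standalone check of that identity; it is a sketch, and the int3 bounds of [-4, 3] are an assumption matching _DTYPE_TO_QVALUE_BOUNDS for b=3.

# Illustrative sanity check that the LUT construction preserves dequantized values:
# (q - zero_point) * s == lut[q - q_min] * (0.5 * s)
# when zero_point == -0.5 and lut[i] == 2 * (q_min + i) + 1.
import torch

q_min, q_max = -4, 3   # assumed int3 bounds for b = 3
zero_point = -0.5
s = 0.017              # arbitrary per-group scale

q = torch.arange(q_min, q_max + 1)
reference = (q - zero_point) * s              # stretched affine dequantization
lut = 2 * torch.arange(q_min, q_max + 1) + 1  # [-7, -5, ..., 5, 7]
via_lut = lut[q - q_min] * (0.5 * s)          # LUT entry times the halved scale

assert torch.allclose(reference, via_lut)

Doubling keeps the LUT entries integral (odd integers), while the halved per-group scale restores the original magnitudes.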
