|
27 | 27 | from torch.fx import Node
|
28 | 28 |
|
29 | 29 | import torchao
|
30 |
| -from torchao.quantization import Granularity |
31 | 30 | from torchao.quantization.pt2e.utils import (
|
32 | 31 | calculate_qmin_qmax,
|
33 | 32 | check_min_max_valid,
|
|
68 | 67 | "ReuseInputObserver",
|
69 | 68 | "UniformQuantizationObserverBase",
|
70 | 69 | "AffineQuantizedObserverBase",
|
| 70 | + "Granularity", |
71 | 71 | "MappingType",
|
| 72 | + "PerAxis", |
| 73 | + "PerBlock", |
| 74 | + "PerGroup", |
| 75 | + "PerRow", |
| 76 | + "PerTensor", |
| 77 | + "PerToken", |
72 | 78 | "TorchAODType",
|
73 | 79 | "ZeroPointDomain",
|
| 80 | + "get_block_size", |
74 | 81 | ]
|
75 | 82 |
|
76 | 83 |
|
@@ -1615,6 +1622,7 @@ def calculate_qparams(self):
|
1615 | 1622 | We plan to merge the following with the torchao repo after we move the pt2e flow to torchao.
1616 | 1623 | Copied from https://github.com/pytorch/ao/blob/main/torchao/quantization/observer.py
|
1617 | 1624 | """
|
| 1625 | +from dataclasses import dataclass |
1618 | 1626 | from enum import Enum, auto
|
1619 | 1627 |
|
1620 | 1628 |
|
@@ -1671,6 +1679,139 @@ class TorchAODType(Enum):
|
1671 | 1679 | INT7 = auto()
|
1672 | 1680 |
|
1673 | 1681 |
|
@dataclass(frozen=True)
class Granularity:
    """
    Common ancestor for all quantization-granularity descriptors.

    Concrete subclasses (e.g. per-tensor, per-axis) describe how
    quantization parameters are grouped across a tensor's elements.
    """
| 1690 | + |
| 1691 | + |
@dataclass(frozen=True)
class PerBlock(Granularity):
    """
    Per-block quantization granularity.

    See :func:`~torchao.quantization.quant_primitives.quantize_affine`
    for the documentation of `block_size`.

    Attributes:
        block_size (tuple[int, ...]): The size of each quantization group
    """

    block_size: tuple[int, ...]
| 1704 | + |
| 1705 | + |
@dataclass(frozen=True)
class PerTensor(Granularity):
    """
    Per-tensor quantization granularity.

    A single set of quantization parameters is derived from the
    entire tensor.
    """
| 1715 | + |
| 1716 | + |
@dataclass(frozen=True)
class PerAxis(Granularity):
    """
    Per-axis quantization granularity.

    A separate set of quantization parameters is computed for each
    slice of the tensor along the specified axis.

    For example, with an input tensor of shape [8, 16] and axis=0,
    parameters are computed per row, giving 8 sets in total.

    Attributes:
        axis (int): The axis along which reduction is performed.
    """

    axis: int
| 1734 | + |
| 1735 | + |
@dataclass(frozen=True)
class PerGroup(Granularity):
    """
    Represents per-channel group granularity in quantization.

    This granularity type calculates different quantization parameters
    for each group of <group_size> elements.

    For example if the input tensor is shape [8, 16] (128 elements), and the
    group size is 4, then the input tensor is reshaped to [32, 4] and
    quantization parameters are calculated for each group of 4 elements,
    giving a total of 32 quantization parameters.

    Attributes:
        group_size (int): The size of each quantization group

    """

    group_size: int
| 1755 | + |
| 1756 | + |
# NOTE(review): decorated as a frozen dataclass for consistency with the other
# Granularity subclasses; the generated __eq__/__repr__/frozen __setattr__ are
# equivalent to those already inherited from the frozen base, so behavior is
# unchanged.
@dataclass(frozen=True)
class PerRow(Granularity):
    """
    Represents row-wise granularity in quantization.

    This is a special case of per-axis quantization and is unique to Float8
    matmuls where the input is quantized with a block_size of
    (1, ..., input.shape[-1]). And the weight is quantized with a
    block_size of (1, weight.shape[1]).
    """
| 1765 | + |
| 1766 | + |
# NOTE(review): decorated as a frozen dataclass for consistency with the other
# Granularity subclasses; behavior is unchanged since the generated methods
# match those inherited from the frozen base.
@dataclass(frozen=True)
class PerToken(Granularity):
    """
    Represents per-token granularity in quantization.

    This granularity type calculates a different set of quantization
    parameters for each token, which is represented as the last dimension
    of the tensor.

    For example, if the input tensor has shape [2, 3, 4], then there are
    6 tokens with 4 elements each, and we will calculate 6 sets of
    quantization parameters, one for each token.

    If the input tensor has only two dimensions, e.g. [8, 16], then this is
    equivalent to `PerAxis(axis=0)`, which yields 8 sets of quantization
    parameters.
    """
| 1781 | + |
| 1782 | + |
def get_block_size(
    input_shape: tuple[int, ...], granularity: Granularity
) -> tuple[int, ...]:
    """Get the block size based on the input shape and granularity type.

    Args:
        input_shape: The input tensor shape possibly more than 2 dimensions
        granularity: The granularity type of the quantization

    Returns:
        The block size tuple matching `input_shape`'s rank, suitable for
        the affine quantization primitives.

    Raises:
        ValueError: If the granularity type is not one of the supported
            subclasses handled below.
    """
    assert isinstance(granularity, Granularity), (
        "Please provide an instance of Granularity, not subclass of it"
    )
    if isinstance(granularity, PerTensor):
        # One set of qparams for the whole tensor: the block is the tensor.
        return input_shape
    elif isinstance(granularity, PerAxis):
        # Reduce along every dimension except the chosen axis.
        block_size = list(input_shape)
        block_size[granularity.axis] = 1
        return tuple(block_size)
    elif isinstance(granularity, PerRow):
        # One set of qparams per last-dimension slice: (1, ..., 1, last_dim).
        return (1,) * (len(input_shape) - 1) + (input_shape[-1],)
    elif isinstance(granularity, PerGroup):
        assert len(input_shape) == 2, (
            f"Expecting input shape dim to be 2 for per group quantization, got input shape: {input_shape}"
        )
        return (1, granularity.group_size)
    elif isinstance(granularity, PerToken):
        # A token is the last dimension; one set of qparams per token.
        block_size = [1] * len(input_shape)
        block_size[-1] = input_shape[-1]
        return tuple(block_size)
    raise ValueError(f"Unsupported Granularity: {granularity}")
| 1813 | + |
| 1814 | + |
1674 | 1815 | class AffineQuantizedObserverBase(ABC, torch.nn.Module):
|
1675 | 1816 | """Observer module for affine quantization (https://github.com/pytorch/ao/tree/main/torchao/quantization#affine-quantization)
|
1676 | 1817 |
|
|
0 commit comments