Skip to content

Commit dd11851

Browse files
refactor packing utils into quantizers
1 parent c02e302 commit dd11851

File tree

5 files changed

+72
-63
lines changed

5 files changed

+72
-63
lines changed

keras/api/_tf_keras/keras/quantizers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from keras.src.quantizers.quantizers import (
2020
fake_quant_with_min_max_vars as fake_quant_with_min_max_vars,
2121
)
22+
from keras.src.quantizers.quantizers import pack_int4 as pack_int4
2223
from keras.src.quantizers.quantizers import (
2324
quantize_and_dequantize as quantize_and_dequantize,
2425
)
26+
from keras.src.quantizers.quantizers import unpack_int4 as unpack_int4

keras/api/quantizers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919
from keras.src.quantizers.quantizers import (
2020
fake_quant_with_min_max_vars as fake_quant_with_min_max_vars,
2121
)
22+
from keras.src.quantizers.quantizers import pack_int4 as pack_int4
2223
from keras.src.quantizers.quantizers import (
2324
quantize_and_dequantize as quantize_and_dequantize,
2425
)
26+
from keras.src.quantizers.quantizers import unpack_int4 as unpack_int4

keras/src/layers/core/dense.py

Lines changed: 4 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -467,7 +467,7 @@ def _int4_call(self, inputs, training=None):
467467

468468
@ops.custom_gradient
469469
def matmul_with_inputs_gradient(inputs, kernel, kernel_scale):
470-
unpacked_kernel = self._unpack_int4_ops(
470+
unpacked_kernel = quantizers.unpack_int4(
471471
kernel, self._orig_input_dim
472472
)
473473

@@ -623,7 +623,7 @@ def quantize(self, mode, type_check=True):
623623
)
624624
kernel_scale = ops.squeeze(kernel_scale, axis=0)
625625
# 2. Pack two int4 values into a single int8 byte.
626-
packed_kernel_value, packed_shape, orig_rows = self._pack_int4_ops(
626+
packed_kernel_value, _, orig_rows = quantizers.pack_int4(
627627
kernel_value_int4
628628
)
629629
del self._kernel
@@ -658,7 +658,7 @@ def _get_kernel_with_merged_lora(self):
658658
# update, and then pack it again after requantization.
659659
if self.quantization_mode == "int4":
660660
# 1) Unpack packed int4 tensor to int8 range [-8, 7].
661-
unpacked_kernel = self._unpack_int4_ops(
661+
unpacked_kernel = quantizers.unpack_int4(
662662
kernel_value, self._orig_input_dim
663663
)
664664
# 2) De-scale to recover float32 kernel.
@@ -679,7 +679,7 @@ def _get_kernel_with_merged_lora(self):
679679
)
680680
kernel_scale = ops.squeeze(kernel_scale, axis=0)
681681
# 5) Pack the int4 values back into the compact int8 layout.
682-
kernel_value, _, _ = self._pack_int4_ops(kernel_int4)
682+
kernel_value, _, _ = quantizers.pack_int4(kernel_int4)
683683
else:
684684
# int8 path (regular): unpacking not required.
685685
kernel_value = ops.divide(kernel_value, kernel_scale)
@@ -694,62 +694,3 @@ def _get_kernel_with_merged_lora(self):
694694
kernel_scale = ops.squeeze(kernel_scale, axis=0)
695695
return kernel_value, kernel_scale
696696
return self.kernel, None
697-
698-
def _pack_int4_ops(self, arr):
699-
"""Pack an int4 tensor into an int8 tensor with packed nibbles.
700-
701-
Accepts a Keras-compatible tensor. The input values must already be int8
702-
in the signed range ``[-8, 7]`` and represent the desired int4 values.
703-
Packing is performed along axis 0:
704-
705-
* For every two consecutive rows, the **low nibble** of the output byte
706-
stores the value from the first row, and the **high nibble** stores
707-
the value from the second row.
708-
709-
Returns a tuple ``(packed, packed_shape, orig_rows)`` where ``packed``
710-
is the packed ``int8`` tensor, ``packed_shape`` is its shape, and
711-
``orig_rows`` is the original (unpacked) row count prior to any padding
712-
that may have been inserted when an odd number of rows is supplied.
713-
"""
714-
if arr.dtype != "int8":
715-
raise TypeError("Expected int8 tensor for packing")
716-
717-
shape = ops.shape(arr)
718-
rows, cols = shape[0], shape[1]
719-
720-
orig_rows = rows
721-
if rows % 2 == 1:
722-
padding_row = ops.zeros((1, cols), dtype="int8")
723-
arr = ops.concatenate([arr, padding_row], axis=0)
724-
rows += 1
725-
726-
# Map signed [-8,7] to unsigned 4-bit two's complement (0..15)
727-
arr_u = ops.where(arr < 0, arr + 16, arr)
728-
arr_u = ops.cast(arr_u, "uint8")
729-
arr_u = ops.reshape(arr_u, (rows // 2, 2, cols))
730-
low = arr_u[:, 0, :]
731-
high = arr_u[:, 1, :]
732-
packed = ops.bitwise_or(ops.left_shift(high, 4), low)
733-
packed = ops.cast(packed, "int8")
734-
return packed, ops.shape(packed), orig_rows
735-
736-
@staticmethod
737-
def _unpack_int4_ops(packed, orig_rows):
738-
"""Unpack packed int4 tensor (ops) to int8 [-8,7]."""
739-
# Bitwise operations work element-wise.
740-
low = ops.bitwise_and(packed, 0x0F)
741-
high = ops.right_shift(packed, 4)
742-
high = ops.bitwise_and(high, 0x0F)
743-
744-
def _to_signed(x):
745-
return ops.where(x < 8, x, ops.subtract(x, 16))
746-
747-
low = _to_signed(low)
748-
high = _to_signed(high)
749-
750-
# Interleave rows back: stacked shape (2, packed_rows, cols)
751-
stacked = ops.stack([low, high], axis=1) # (pairs, 2, cols)
752-
unpacked_full = ops.reshape(stacked, (-1, stacked.shape[-1]))
753-
# Remove potential padded row.
754-
unpacked = unpacked_full[:orig_rows, :]
755-
return unpacked

keras/src/quantizers/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,9 @@
77
from keras.src.quantizers.quantizers import compute_float8_amax_history
88
from keras.src.quantizers.quantizers import compute_float8_scale
99
from keras.src.quantizers.quantizers import fake_quant_with_min_max_vars
10+
from keras.src.quantizers.quantizers import pack_int4
1011
from keras.src.quantizers.quantizers import quantize_and_dequantize
12+
from keras.src.quantizers.quantizers import unpack_int4
1113
from keras.src.saving import serialization_lib
1214
from keras.src.utils.naming import to_snake_case
1315

keras/src/quantizers/quantizers.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,3 +374,65 @@ def quantize_and_dequantize(inputs, scale, quantized_dtype, compute_dtype):
374374
# Dequantize
375375
x = ops.multiply(ops.cast(x, compute_dtype), ops.cast(scale, compute_dtype))
376376
return x
377+
378+
379+
@keras_export("keras.quantizers.pack_int4")
def pack_int4(arr):
    """Pack a tensor of int4 values (stored as int8) into nibble pairs.

    The input must be an ``int8`` tensor whose values already lie in the
    signed 4-bit range ``[-8, 7]``. Packing runs along axis 0: each pair
    of consecutive rows is fused into one byte, with the first row of the
    pair stored in the low nibble and the second row in the high nibble.
    When the row count is odd, a zero row is appended before packing.

    Returns:
        A tuple ``(packed, packed_shape, orig_rows)`` where ``packed`` is
        the packed ``int8`` tensor, ``packed_shape`` is its shape, and
        ``orig_rows`` is the row count of the input prior to any padding.
    """
    if arr.dtype != "int8":
        raise TypeError("Expected int8 tensor for packing")

    num_rows = ops.shape(arr)[0]
    num_cols = ops.shape(arr)[1]
    orig_rows = num_rows

    # Guarantee an even number of rows so they can be grouped in pairs.
    if num_rows % 2 == 1:
        zero_row = ops.zeros((1, num_cols), dtype="int8")
        arr = ops.concatenate([arr, zero_row], axis=0)
        num_rows += 1

    # Convert signed values to their unsigned two's-complement nibble
    # representation (0..15) before doing bit manipulation.
    unsigned = ops.where(arr < 0, arr + 16, arr)
    unsigned = ops.cast(unsigned, "uint8")
    pairs = ops.reshape(unsigned, (num_rows // 2, 2, num_cols))
    low_nibble = pairs[:, 0, :]
    high_nibble = pairs[:, 1, :]
    packed = ops.bitwise_or(ops.left_shift(high_nibble, 4), low_nibble)
    packed = ops.cast(packed, "int8")
    return packed, ops.shape(packed), orig_rows
417+
418+
419+
@keras_export("keras.quantizers.unpack_int4")
def unpack_int4(packed, orig_rows):
    """Recover int8 values in ``[-8, 7]`` from a nibble-packed tensor.

    Args:
        packed: ``int8`` tensor produced by ``pack_int4``; each byte holds
            two 4-bit values, low nibble first.
        orig_rows: Row count of the tensor before packing, used to strip
            the padding row inserted when the original count was odd.

    Returns:
        The unpacked ``int8`` tensor with values in ``[-8, 7]``.
    """
    # Split every byte into its two unsigned nibbles (element-wise ops).
    low_nibble = ops.bitwise_and(packed, 0x0F)
    high_nibble = ops.bitwise_and(ops.right_shift(packed, 4), 0x0F)

    def _as_signed(nibble):
        # Map the unsigned nibble range 0..15 back to two's-complement
        # signed values -8..7.
        return ops.where(nibble < 8, nibble, ops.subtract(nibble, 16))

    low_nibble = _as_signed(low_nibble)
    high_nibble = _as_signed(high_nibble)

    # Re-interleave rows: stacking to (pairs, 2, cols) and flattening
    # restores each pair's low row immediately before its high row.
    interleaved = ops.stack([low_nibble, high_nibble], axis=1)
    flat = ops.reshape(interleaved, (-1, interleaved.shape[-1]))
    # Drop the zero row that pack_int4 may have appended for odd inputs.
    return flat[:orig_rows, :]

0 commit comments

Comments
 (0)