Using generic implementation for 16-bit activations and 8-bit weights for matmul in backends

RahulC7 · web-flow · commit a09a4b74feb4 · 2025-12-04T21:31:47.000Z
Differential Revision: D87997149 Pull Request resolved: #16008
diff --git a/backends/cadence/aot/quantizer/quantizer.py b/backends/cadence/aot/quantizer/quantizer.py
@@ -386,3 +386,16 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         quantizers.append(CadenceAtenQuantizer(Conv1dPattern(), qconfig_A16))
         quantizers.append(CadenceAtenQuantizer(Conv2dPattern(), qconfig_A16))
         super().__init__(quantizers)
+
+
+class CadenceWith16BitMatmulActivationsQuantizer(CadenceQuantizer):
+    """
+    Quantizer that adds 16-bit-activation (A16) quantization for MatmulPattern.
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = []
+        # Add the 16-bit (A16) quantizer for MatmulPattern
+        quantizers.append(CadenceAtenQuantizer(MatmulPattern(), qconfig_A16))
+        super().__init__(quantizers)
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/op_quantized_matmul_out.cpp
@@ -8,6 +8,7 @@
 
 #include <executorch/backends/cadence/hifi/kernels/kernels.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
+#include <on_device_ai/Assistant/Jarvis/min_runtime/operators/generic/op_quantized_matmul.h>
 #include <stdlib.h>
 
 using executorch::aten::ScalarType;
@@ -192,8 +193,20 @@ void quantized_matmul_out(
   size_t leading_dim = X.size(X.dim() - 2);
   size_t out_dim = Y.size(Y.dim() - 1 - transposed);
   size_t in_dim = X.size(X.dim() - 1);
-
-  if (out.scalar_type() == exec_aten::ScalarType::Byte) {
+  if (out.scalar_type() == exec_aten::ScalarType::Short) {
+    ::impl::generic::native::quantized_matmul_out(
+        ctx,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        out);
+  } else if (out.scalar_type() == exec_aten::ScalarType::Byte) {
     _typed_quantized_matmul<uint8_t>(
         ctx,
         X,
diff --git a/backends/cadence/hifi/operators/op_quantized_matmul_out.h b/backends/cadence/hifi/operators/op_quantized_matmul_out.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include "executorch/runtime/core/exec_aten/exec_aten.h"
+#include "executorch/runtime/kernel/kernel_runtime_context.h"
+
+namespace impl {
+namespace HiFi {
+namespace native {
+
+::executorch::aten::Tensor& quantized_matmul_out(
+    ::executorch::runtime::KernelRuntimeContext& ctx,
+    const ::executorch::aten::Tensor& X,
+    int64_t X_zero_point,
+    const ::executorch::aten::Tensor& Y,
+    int64_t Y_zero_point,
+    const ::executorch::aten::optional<::executorch::aten::Tensor>& bias,
+    int64_t out_multiplier,
+    int64_t out_shift,
+    int64_t out_zero_point,
+    bool transposed,
+    ::executorch::aten::Tensor& out);
+
+} // namespace native
+} // namespace HiFi
+} // namespace impl
diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl
@@ -2,7 +2,7 @@ load("@fbsource//tools/build_defs:platform_defs.bzl", "CXX")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 
-def define_operator(name: str, deps: list[str] | None = None) -> None:
+def define_operator(name: str, deps: list[str] | None = None, exported_headers: list[str] | None = None) -> None:
     op_name = "op_{}".format(name)
 
     # Deps used by all operators.
@@ -21,6 +21,8 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
     ]
     if deps == None:
         deps = []
+    if exported_headers == None:
+        exported_headers = ["operators.h"]
 
     runtime.cxx_library(
         name = op_name,
@@ -32,7 +34,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         ],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
-        exported_headers = ["operators.h"],
+        exported_headers = exported_headers,
     )
 
 OPERATORS = [
@@ -87,7 +89,6 @@ OPERATORS = [
     "quantized_layer_norm",
     "quantized_linear_asym8sxasym8s_asym8s_per_tensor_out",
     "quantized_linear_asym8uxasym8u_asym8u_per_tensor_out",
-    "quantized_matmul_out",
     "quantized_matmul_asym8sxasym8s_asym8s_out",
     "quantized_matmul_asym8uxasym8u_asym8u_out",
     "quantized_relu_out",
@@ -127,3 +128,6 @@ def define_common_targets():
     # quantized_conv2d_nchw_out and quantized_conv2d_nhwc_out need additional dependency for int16 support
     define_operator("quantized_conv2d_nchw_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
     define_operator("quantized_conv2d_nhwc_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_conv2d"])
+
+    # quantized_matmul_out needs an additional dependency (generic op_quantized_matmul) for int16 support
+    define_operator("quantized_matmul_out", deps=["fbcode//on_device_ai/Assistant/Jarvis/min_runtime/operators/generic:op_quantized_matmul"], exported_headers=["op_quantized_matmul_out.h"])
diff --git a/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp b/backends/cadence/hifi/operators/tests/test_op_quantized_matmul_out.cpp
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/cadence/hifi/operators/op_quantized_matmul_out.h>
+
+#include <executorch/kernels/test/TestUtil.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
+#include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <gtest/gtest.h>
+
+namespace impl {
+namespace HiFi {
+namespace native {
+namespace {
+
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::aten::TensorImpl;
+using ::executorch::runtime::Error;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::executorch::runtime::runtime_init;
+using ::executorch::runtime::testing::TensorFactory;
+
+class HiFiQuantizedMatmulTest : public OperatorTest {
+ public:
+ protected:
+  Tensor& quantized_matmul_out(
+      const Tensor& X,
+      int64_t X_zero_point,
+      const Tensor& Y,
+      int64_t Y_zero_point,
+      const std::optional<Tensor>& bias,
+      int64_t out_multiplier,
+      int64_t out_shift,
+      int64_t out_zero_point,
+      bool transposed,
+      Tensor& output) {
+    return impl::HiFi::native::quantized_matmul_out(
+        context_,
+        X,
+        X_zero_point,
+        Y,
+        Y_zero_point,
+        bias,
+        out_multiplier,
+        out_shift,
+        out_zero_point,
+        transposed,
+        output);
+  }
+};
+
+// Test quantized_matmul_out with int16 activations and int8 weights
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16Test) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Minimal test case: X [2, 2] x Y [2, 2] = output [2, 2]
+  // Small enough to verify by hand calculation
+  //
+  // X (2x2):          Y (2x2):
+  // 2  4              1  2
+  // 6  8              1  0
+  //
+  // Hand calculation for matmul (before scaling):
+  // (0,0): 2*1 + 4*1 = 6
+  // (0,1): 2*2 + 4*0 = 4
+  // (1,0): 6*1 + 8*1 = 14
+  // (1,1): 6*2 + 8*0 = 12
+  //
+  // Raw result: [[6, 4], [14, 12]]
+  // After 0.5 scaling: [[3, 2], [7, 6]]
+  Tensor X = tf_int16.make({2, 2}, {2, 4, 6, 8});
+  Tensor Y = tf_int8.make({2, 2}, {1, 2, 1, 0});
+  Tensor bias = tf_int32.zeros({2});
+  Tensor output = tf_int16.zeros({2, 2});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      false, // transposed
+      output);
+
+  Tensor expected = tf_int16.make({2, 2}, {3, 2, 7, 6});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+// Test quantized_matmul_out with transposed Y (int16 activations and int8
+// weights)
+TEST_F(HiFiQuantizedMatmulTest, QuantizedMatmulInt16TransposedTest) {
+  TensorFactory<ScalarType::Short> tf_int16;
+  TensorFactory<ScalarType::Int> tf_int32;
+  TensorFactory<ScalarType::Char> tf_int8;
+
+  // Transposed case: X [2, 2] @ Y_stored^T [2, 2] = output [2, 2]
+  // Y is passed in already transposed (as Y_stored), so the result is X @ Y
+  //
+  // X (2x2):          Y_stored (2x2, which is Y^T):
+  // 2  4              1  1
+  // 6  8              2  0
+  //
+  // When transposed=true, we compute X @ Y_stored^T = X @ Y
+  // Y = Y_stored^T = [[1, 2], [1, 0]]
+  //
+  // Hand calculation for matmul (before scaling):
+  // (0,0): 2*1 + 4*1 = 6
+  // (0,1): 2*2 + 4*0 = 4
+  // (1,0): 6*1 + 8*1 = 14
+  // (1,1): 6*2 + 8*0 = 12
+  //
+  // Raw result: [[6, 4], [14, 12]]
+  // After 0.5 scaling: [[3, 2], [7, 6]]
+  Tensor X = tf_int16.make({2, 2}, {2, 4, 6, 8});
+  Tensor Y = tf_int8.make({2, 2}, {1, 1, 2, 0}); // Stored as Y^T
+  Tensor bias = tf_int32.zeros({2});
+  Tensor output = tf_int16.zeros({2, 2});
+
+  int64_t X_zero_point = 0;
+  int64_t Y_zero_point = 0;
+  int64_t out_multiplier = 1073741824; // 0.5 * 2^31
+  int64_t out_shift = 0;
+  int64_t out_zero_point = 0;
+
+  quantized_matmul_out(
+      X,
+      X_zero_point,
+      Y,
+      Y_zero_point,
+      bias,
+      out_multiplier,
+      out_shift,
+      out_zero_point,
+      true, // transposed
+      output);
+
+  Tensor expected = tf_int16.make({2, 2}, {3, 2, 7, 6});
+  EXPECT_TENSOR_EQ(output, expected);
+}
+
+} // namespace
+} // namespace native
+} // namespace HiFi
+} // namespace impl