
Commit 7990205

hsharma35 authored and meta-codesync[bot] committed
Migrate more generic cadence operators to oss.
Summary:
* op_fully_connected
* op_idma_copy
* op_idma_wait
* op_requantize
* op_transposed_convolution
* op_transposed_im2row

Differential Revision: D88084935
1 parent cef5af1 commit 7990205

13 files changed: +1,399 −22 lines
backends/cadence/generic/operators/op_fully_connected.cpp

Lines changed: 67 additions & 0 deletions
@@ -0,0 +1,67 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_fully_connected.h>

#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::optional;
using ::executorch::aten::Tensor;
using ::executorch::runtime::getLeadingDims;
using ::executorch::runtime::KernelRuntimeContext;

void linear(
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output) {
  const float* __restrict__ input_data = input.const_data_ptr<float>();
  const float* __restrict__ weight_data = weight.const_data_ptr<float>();
  // Note: bias is assumed present; bias.value() is not checked here.
  const float* __restrict__ bias_data = bias.value().const_data_ptr<float>();
  float* __restrict__ output_data = output.mutable_data_ptr<float>();

  // input comes in shape [batch_size, in_dim]
  // weight comes in shape [out_dim, in_dim]
  // output comes in empty with shape [batch_size, out_dim]
  // Perform matrix multiply (M x N) x (N x P) => M x P
  int64_t M = weight.size(0); // = out_dim
  int64_t N = weight.size(1); // = in_dim

  // Given an N-dimensional input [d0, d1, d2, ..., d_{N-2}, d_{N-1}], the
  // leading dimension is d0 * d1 * ... * d_{N-2}
  int64_t leading_dims = getLeadingDims(input, input.dim() - 1);

  for (int i = 0; i < leading_dims; ++i) {
    for (int j = 0; j < M; ++j) {
      float sum = bias_data[j];
      for (int k = 0; k < N; ++k) {
        sum += input_data[i * N + k] * weight_data[j * N + k];
      }
      output_data[i * M + j] = sum;
    }
  }
}

Tensor& fully_connected_out(
    ET_UNUSED KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output) {
  linear(input, weight, bias, output);
  return output;
}

} // namespace native
} // namespace generic
} // namespace impl
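
For reference, the loop nest in linear computes output[i][j] = bias[j] + dot(input_row_i, weight_row_j); because weight is stored row-major as [out_dim, in_dim], the kernel needs no transpose. A minimal standalone sketch of the same indexing on plain arrays (hypothetical values, no ExecuTorch types):

// Standalone sketch of the fully-connected loop nest above.
// Shapes (hypothetical): input [2, 3], weight [4, 3], bias [4], output [2, 4].
#include <cstdio>

int main() {
  const int batch = 2, in_dim = 3, out_dim = 4;
  float input[batch * in_dim] = {1, 2, 3, 4, 5, 6};
  float weight[out_dim * in_dim] = {
      1, 0, 0, // weight row j holds the weights of output feature j
      0, 1, 0,
      0, 0, 1,
      1, 1, 1};
  float bias[out_dim] = {0.5f, 0.5f, 0.5f, 0.5f};
  float output[batch * out_dim];

  for (int i = 0; i < batch; ++i) {
    for (int j = 0; j < out_dim; ++j) {
      float sum = bias[j];
      for (int k = 0; k < in_dim; ++k) {
        sum += input[i * in_dim + k] * weight[j * in_dim + k];
      }
      output[i * out_dim + j] = sum;
    }
  }
  // Prints: 1.5 2.5 3.5 6.5 (first input row dotted with each weight row, plus bias)
  std::printf("%g %g %g %g\n", output[0], output[1], output[2], output[3]);
  return 0;
}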
backends/cadence/generic/operators/op_fully_connected.h

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::optional;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

Tensor& fully_connected_out(
    KernelRuntimeContext& ctx,
    const Tensor& input,
    const Tensor& weight,
    const optional<Tensor>& bias,
    Tensor& output);

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_idma_copy.cpp

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/cadence/generic/operators/op_idma_copy.h>

#include <cstdint>
#include <cstring> // For std::memcpy

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// CPU implementation of idma_copy_out using std::memcpy.
// This function performs a direct memory copy between tensors.
Tensor& idma_copy_out(
    KernelRuntimeContext& ctx,
    const Tensor& src,
    const int64_t
        task_num, // Unused in CPU implementation but kept for API compatibility
    const int64_t
        channel, // Unused in CPU implementation but kept for API compatibility
    Tensor& out) {
  ET_KERNEL_CHECK(
      ctx,
      src.dtype() == out.dtype() && src.numel() == out.numel(),
      InvalidArgument,
      out);

  // Use std::memcpy for direct memory copy
  std::memcpy(
      out.mutable_data_ptr<uint8_t>(),
      src.const_data_ptr<uint8_t>(),
      out.nbytes());

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
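
A note on the check above: because dtype and element count are verified to match, src.nbytes() equals out.nbytes(), so one memcpy of out.nbytes() bytes covers the whole tensor. A minimal standalone sketch of the same check-then-copy pattern (hypothetical helper on plain buffers, not the ExecuTorch API):

// Hypothetical standalone analogue of idma_copy_out: validate that both
// buffers describe the same number of bytes, then copy them in one shot.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>

void copy_bytes(const uint8_t* src, size_t src_nbytes,
                uint8_t* dst, size_t dst_nbytes) {
  assert(src_nbytes == dst_nbytes); // mirrors the ET_KERNEL_CHECK
  std::memcpy(dst, src, dst_nbytes);
}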
backends/cadence/generic/operators/op_idma_copy.h

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& idma_copy_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& src,
    const int64_t task_num,
    const int64_t channel,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
backends/cadence/generic/operators/op_idma_wait.cpp

Lines changed: 48 additions & 0 deletions
@@ -0,0 +1,48 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include "executorch/backends/cadence/generic/operators/op_idma_wait.h"

#include <cstdint>

#include "executorch/runtime/core/exec_aten/exec_aten.h"
#include "executorch/runtime/core/exec_aten/util/tensor_util.h"
#include "executorch/runtime/kernel/kernel_runtime_context.h"

namespace impl {
namespace generic {
namespace native {

using ::executorch::aten::ScalarType;
using ::executorch::aten::Tensor;
using ::executorch::runtime::KernelRuntimeContext;

// CPU implementation of idma_wait_out.
// Since there is no actual DMA operation in the CPU implementation, this is
// essentially a no-op: it only verifies that the output tensor aliases the
// input tensor (same element count, dtype, and data pointer), which already
// guarantees identical contents.
Tensor& idma_wait_out(
    KernelRuntimeContext& ctx,
    const Tensor& src,
    const int64_t
        task_num, // Unused in CPU implementation but kept for API compatibility
    Tensor& out) {
  ET_KERNEL_CHECK(ctx, src.numel() == out.numel(), InvalidArgument, out);
  ET_KERNEL_CHECK(ctx, src.dtype() == out.dtype(), InvalidArgument, out);
  ET_KERNEL_CHECK(
      ctx,
      src.const_data_ptr<uint8_t>() == out.const_data_ptr<uint8_t>(),
      InvalidArgument,
      out);

  return out;
}

} // namespace native
} // namespace generic
} // namespace impl
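
As a usage sketch (hypothetical tensors ctx, src, and dst), the intended pairing is copy-then-wait; in this CPU build the copy in idma_copy_out is already synchronous, so idma_wait_out only validates that its output aliases the copied tensor:

// Hypothetical usage sketch: on device, idma_copy_out would enqueue an async
// iDMA transfer and idma_wait_out would block on task_num; on CPU the copy
// happens eagerly and the wait is a pure validation no-op.
Tensor& staged = idma_copy_out(ctx, src, /*task_num=*/0, /*channel=*/0, dst);
Tensor& done = idma_wait_out(ctx, staged, /*task_num=*/0, staged);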
backends/cadence/generic/operators/op_idma_wait.h

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& idma_wait_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& src,
    const int64_t task_num,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl

backends/cadence/generic/operators/op_requantize_out.cpp renamed to backends/cadence/generic/operators/op_requantize.cpp

Lines changed: 27 additions & 20 deletions
@@ -6,16 +6,26 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <executorch/backends/cadence/generic/operators/op_requantize.h>
+#include <executorch/runtime/core/exec_aten/exec_aten.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
+#include <executorch/backends/cadence/generic/kernels/kernels.h>
+#include <cstdint>
+#include <cstdlib>
+
 namespace impl {
 namespace generic {
 namespace native {
 
-using executorch::aten::ScalarType;
-using executorch::aten::Tensor;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::optional;
+using ::executorch::aten::Scalar;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::KernelRuntimeContext;
+using ::impl::generic::kernels::dequantize;
+using ::impl::generic::kernels::quantize;
 
 // Requantize the int8_t/uint8_t input tensor to a uint8_t/int8_t out tensor.
 // The scale and zero_point for requantization are in the args.

@@ -86,15 +96,14 @@ Tensor& requantize_out(
       torch::executor::toString(out.scalar_type()),
       torch::executor::toString(out_dtype));
 
-#define typed_requantize(ctype, dtype)                                       \
-  const ctype* input_data = input.const_data_ptr<ctype>();                   \
-  dtype* out_data = out.mutable_data_ptr<dtype>();                           \
-  for (size_t i = 0; i < numel; ++i) {                                       \
-    float dequant =                                                          \
-        kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point);  \
-    out_data[i] =                                                            \
-        kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);    \
+#define typed_requantize(ctype, dtype)                                         \
+  const ctype* input_data = input.const_data_ptr<ctype>();                     \
+  dtype* out_data = out.mutable_data_ptr<dtype>();                             \
+  for (size_t i = 0; i < numel; ++i) {                                         \
+    float dequant = dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+    out_data[i] = quantize<dtype>(dequant, 1 / out_scale, out_zero_point);     \
   };
+
 #define typed_requantize_in(ctype) \
   switch (out_dtype) {             \
     case ScalarType::Byte: {       \

@@ -187,14 +196,12 @@ Tensor& requantize_per_tensor_out(
       torch::executor::toString(out.scalar_type()),
       torch::executor::toString(out_dtype));
 
-#define typed_requantize(ctype, dtype)                                       \
-  const ctype* input_data = input.const_data_ptr<ctype>();                   \
-  dtype* out_data = out.mutable_data_ptr<dtype>();                           \
-  for (size_t i = 0; i < numel; ++i) {                                       \
-    float dequant =                                                          \
-        kernels::dequantize<ctype>(input_data[i], in_scale, in_zero_point);  \
-    out_data[i] =                                                            \
-        kernels::quantize<dtype>(dequant, 1 / out_scale, out_zero_point);    \
+#define typed_requantize(ctype, dtype)                                         \
+  const ctype* input_data = input.const_data_ptr<ctype>();                     \
+  dtype* out_data = out.mutable_data_ptr<dtype>();                             \
+  for (size_t i = 0; i < numel; ++i) {                                         \
+    float dequant = dequantize<ctype>(input_data[i], in_scale, in_zero_point); \
+    out_data[i] = quantize<dtype>(dequant, 1 / out_scale, out_zero_point);     \
   };
 
 #define typed_requantize_in(ctype) \
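
On the arithmetic in the macro above: dequantize maps a quantized value back to real space and quantize maps it into the output domain, with 1 / out_scale passed in, presumably so the kernel can multiply rather than divide per element. A scalar sketch of that math follows; the dequant/quant helpers below are assumed reconstructions, not the actual kernels.h implementations, which may round or clamp differently:

// Scalar sketch of the requantize arithmetic (assumed helper definitions).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

float dequant(int8_t q, float scale, int32_t zero_point) {
  return (q - zero_point) * scale;
}

uint8_t quant(float x, float inv_scale, int32_t zero_point) {
  float q = std::round(x * inv_scale) + zero_point;
  return static_cast<uint8_t>(std::min(255.0f, std::max(0.0f, q)));
}

int main() {
  // Requantize int8 value 40 (scale 0.1, zero point 0) into the uint8
  // domain (scale 0.05, zero point 128):
  //   x = (40 - 0) * 0.1 = 4.0
  //   q = round(4.0 * (1 / 0.05)) + 128 = 80 + 128 = 208
  uint8_t out = quant(dequant(40, 0.1f, 0), 1.0f / 0.05f, 128);
  std::printf("%d\n", out); // prints 208
  return 0;
}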
backends/cadence/generic/operators/op_requantize.h

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>

namespace impl {
namespace generic {
namespace native {

::executorch::aten::Tensor& requantize_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& input,
    const ::executorch::aten::Tensor& in_scale_t,
    const ::executorch::aten::Tensor& in_zero_point_t,
    const ::executorch::aten::Tensor& out_scale_t,
    const ::executorch::aten::Tensor& out_zero_point_t,
    const ::executorch::aten::ScalarType out_dtype,
    ::executorch::aten::Tensor& out);

::executorch::aten::Tensor& requantize_per_tensor_out(
    ::executorch::runtime::KernelRuntimeContext& ctx,
    const ::executorch::aten::Tensor& input,
    double in_scale,
    int64_t in_zero_point,
    double out_scale,
    int64_t out_zero_point,
    const ::executorch::aten::ScalarType out_dtype,
    ::executorch::aten::Tensor& out);

} // namespace native
} // namespace generic
} // namespace impl
