Skip to content

Commit e4c9f6d

Browse files
yushangdi authored and pytorchmergebot committed
[nativert] Move c10_kernel (pytorch#156208)
Summary: Torch Native Runtime RFC: pytorch/rfcs#72. As part of the effort to open-source TorchNativeRuntime (or what we call Sigmoid), we are moving the kernel implementation to torch/: fbcode/sigmoid/kernels -> fbcode/caffe2/torch/nativert/kernels. Test Plan: ``` buck run fbcode//mode/dev-nosan //caffe2/test/cpp/nativert:c10_kernel_test ``` Differential Revision: D76825830. Pull Request resolved: pytorch#156208. Approved by: https://github.com/zhxchen17
1 parent f402eed commit e4c9f6d

File tree

5 files changed

+428
-0
lines changed

5 files changed

+428
-0
lines changed

build_variables.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ libtorch_nativert_sources = [
606606
"torch/nativert/executor/memory/FunctionSchema.cpp",
607607
"torch/nativert/common/FileUtil.cpp",
608608
"torch/nativert/detail/ITree.cpp",
609+
"torch/nativert/kernels/C10Kernel.cpp",
609610
]
610611

611612
torch_mobile_tracer_sources = [

test/cpp/nativert/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ set(NATIVERT_TEST_SRCS
1717
${TORCH_ROOT}/torch/nativert/executor/ExecutionPlanner.cpp
1818
${TORCH_ROOT}/torch/nativert/detail/ITree.cpp
1919
${TORCH_ROOT}/torch/nativert/executor/ExecutionFrame.cpp
20+
${TORCH_ROOT}/torch/nativert/kernels/C10Kernel.cpp
2021
)
2122

2223
add_executable(test_nativert
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#include <ATen/core/op_registration/op_registration.h>
2+
#include <gtest/gtest.h>
3+
#include <torch/nativert/executor/ExecutionFrame.h>
4+
#include <torch/nativert/graph/Graph.h>
5+
#include <torch/nativert/kernels/C10Kernel.h>
6+
#include <torch/torch.h>
7+
8+
namespace torch::nativert {
9+
10+
// Trivial binary kernel registered as "test::foo" below: returns the
// element-wise sum of the two input tensors.
at::Tensor foo_kernel(const at::Tensor& a, const at::Tensor& b) {
  auto sum = a + b;
  return sum;
}
13+
14+
// Verifies that C10Kernel can resolve a freshly registered custom op
// ("test::foo") through the dispatcher, execute it against tensors held in
// an ExecutionFrame, and write the result into the graph output slot.
TEST(C10KernelTest, computeInternal) {
  auto registrar = c10::RegisterOperators().op(
      "test::foo(Tensor a, Tensor b) -> Tensor", &foo_kernel);

  static constexpr std::string_view source =
      R"(graph(%a, %b):
%x = test.foo.default(a=%a, b=%b)
return (%x)
)";

  auto graph = stringToGraph(source);
  // Advance past the first node to reach the test.foo call node.
  const auto& nodeList = graph->nodes();
  auto nodeIt = nodeList.begin();
  ++nodeIt;
  const Node& fooNode = *nodeIt;

  c10::Device device = torch::Device(torch::kCPU, 0);

  auto lhs = at::randn({6, 6, 6});
  auto rhs = at::randn({6, 6, 6});

  auto frame = ExecutionFrame(*graph);
  frame.setIValue(graph->getValue("a")->id(), lhs);
  frame.setIValue(graph->getValue("b")->id(), rhs);

  auto kernel = C10Kernel(&fooNode, device);
  kernel.computeInternal(frame);

  at::Tensor expected = lhs + rhs;
  EXPECT_TRUE(
      torch::equal(frame.getTensor(graph->getValue("x")->id()), expected));
}
47+
48+
// Verifies that ScalarBinaryOpKernel evaluates "_operator.add" on two
// integer inputs stored in an ExecutionFrame.
TEST(ScalarBinaryOpKernelTest, computeInternal) {
  static constexpr std::string_view source =
      R"(graph(%a, %b):
%x = _operator.add(a=%a, b=%b)
return (%x)
)";

  auto graph = stringToGraph(source);
  // Advance past the first node to reach the _operator.add call node.
  const auto& nodeList = graph->nodes();
  auto nodeIt = nodeList.begin();
  ++nodeIt;
  const Node& addNode = *nodeIt;

  auto lhs = 1;
  auto rhs = 2;

  auto frame = ExecutionFrame(*graph);
  frame.setIValue(graph->getValue("a")->id(), lhs);
  frame.setIValue(graph->getValue("b")->id(), rhs);

  auto kernel = ScalarBinaryOpKernel(&addNode);
  kernel.computeInternal(frame);

  auto expected = lhs + rhs;
  EXPECT_EQ(frame.getIValue(graph->getValue("x")->id()).toInt(), expected);
}
75+
76+
} // namespace torch::nativert
Lines changed: 265 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,265 @@
1+
#include <torch/nativert/kernels/C10Kernel.h>
2+
3+
#include <fmt/ostream.h>
4+
5+
#include <c10/util/Enumerate.h>
6+
7+
#ifdef __SIGRID_USE_GPU__
8+
#include <ATen/cuda/CUDAContext.h>
9+
#include <ATen/cuda/Exceptions.h>
10+
#endif
11+
12+
namespace torch::nativert {
13+
14+
// Resolves the dispatcher operator for `node`'s target up front and
// pre-fills the static (constant) portion of the argument stack, so that
// computeInternal() only has to splice in dynamic inputs per execution.
C10Kernel::C10Kernel(
    const Node* node,
    c10::Device device,
    OpKernelKind kind,
    AliasingSpec&& aliasingSpec)
    : OpKernel(node, device, kind),
      // Look up the boxed operator handle once at construction time.
      op_(getOperatorForTarget(node->target(), node)),
      // NOTE(review): uses the base-class member kind_ rather than the `kind`
      // parameter — presumably equivalent once OpKernel's ctor ran; confirm.
      schema_(op_.schema(), std::move(aliasingSpec), kind_),
      // Constant stack slots are filled once here; dynamic ones are filled
      // per call in computeInternal().
      arguments_(prefillStackWithStaticArgs(node, op_.schema())) {}
23+
24+
// Executes the resolved operator against the current frame: copies the
// pre-filled static arguments, splices in the dynamic inputs, invokes the
// boxed call, and moves each produced value into its output slot.
void C10Kernel::computeInternal(ExecutionFrame& executionFrame) const {
  // Start from a fresh copy of the static argument stack.
  std::vector<c10::IValue> argStack = arguments_.getStackWithStaticArgs();
  fillDynamicInputs(executionFrame, arguments_, argStack);

  // Invoke the op; on failure, rethrow with the node, its arguments, and the
  // original Python stack trace (when recorded) for easier debugging.
  try {
    op_.callBoxed(argStack);
  } catch (const std::exception& ex) {
    auto stackTrace = node_->getMetadata("stack_trace");
    throw std::runtime_error(fmt::format(
        "Exception while executing node: {}\n"
        "with args:\n{}\n"
        "{}\n"
        "Original Python stacktrace:\n{}",
        fmt::streamed(*node_),
        readableArgs(op_.schema(), argStack),
        ex.what(),
        stackTrace ? *stackTrace : "<no stack trace>"));
  }

  // Write out results, one stack entry per declared output.
  // TODO: we store intermediates in a single table (symint and tensor alike).
  // This can theoretically lead to name collisions, although based on how
  // these are named I don't think it will ever happen in practice. We need to
  // enforce it though.
  const auto& outputValues = node_->outputs();
  TORCH_CHECK_EQ(outputValues.size(), argStack.size())
      << "Output size mismatch for " << node_->toString();
  for (size_t i = 0; i < argStack.size(); ++i) {
    executionFrame.setIValue(outputValues[i]->id(), std::move(argStack[i]));
  }
}
58+
59+
namespace {
60+
std::unordered_map<std::string, c10::IValue> getSymInputs(
61+
const ExecutionFrame& executionFrame,
62+
const Node& node) {
63+
std::unordered_map<std::string, c10::IValue> inputs;
64+
for (const auto& input : node.inputs()) {
65+
const auto& val = executionFrame.getIValue(input.value->id());
66+
if (val.isInt() || val.isDouble() || val.isBool()) {
67+
inputs[input.name] = val;
68+
} else {
69+
throw std::runtime_error("unsupported type for symbolic input");
70+
}
71+
}
72+
for (const auto& attribute : node.attributes()) {
73+
if (std::holds_alternative<int64_t>(attribute.value)) {
74+
inputs[attribute.name] = std::get<int64_t>(attribute.value);
75+
} else if (std::holds_alternative<double>(attribute.value)) {
76+
inputs[attribute.name] = std::get<double>(attribute.value);
77+
} else if (std::holds_alternative<bool>(attribute.value)) {
78+
inputs[attribute.name] = std::get<bool>(attribute.value);
79+
} else {
80+
throw std::runtime_error("unsupported type for symbolic input");
81+
}
82+
}
83+
return inputs;
84+
}
85+
86+
template <typename T>
87+
void computeScalarBinaryOp(
88+
ExecutionFrame& executionFrame,
89+
const Node& node,
90+
std::enable_if_t<true, T> a,
91+
std::enable_if_t<true, T> b) {
92+
std::string_view target = node.target();
93+
T out;
94+
95+
if (target == "_operator.add") {
96+
out = a + b;
97+
} else if (target == "_operator.sub") {
98+
out = a - b;
99+
} else if (target == "_operator.mul") {
100+
out = a * b;
101+
} else if (target == "_operator.pow") {
102+
out = std::pow(a, b);
103+
} else {
104+
throw std::runtime_error(
105+
fmt::format("unsupported operator for symbolic values: {}", target));
106+
}
107+
108+
executionFrame.setIValue(node.outputs()[0]->id(), out);
109+
VLOG(2) << fmt::format(
110+
"Completed executing node: {} with a={}, b={}, out={}",
111+
fmt::streamed(node),
112+
a,
113+
b,
114+
out);
115+
}
116+
117+
} // namespace
118+
119+
// Dispatches a scalar binary op to computeScalarBinaryOp<T>: int64_t when
// both operands are ints, otherwise double (int operands are widened; any
// other operand type throws).
void ScalarBinaryOpKernel::computeInternal(
    ExecutionFrame& executionFrame) const {
  auto inputs = getSymInputs(executionFrame, *node_);

  const auto& lhs = inputs.at("a");
  const auto& rhs = inputs.at("b");

  if (lhs.isInt() && rhs.isInt()) {
    computeScalarBinaryOp<int64_t>(
        executionFrame, *node_, lhs.toInt(), rhs.toInt());
    return;
  }

  // Mixed or floating operands: promote everything to double.
  auto asDouble = [](const c10::IValue& v) -> double {
    if (v.isDouble()) {
      return v.toDouble();
    }
    if (v.isInt()) {
      return static_cast<double>(v.toInt());
    }
    throw std::runtime_error("unsupported type for symbolic input");
  };

  computeScalarBinaryOp<double>(
      executionFrame, *node_, asDouble(lhs), asDouble(rhs));
}
144+
145+
// Evaluates integer symbolic ops (torch.sym_float, _operator.floordiv,
// _operator.mod, torch.sym_max, torch.sym_min) on inputs "a" (and "b" for
// the binary ones), writing the result into the node's single output slot.
void SymIntOpKernel::computeInternal(ExecutionFrame& executionFrame) const {
  auto inputs = getSymInputs(executionFrame, *node_);

  int64_t a = inputs.at("a").toInt();
  std::string_view target = node_->target();
  // torch.sym_float is unary — handle it before reading "b".
  if (target == "torch.sym_float") {
    double out = static_cast<double>(a);
    executionFrame.setIValue(node_->outputs()[0]->id(), out);
    VLOG(2) << fmt::format(
        "Completed executing node: {} with a={}, out={}",
        fmt::streamed(*node_),
        a,
        out);
    return;
  }
  int64_t b = inputs.at("b").toInt();
  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  int64_t out;

  if (target == "_operator.floordiv") {
    // Python floordiv rounds toward negative infinity, while C++ integer
    // division truncates toward zero; adjust by one when the signs differ
    // and the division is inexact (previously this was a plain `a / b`,
    // which is wrong for negative operands).
    out = a / b;
    if ((a % b != 0) && ((a < 0) != (b < 0))) {
      --out;
    }
  } else if (target == "_operator.mod") {
    // Python mod takes the sign of the divisor; C++ % takes the sign of the
    // dividend. Shift a nonzero remainder into the divisor's sign range.
    out = a % b;
    if (out != 0 && ((out < 0) != (b < 0))) {
      out += b;
    }
  } else if (target == "torch.sym_max") {
    out = std::max(a, b);
  } else if (target == "torch.sym_min") {
    out = std::min(a, b);
  } else {
    throw std::runtime_error(
        fmt::format("unsupported operator for SymInt: {}", node_->target()));
  }

  executionFrame.setIValue(node_->outputs()[0]->id(), out);
  VLOG(2) << fmt::format(
      "Completed executing node: {} with a={}, b={}, out={}",
      fmt::streamed(*node_),
      a,
      b,
      out);
}
185+
186+
// Evaluates boolean symbolic ops (torch.sym_not, the integer comparisons
// ge/le/eq/gt/lt, and logical and_) and stores the bool result into the
// node's single output slot.
void SymBoolOpKernel::computeInternal(ExecutionFrame& executionFrame) const {
  auto inputs = getSymInputs(executionFrame, *node_);

  // Small accessors to cut down on repeated inputs.at(...).toX() noise.
  auto intArg = [&inputs](const char* name) {
    return inputs.at(name).toInt();
  };
  auto boolArg = [&inputs](const char* name) {
    return inputs.at(name).toBool();
  };

  // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
  bool out;

  const std::string_view target = node_->target();
  if (target == "torch.sym_not") {
    out = !boolArg("a");
  } else if (target == "_operator.ge") {
    out = intArg("a") >= intArg("b");
  } else if (target == "_operator.le") {
    out = intArg("a") <= intArg("b");
  } else if (target == "_operator.eq") {
    out = intArg("a") == intArg("b");
  } else if (target == "_operator.gt") {
    out = intArg("a") > intArg("b");
  } else if (target == "_operator.lt") {
    out = intArg("a") < intArg("b");
  } else if (target == "_operator.and_") {
    // Both operands are fetched unconditionally, matching the original
    // (no short-circuit on the frame lookups).
    bool lhs = boolArg("a");
    bool rhs = boolArg("b");
    out = lhs && rhs;
  } else {
    throw std::runtime_error(
        fmt::format("unsupported operator for SymBool: {}", node_->target()));
  }

  executionFrame.setIValue(node_->outputs()[0]->id(), out);
}
227+
228+
// Evaluates floating-point symbolic ops (math.trunc, torch._sym_sqrt,
// _operator.neg, _operator.truediv) and stores the result in the node's
// single output slot.
void SymFloatOpKernel::computeInternal(ExecutionFrame& executionFrame) const {
  auto inputs = getSymInputs(executionFrame, *node_);

  const std::string_view target = node_->target();
  if (target == "math.trunc") {
    // NOTE: math.trunc reads its operand under the name "x" (the other ops
    // here use "a"); this mirrors the Python signature math.trunc(x).
    double x = inputs.at("x").toDouble();
    // std::trunc (qualified) instead of the unqualified global `trunc`,
    // which <cmath> is not required to declare in the global namespace.
    // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
    int64_t out = std::trunc(x);
    executionFrame.setIValue(node_->outputs()[0]->id(), out);
  } else if (target == "torch._sym_sqrt") {
    double a = inputs.at("a").toDouble();
    double out = std::sqrt(a);
    executionFrame.setIValue(node_->outputs()[0]->id(), out);
  } else if (target == "_operator.neg") {
    // Negation preserves the operand's type (int stays int, double stays
    // double); any other scalar type is rejected.
    auto a = inputs.at("a");
    c10::IValue out;
    if (a.isInt()) {
      out = -a.toInt();
    } else if (a.isDouble()) {
      out = -a.toDouble();
    } else {
      throw std::runtime_error("unsupported type for symbolic input");
    }
    executionFrame.setIValue(node_->outputs()[0]->id(), out);
  } else if (target == "_operator.truediv") {
    // Python truediv always yields a float, so widen int operands first.
    auto ia = inputs.at("a");
    double a = ia.isInt() ? static_cast<double>(ia.toInt()) : ia.toDouble();
    auto ib = inputs.at("b");
    double b = ib.isInt() ? static_cast<double>(ib.toInt()) : ib.toDouble();
    double out = a / b;
    executionFrame.setIValue(node_->outputs()[0]->id(), out);
  } else {
    throw std::runtime_error(
        fmt::format("unsupported operator for SymFloat: {}", node_->target()));
  }
}
264+
265+
} // namespace torch::nativert

0 commit comments

Comments
 (0)