
Commit fbf2088

gru test
1 parent d1473c7 commit fbf2088

File tree

98 files changed: +19590 -0 lines changed


exir/emit/test/test_emit.py

Lines changed: 110 additions & 0 deletions
@@ -967,6 +967,116 @@ def combine_fn(carry, x):
        self.assertIn("aten::select_copy", op_names)
        self.assertIn("executorch_prim::et_copy_index", op_names)

    def test_emit_scan_gru(self) -> None:
        """Test scan with a simple GRU-like computation."""
        from torch._higher_order_ops.scan import scan

        class SimpleGRU(torch.nn.Module):
            """Simple single-layer unidirectional GRU using scan."""

            def __init__(self, input_size: int, hidden_size: int):
                super().__init__()
                self.input_size = input_size
                self.hidden_size = hidden_size

                # GRU gates: reset, update, new
                self.weight_ih = torch.nn.Parameter(
                    torch.randn(3 * hidden_size, input_size), requires_grad=False
                )
                self.weight_hh = torch.nn.Parameter(
                    torch.randn(3 * hidden_size, hidden_size), requires_grad=False
                )
                self.bias_ih = torch.nn.Parameter(
                    torch.randn(3 * hidden_size), requires_grad=False
                )
                self.bias_hh = torch.nn.Parameter(
                    torch.randn(3 * hidden_size), requires_grad=False
                )

            def forward(
                self, x: torch.Tensor, h0: torch.Tensor
            ) -> Tuple[torch.Tensor, torch.Tensor]:
                """
                Args:
                    x: Input tensor of shape [seq_len, batch, input_size]
                    h0: Initial hidden state of shape [batch, hidden_size]
                Returns:
                    output: Output tensor of shape [seq_len, batch, hidden_size]
                    h_n: Final hidden state of shape [batch, hidden_size]
                """
                weight_ih = self.weight_ih
                weight_hh = self.weight_hh
                bias_ih = self.bias_ih
                bias_hh = self.bias_hh

                def gru_cell(
                    h: torch.Tensor, x_t: torch.Tensor
                ) -> Tuple[torch.Tensor, torch.Tensor]:
                    # Compute gates
                    gates_ih = torch.nn.functional.linear(x_t, weight_ih, bias_ih)
                    gates_hh = torch.nn.functional.linear(h, weight_hh, bias_hh)

                    # Split into reset, update, new gates
                    r_ih, z_ih, n_ih = gates_ih.chunk(3, dim=-1)
                    r_hh, z_hh, n_hh = gates_hh.chunk(3, dim=-1)

                    r = torch.sigmoid(r_ih + r_hh)
                    z = torch.sigmoid(z_ih + z_hh)
                    n = torch.tanh(n_ih + r * n_hh)

                    h_new = (1 - z) * n + z * h
                    return h_new, h_new.clone()

                final_h, outputs = scan(gru_cell, h0, x)
                return outputs, final_h

        # Create model and inputs
        input_size = 4
        hidden_size = 8
        seq_len = 5
        batch_size = 2

        model = SimpleGRU(input_size, hidden_size)
        x = torch.randn(seq_len, batch_size, input_size)
        h0 = torch.randn(batch_size, hidden_size)
        inputs = (x, h0)

        # Run through eager PyTorch
        eager_outputs = model(*inputs)

        # Export and convert to edge
        module = to_edge(
            export(model, inputs, strict=True),
            compile_config=exir.EdgeCompileConfig(_check_ir_validity=False),
        )
        et = module.to_executorch()
        program = et.executorch_program

        # Verify the program has expected operators
        op_names = [op.name for op in program.execution_plan[0].operators]

        # Should have scan control flow operators
        self.assertIn("aten::sym_size", op_names)
        self.assertIn("aten::select_copy", op_names)
        self.assertIn("executorch_prim::et_copy_index", op_names)

        # Verify we can load the program
        buffer = et.buffer
        loaded_model = _load_for_executorch_from_buffer(buffer)

        # Run through executorch
        et_outputs = loaded_model(inputs)

        # Compare outputs (with tolerance for floating point)
        self.assertTrue(
            torch.allclose(et_outputs[0], eager_outputs[0], atol=1e-5),
            f"Output mismatch: {et_outputs[0]} vs {eager_outputs[0]}",
        )
        self.assertTrue(
            torch.allclose(et_outputs[1], eager_outputs[1], atol=1e-5),
            f"Final hidden state mismatch: {et_outputs[1]} vs {eager_outputs[1]}",
        )

    def test_dim_order(self) -> None:
        class SimpleLinear(torch.nn.Module):
            def __init__(self) -> None:
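
For context, `scan` in the test above threads a carry (the hidden state) through the sequence and stacks each step's second return value along the leading dimension. A minimal eager sketch of those semantics, assuming the documented carry/outputs contract of `torch._higher_order_ops.scan` (`scan_reference` is a hypothetical helper for illustration, not part of the test):

import torch
from typing import Callable, List, Tuple

def scan_reference(
    combine_fn: Callable[
        [torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]
    ],
    init: torch.Tensor,
    xs: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Eager stand-in for scan(combine_fn, init, xs) over dim 0 of xs."""
    carry = init
    ys: List[torch.Tensor] = []
    for t in range(xs.shape[0]):
        # combine_fn returns (new_carry, per_step_output).
        carry, y = combine_fn(carry, xs[t])
        ys.append(y)
    # Stacked outputs have shape [seq_len, *per_step_output.shape].
    return carry, torch.stack(ys, dim=0)

Under these semantics, `scan(gru_cell, h0, x)` yields the final hidden state and the stacked per-step hidden states, which is why the test compares both tensors against the exported program's outputs.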

preprocess.pt2

305 KB
Binary file not shown.
extension/kernel_util/make_boxed_from_unboxed_functor.h

Lines changed: 260 additions & 0 deletions
@@ -0,0 +1,260 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

//===----------------------------------------------------------------------===//
/// \file extension/kernel_util/make_boxed_from_unboxed_functor.h
/// Defines a template that can be used to create a boxed version of an unboxed
/// functor.
/// Example usage:
/// ```
/// Tensor&
/// my_op(KernelRuntimeContext& ctx, const Tensor& self, const Tensor& other,
///     Tensor& out)
/// {
///   // ...
///   return out;
/// }
///
/// Kernel my_kernel = Kernel::make_boxed_kernel("my_ns::my_op",
///     EXECUTORCH_FN(my_op));
/// static auto res = register_kernels({my_kernel});
/// ```
/// Or simply:
/// ```
/// EXECUTORCH_LIBRARY(my_ns, "my_op", my_op);
/// ```
///
/// The trick here is to convert each EValue to the inferred argument type.
/// This uses a lot of C++17 features.
//===----------------------------------------------------------------------===//

#pragma once

#include <executorch/extension/kernel_util/meta_programming.h>
#include <executorch/extension/kernel_util/type_list.h>
#include <executorch/runtime/core/evalue.h>
#include <executorch/runtime/core/event_tracer_hooks.h>
#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/kernel/kernel_runtime_context.h>
#include <executorch/runtime/kernel/operator_registry.h>
#include <cstdlib>
#include <memory>
#include <type_traits>
#include <typeinfo>

namespace executorch {
namespace runtime {
class KernelRuntimeContext; // Forward declaration
} // namespace runtime
} // namespace executorch

namespace executorch {
namespace extension {

// This extension has a lot of generic internal names like "size"; use a unique
// internal namespace to avoid conflicts with other extensions.
namespace kernel_util_internal {

// Template trait to check if a type is a non-const tensor
template <class T>
struct is_nonconst_tensor : std::false_type {};

template <>
struct is_nonconst_tensor<executorch::aten::Tensor&> : std::true_type {};

// Count the trailing run of non-const tensors (out arguments) in a typelist
template <class TypeList>
struct count_nonconst_tensors;

template <>
struct count_nonconst_tensors<typelist<>> {
  static constexpr size_t value = 0;
};

template <class T>
struct count_nonconst_tensors<typelist<T>> {
  static constexpr size_t value = 0;
};

template <>
struct count_nonconst_tensors<typelist<executorch::aten::Tensor&>> {
  static constexpr size_t value = 1;
};

template <class Head, class... Tail>
struct count_nonconst_tensors<typelist<Head, Tail...>> {
 private:
  static constexpr size_t tail_tensor_count =
      count_nonconst_tensors<typelist<Tail...>>::value;
  static constexpr size_t tail_args_count = sizeof...(Tail);
  static constexpr bool is_head_a_tensor = is_nonconst_tensor<Head>::value;
  static constexpr bool all_tail_args_are_tensor =
      tail_tensor_count == tail_args_count;

 public:
  static constexpr size_t value = (is_head_a_tensor && all_tail_args_are_tensor)
      ? tail_tensor_count + 1
      : tail_tensor_count;
};

template <class T>
struct decay_if_not_tensor final {
  using type = std::decay_t<T>;
};
template <>
struct decay_if_not_tensor<executorch::aten::Tensor&> final {
  using type = executorch::aten::Tensor&;
};
template <>
struct decay_if_not_tensor<const executorch::aten::Tensor&> final {
  using type = const executorch::aten::Tensor&;
};

template <class T>
struct evalue_to_arg final {
  static T call(executorch::runtime::EValue& v) {
    return std::move(v).to<T>();
  }
};

template <>
struct evalue_to_arg<executorch::aten::Tensor&> final {
  static executorch::aten::Tensor& call(executorch::runtime::EValue& v) {
    return v.toTensor();
  }
};

template <>
struct evalue_to_arg<const executorch::aten::Tensor&> final {
  static const executorch::aten::Tensor& call(executorch::runtime::EValue& v) {
    return v.toTensor();
  }
};

template <class T>
struct evalue_to_arg<std::optional<T>> final {
  static std::optional<T> call(executorch::runtime::EValue& v) {
    return v.toOptional<T>();
  }
};

template <class T>
struct evalue_to_arg<executorch::aten::ArrayRef<std::optional<T>>> final {
  static executorch::aten::ArrayRef<std::optional<T>> call(
      executorch::runtime::EValue& v) {
    return v.toListOptionalTensor();
  }
};

template <
    class Functor,
    size_t nonconst_tensors_to_log,
    size_t... evalue_arg_indices,
    typename... ArgTypes>
void call_functor_with_args_from_stack(
    executorch::runtime::KernelRuntimeContext& ctx,
    executorch::runtime::Span<executorch::runtime::EValue*> stack,
    std::index_sequence<evalue_arg_indices...>,
    typelist<ArgTypes...>*) {
  executorch::runtime::internal::EventTracerProfileOpScope
      event_tracer_op_scope(ctx.internal_event_tracer(), Functor::func_name_);
  EXECUTORCH_SCOPE_PROF(Functor::func_name_);
  (*Functor::func_ptr())(
      ctx,
      evalue_to_arg<typename decay_if_not_tensor<ArgTypes>::type>::call(
          *stack[evalue_arg_indices])...);
  constexpr size_t num_inputs =
      std::index_sequence<evalue_arg_indices...>::size();
  for (size_t i = num_inputs - nonconst_tensors_to_log; i < num_inputs; ++i) {
    executorch::runtime::internal::event_tracer_log_evalue(
        ctx.internal_event_tracer(), *stack[i]);
  }
}

} // namespace kernel_util_internal

/**
 * WrapUnboxedIntoFunctor: Given a function pointer, wrap it into a functor that
 * takes EValues as input and returns void. The wrapped functor will unbox all
 * inputs and forward them to the unboxed kernel.
 */
template <class FuncType>
struct WrapUnboxedIntoFunctor {
  static_assert(
      kernel_util_internal::is_compile_time_function_pointer<FuncType>::value,
      "Can't handle function other than EXECUTORCH_FN");
  using TrueType = typename FuncType::FuncType;
  using ReturnType = typename kernel_util_internal::infer_function_traits_t<
      TrueType>::return_type;
  using ArgsType = typename kernel_util_internal::infer_function_traits_t<
      TrueType>::parameter_types;
  // Check if the first argument is KernelRuntimeContext; if so, remove it.
  static constexpr bool first_arg_is_context = std::is_same<
      ::executorch::runtime::KernelRuntimeContext,
      std::remove_reference_t<
          kernel_util_internal::head_with_default_t<void, ArgsType>>>::value;
  using ContextRemovedArgsType = std::conditional_t<
      first_arg_is_context,
      kernel_util_internal::drop_if_nonempty_t<ArgsType, 1>,
      ArgsType>;

  static void call(
      ::executorch::runtime::KernelRuntimeContext& ctx,
      executorch::runtime::Span<executorch::runtime::EValue*> stack) {
    constexpr size_t num_inputs =
        kernel_util_internal::size<ContextRemovedArgsType>::value;
    constexpr size_t num_nonconst_tensors =
        kernel_util_internal::count_nonconst_tensors<
            ContextRemovedArgsType>::value;
    static_assert(num_nonconst_tensors == 1, "Invalid number of inputs");
    return kernel_util_internal::
        call_functor_with_args_from_stack<FuncType, num_nonconst_tensors>(
            ctx,
            stack,
            std::make_index_sequence<num_inputs>(),
            static_cast<ContextRemovedArgsType*>(nullptr));
  }
};

template <typename FuncType>
static executorch::runtime::Kernel make_boxed_kernel(
    const char* name,
    FuncType) {
  return executorch::runtime::Kernel(
      name, WrapUnboxedIntoFunctor<FuncType>::call);
}

} // namespace extension
} // namespace executorch

// Inspired from C10_CONCATENATE
#define ET_CONCATENATE_IMPL(s1, s2) s1##s2
#define ET_CONCATENATE(s1, s2) ET_CONCATENATE_IMPL(s1, s2)
#define ET_UID __LINE__

#define EXECUTORCH_LIBRARY(ns, op_name, func) \
  _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, ET_UID)

#define _EXECUTORCH_LIBRARY_IMPL(ns, op_name, func, uid)           \
  static constexpr const char ET_CONCATENATE(name_of_op_, uid)[] = \
      #ns "::" op_name;                                            \
  static auto ET_CONCATENATE(res_##ns##_, uid) =                   \
      ::executorch::runtime::register_kernel(                      \
          ::executorch::extension::make_boxed_kernel(              \
              #ns "::" op_name,                                    \
              EXECUTORCH_FN(func, ET_CONCATENATE(name_of_op_, uid))))

namespace torch {
namespace executor {
// TODO(T197294990): Remove these deprecated aliases once all users have moved
// to the new `::executorch` namespaces.
using ::executorch::extension::make_boxed_kernel;
using ::executorch::extension::WrapUnboxedIntoFunctor;
} // namespace executor
} // namespace torch
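
For reference, a minimal registration sketch built only on what the doc comment at the top of this header shows; the `my_ns` namespace, the `add_out` op name, and the kernel body are hypothetical:

#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>

using executorch::aten::Tensor;
using executorch::runtime::KernelRuntimeContext;

namespace my_ns {
// Unboxed out-variant kernel. Note the single trailing non-const Tensor&:
// WrapUnboxedIntoFunctor::call static_asserts that exactly one such out
// tensor is present.
Tensor& add_out(
    KernelRuntimeContext& ctx,
    const Tensor& self,
    const Tensor& other,
    Tensor& out) {
  (void)ctx; // Unused in this sketch; real kernels report errors through it.
  // ... elementwise addition into `out` would go here (hypothetical body) ...
  return out;
}
} // namespace my_ns

// Expands to a static register_kernel(make_boxed_kernel(...)) call, so the
// boxed wrapper is derived from the unboxed signature and registered at
// program load.
EXECUTORCH_LIBRARY(my_ns, "add_out", my_ns::add_out);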
