Commit 1621de2

add
1 parent 5660dba commit 1621de2

5 files changed: +169 -6 lines

common/utils/src/main/resources/error/error-conditions.json

Lines changed: 7 additions & 0 deletions
@@ -4086,6 +4086,13 @@
     ],
     "sqlState" : "42K0L"
   },
+  "LATERAL_JOIN_WITH_ARROW_UDTF_UNSUPPORTED" : {
+    "message" : [
+      "LATERAL JOIN with Arrow-optimized user-defined table functions (UDTFs) is not supported. Arrow UDTFs cannot be used on the right-hand side of a lateral join.",
+      "Please use a regular UDTF instead, or restructure your query to avoid the lateral join."
+    ],
+    "sqlState" : "0A000"
+  },
   "LOAD_DATA_PATH_NOT_EXISTS" : {
     "message" : [
       "LOAD DATA input path does not exist: <path>."

python/pyspark/sql/tests/arrow/test_arrow_udtf.py

Lines changed: 130 additions & 0 deletions
@@ -26,6 +26,7 @@
 
 if have_pyarrow:
     import pyarrow as pa
+    import pyarrow.compute as pc
 
 
 @unittest.skipIf(not have_pyarrow, pyarrow_requirement_message)
@@ -460,6 +461,135 @@ def eval(
         )
         assertDataFrameEqual(sql_result_df, expected_df)
 
+    def test_arrow_udtf_lateral_join_disallowed(self):
+        @arrow_udtf(returnType="x int, result int")
+        class SimpleArrowUDTF:
+            def eval(self, input_val: "pa.Array") -> Iterator["pa.Table"]:
+                val = input_val[0].as_py()
+                result_table = pa.table(
+                    {
+                        "x": pa.array([val], type=pa.int32()),
+                        "result": pa.array([val * 2], type=pa.int32()),
+                    }
+                )
+                yield result_table
+
+        self.spark.udtf.register("simple_arrow_udtf", SimpleArrowUDTF)
+
+        test_df = self.spark.createDataFrame([(1,), (2,), (3,)], "id int")
+        test_df.createOrReplaceTempView("test_table")
+
+        with self.assertRaisesRegex(Exception, "LATERAL_JOIN_WITH_ARROW_UDTF_UNSUPPORTED"):
+            self.spark.sql(
+                """
+                SELECT t.id, f.x, f.result
+                FROM test_table t, LATERAL simple_arrow_udtf(t.id) f
+                """
+            )
+
+    def test_arrow_udtf_lateral_join_with_table_argument_disallowed(self):
+        @arrow_udtf(returnType="filtered_id bigint")
+        class MixedArgsUDTF:
+            def eval(self, input_table: "pa.Table") -> Iterator["pa.Table"]:
+                filtered_data = input_table.filter(pc.greater(input_table["id"], 5))
+                result_table = pa.table({"filtered_id": filtered_data["id"]})
+                yield result_table
+
+        self.spark.udtf.register("mixed_args_udtf", MixedArgsUDTF)
+
+        test_df1 = self.spark.createDataFrame([(1,), (2,), (3,)], "id int")
+        test_df1.createOrReplaceTempView("test_table1")
+
+        test_df2 = self.spark.createDataFrame([(6,), (7,), (8,)], "id bigint")
+        test_df2.createOrReplaceTempView("test_table2")
+
+        # Table arguments create nested lateral joins where our CheckAnalysis rule doesn't
+        # trigger because the Arrow UDTF is in the inner lateral join, not the outer one
+        # our rule checks. So Spark's general lateral join validation catches this first
+        # with NON_DETERMINISTIC_LATERAL_SUBQUERIES.
+        with self.assertRaisesRegex(
+            Exception,
+            "UNSUPPORTED_SUBQUERY_EXPRESSION_CATEGORY.NON_DETERMINISTIC_LATERAL_SUBQUERIES",
+        ):
+            self.spark.sql(
+                """
+                SELECT t1.id, f.filtered_id
+                FROM test_table1 t1, LATERAL mixed_args_udtf(table(SELECT * FROM test_table2)) f
+                """
+            )
+
+    def test_arrow_udtf_with_table_argument_then_lateral_join_allowed(self):
+        @arrow_udtf(returnType="processed_id bigint")
+        class TableArgUDTF:
+            def eval(self, input_table: "pa.Table") -> Iterator["pa.Table"]:
+                processed_data = pc.add(input_table["id"], 100)
+                result_table = pa.table({"processed_id": processed_data})
+                yield result_table
+
+        self.spark.udtf.register("table_arg_udtf", TableArgUDTF)
+
+        source_df = self.spark.createDataFrame([(1,), (2,), (3,)], "id bigint")
+        source_df.createOrReplaceTempView("source_table")
+
+        join_df = self.spark.createDataFrame([("A",), ("B",), ("C",)], "label string")
+        join_df.createOrReplaceTempView("join_table")
+
+        result_df = self.spark.sql(
+            """
+            SELECT f.processed_id, j.label
+            FROM table_arg_udtf(table(SELECT * FROM source_table)) f,
+                 join_table j
+            ORDER BY f.processed_id, j.label
+            """
+        )
+
+        expected_data = [
+            (101, "A"),
+            (101, "B"),
+            (101, "C"),
+            (102, "A"),
+            (102, "B"),
+            (102, "C"),
+            (103, "A"),
+            (103, "B"),
+            (103, "C"),
+        ]
+        expected_df = self.spark.createDataFrame(expected_data, "processed_id bigint, label string")
+        assertDataFrameEqual(result_df, expected_df)
+
+    def test_arrow_udtf_table_argument_with_regular_udtf_lateral_join_allowed(self):
+        @arrow_udtf(returnType="computed_value int")
+        class ComputeUDTF:
+            def eval(self, input_table: "pa.Table") -> Iterator["pa.Table"]:
+                total = pc.sum(input_table["value"]).as_py()
+                result_table = pa.table({"computed_value": pa.array([total], type=pa.int32())})
+                yield result_table
+
+        from pyspark.sql.functions import udtf
+        from pyspark.sql.types import StructType, StructField, IntegerType
+
+        @udtf(returnType=StructType([StructField("multiplied", IntegerType())]))
+        class MultiplyUDTF:
+            def eval(self, input_val: int):
+                yield (input_val * 3,)
+
+        self.spark.udtf.register("compute_udtf", ComputeUDTF)
+        self.spark.udtf.register("multiply_udtf", MultiplyUDTF)
+
+        values_df = self.spark.createDataFrame([(10,), (20,), (30,)], "value int")
+        values_df.createOrReplaceTempView("values_table")
+
+        result_df = self.spark.sql(
+            """
+            SELECT c.computed_value, m.multiplied
+            FROM compute_udtf(table(SELECT * FROM values_table) WITH SINGLE PARTITION) c,
+                 LATERAL multiply_udtf(c.computed_value) m
+            """
+        )
+
+        expected_df = self.spark.createDataFrame([(60, 180)], "computed_value int, multiplied int")
+        assertDataFrameEqual(result_df, expected_df)
+
 
 class ArrowUDTFTests(ArrowUDTFTestsMixin, ReusedSQLTestCase):
     pass

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 11 additions & 6 deletions
@@ -2229,12 +2229,15 @@ class Analyzer(override val catalogManager: CatalogManager) extends RuleExecutor
             case _ => tvf
           }
 
-          Project(
-            Seq(UnresolvedStar(Some(Seq(alias)))),
-            LateralJoin(
-              tableArgs.map(_._2).reduceLeft(Join(_, _, Inner, None, JoinHint.NONE)),
-              LateralSubquery(SubqueryAlias(alias, tvfWithTableColumnIndexes)), Inner, None)
-          )
+          val lateralJoin = LateralJoin(
+            tableArgs.map(_._2).reduceLeft(Join(_, _, Inner, None, JoinHint.NONE)),
+            LateralSubquery(SubqueryAlias(alias, tvfWithTableColumnIndexes)), Inner, None)
+
+          // Set the tag so that it can be used to differentiate a lateral join added by
+          // a TABLE argument from one added by the user.
+          lateralJoin.setTagValue(LateralJoin.BY_TABLE_ARGUMENT, ())
+
+          Project(Seq(UnresolvedStar(Some(Seq(alias)))), lateralJoin)
         }
 
       case q: LogicalPlan =>
@@ -4249,6 +4252,8 @@ object RemoveTempResolvedColumn extends Rule[LogicalPlan] {
   }
 }
 
+
+
 /**
  * Rule that's used to handle `UnresolvedHaving` nodes with resolved `condition` and `child`.
  * It's placed outside the main batch to avoid conflicts with other rules that resolve
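
The tagging above uses Catalyst's TreeNodeTag mechanism: a typed key attached to a plan node during resolution and read back during validation. A minimal standalone sketch (OneRowRelation is used here only as a convenient plan node; the real tag is defined on the LateralJoin companion object in the last file of this commit):

    import org.apache.spark.sql.catalyst.plans.logical.OneRowRelation
    import org.apache.spark.sql.catalyst.trees.TreeNodeTag

    object TreeNodeTagSketch {
      def main(args: Array[String]): Unit = {
        // A typed key; with a Unit payload, mere presence of the tag is the signal.
        val byTableArgument = TreeNodeTag[Unit]("by_table_argument")

        val plan = OneRowRelation()
        assert(plan.getTagValue(byTableArgument).isEmpty)   // untagged: user-written plan
        plan.setTagValue(byTableArgument, ())               // mark analyzer-generated plans
        assert(plan.getTagValue(byTableArgument).isDefined) // CheckAnalysis can now skip it
      }
    }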

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala

Lines changed: 12 additions & 0 deletions
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.analysis
 import scala.collection.mutable
 
 import org.apache.spark.{SparkException, SparkThrowable}
+import org.apache.spark.api.python.PythonEvalType
 import org.apache.spark.sql.AnalysisException
 import org.apache.spark.sql.catalyst.ExtendedAnalysisException
 import org.apache.spark.sql.catalyst.analysis.ResolveWithCTE.checkIfSelfReferenceIsPlacedCorrectly
@@ -889,6 +890,17 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString
             messageParameters = Map(
               "invalidExprSqls" -> invalidExprSqls.mkString(", ")))
 
+        case j @ LateralJoin(_, right, _, _)
+            if j.getTagValue(LateralJoin.BY_TABLE_ARGUMENT).isEmpty =>
+          right.plan.foreach {
+            case Generate(pyudtf: PythonUDTF, _, _, _, _, _)
+                if pyudtf.evalType == PythonEvalType.SQL_ARROW_UDTF =>
+              j.failAnalysis(
+                errorClass = "LATERAL_JOIN_WITH_ARROW_UDTF_UNSUPPORTED",
+                messageParameters = Map.empty)
+            case _ =>
+          }
+
         case _ => // Analysis successful!
       }
     }
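
Net effect: only user-written lateral joins are rejected; the LateralJoin the analyzer synthesizes for a TABLE(...) argument carries the tag and is skipped. A sketch of the two shapes, assuming the same hypothetical session and Python-registered Arrow UDTFs as the earlier sketch:

    // Still allowed: the LateralJoin the analyzer itself builds for a TABLE(...)
    // argument carries BY_TABLE_ARGUMENT, so the new case above skips it.
    spark.sql(
      """SELECT f.processed_id
        |FROM table_arg_udtf(TABLE(SELECT * FROM source_table)) f""".stripMargin)

    // Rejected: a user-written LATERAL join whose right side is a Generate over a
    // PythonUDTF with eval type SQL_ARROW_UDTF has no tag and fails analysis.
    spark.sql(
      """SELECT t.id, f.x
        |FROM test_table t, LATERAL my_arrow_udtf(t.id) f""".stripMargin)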

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/basicLogicalOperators.scala

Lines changed: 9 additions & 0 deletions
@@ -2115,6 +2115,15 @@ case class LateralJoin(
   }
 }
 
+
+object LateralJoin {
+  /**
+   * A tag to identify if a LateralJoin was added by resolving a table argument.
+   */
+  val BY_TABLE_ARGUMENT = TreeNodeTag[Unit]("by_table_argument")
+}
+
+
 /**
  * A logical plan for as-of join.
  */
