
Commit c170ec9

[SPARK-53342][SQL] Fix Arrow converter to handle multiple record batches in single IPC stream
This PR adds a new method in ArrowConverters that properly decodes an Arrow IPC stream, which can contain multiple record batches. All of the other methods can only deal with message streams that contain exactly one record batch.

Previously, when an Arrow IPC stream contained multiple record batches, only the first batch would be processed and the remaining batches would be ignored. This resulted in data loss and incorrect results when working with Arrow data that was serialized as a single stream with multiple batches.

Yes, this is a user-facing change: it fixes a data correctness issue where users would lose data when processing Arrow streams with multiple batches. The behavior change is that all batches in a stream are now correctly processed instead of only the first one.

Added comprehensive test cases.

Generated-by: Claude Code
🤖 Generated with [Claude Code](https://claude.ai/code)

Closes apache#52090 from grundprinzip/SPARK-53342.

Authored-by: Martin Grund <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
1 parent d3d84e0 commit c170ec9
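To illustrate the stream layout this fix targets, here is a minimal, self-contained sketch (not part of the commit; the object name and values are illustrative) that uses the Arrow Java API from Scala to write a single IPC stream containing one schema message followed by two record batches. A reader that stops after the first batch silently drops the rows of the second, which is exactly the data loss described above.

import java.io.ByteArrayOutputStream
import scala.jdk.CollectionConverters._

import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.{IntVector, VectorSchemaRoot}
import org.apache.arrow.vector.ipc.ArrowStreamWriter
import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}

object MultiBatchStreamExample {
  def main(args: Array[String]): Unit = {
    val allocator = new RootAllocator(Long.MaxValue)
    val schema = new Schema(
      List(new Field("id", FieldType.nullable(new ArrowType.Int(32, true)), null)).asJava)
    val root = VectorSchemaRoot.create(schema, allocator)
    val out = new ByteArrayOutputStream()
    val writer = new ArrowStreamWriter(root, null, out)
    writer.start()

    val ids = root.getVector("id").asInstanceOf[IntVector]

    // First record batch: one row.
    ids.allocateNew(1)
    ids.setSafe(0, 1)
    root.setRowCount(1)
    writer.writeBatch()

    // Second record batch, appended to the same stream: two more rows.
    ids.setSafe(0, 2)
    ids.setSafe(1, 3)
    root.setRowCount(2)
    writer.writeBatch()

    writer.end()
    // One schema message + two record batches; only a reader that keeps calling
    // loadNextBatch() until it returns false sees all three rows.
    val ipcStreamBytes: Array[Byte] = out.toByteArray

    writer.close()
    root.close()
    allocator.close()
  }
}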

File tree

3 files changed (+537, -2 lines)


connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala

Lines changed: 1 addition & 2 deletions
@@ -704,8 +704,7 @@ class SparkConnectPlanner(val session: SparkSession) {
     }

     if (rel.hasData) {
-      val (rows, structType) = ArrowConverters.fromBatchWithSchemaIterator(
-        Iterator(rel.getData.toByteArray),
+      val (rows, structType) = ArrowConverters.fromIPCStream(rel.getData.toByteArray,
         TaskContext.get())
       if (structType == null) {
         throw InvalidPlanInput(s"Input data for LocalRelation does not produce a schema.")
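A hedged sketch of what this call-site change means, using identifiers from the diff; `decodeLocalRelationData` is a hypothetical helper, and `fromIPCStream` is `private[sql]`, so such code only compiles inside the `org.apache.spark.sql` package tree.

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.StructType

// `bytes` is a complete Arrow IPC stream (schema message + N record batches),
// e.g. rel.getData.toByteArray in the planner above.
def decodeLocalRelationData(bytes: Array[Byte]): (Iterator[InternalRow], StructType) = {
  // fromIPCStream consumes the whole stream, so the returned iterator covers
  // every record batch rather than only the first one, which is all the old
  // fromBatchWithSchemaIterator path decoded.
  ArrowConverters.fromIPCStream(bytes, TaskContext.get())
}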

sql/core/src/main/scala/org/apache/spark/sql/execution/arrow/ArrowConverters.scala

Lines changed: 120 additions & 0 deletions
@@ -234,6 +234,109 @@ private[sql] object ArrowConverters extends Logging {
     }
   }

+  /**
+   * This is a class that converts input data in the form of a Byte array to InternalRow instances
+   * implementing the Iterator interface.
+   *
+   * The input data must be a valid Arrow IPC stream; this means that the first message is always
+   * the schema followed by N record batches.
+   *
+   * @param input Input data
+   * @param context Task context for Spark
+   */
+  private[sql] class InternalRowIteratorFromIPCStream(
+      input: Array[Byte],
+      context: TaskContext) extends Iterator[InternalRow] {
+
+    // Keep all the resources we have opened in order; they should be closed
+    // in reverse order at the end.
+    private val resources = new ArrayBuffer[AutoCloseable]()
+
+    // Create an allocator used for all Arrow related memory.
+    protected val allocator: BufferAllocator = ArrowUtils.rootAllocator.newChildAllocator(
+      s"to${this.getClass.getSimpleName}",
+      0,
+      Long.MaxValue)
+    resources.append(allocator)
+
+    private val reader = try {
+      new ArrowStreamReader(new ByteArrayInputStream(input), allocator)
+    } catch {
+      case e: Exception =>
+        closeAll(resources.toSeq.reverse: _*)
+        throw new IllegalArgumentException(
+          s"Failed to create ArrowStreamReader: ${e.getMessage}", e)
+    }
+    resources.append(reader)
+
+    private val root: VectorSchemaRoot = try {
+      reader.getVectorSchemaRoot
+    } catch {
+      case e: Exception =>
+        closeAll(resources.toSeq.reverse: _*)
+        throw new IllegalArgumentException(
+          s"Failed to read schema from IPC stream: ${e.getMessage}", e)
+    }
+    resources.append(root)
+
+    val schema: StructType = try {
+      ArrowUtils.fromArrowSchema(root.getSchema)
+    } catch {
+      case e: Exception =>
+        closeAll(resources.toSeq.reverse: _*)
+        throw new IllegalArgumentException(s"Failed to convert Arrow schema: ${e.getMessage}", e)
+    }
+
+    // TODO: wrap in exception
+    private var rowIterator: Iterator[InternalRow] = vectorSchemaRootToIter(root)
+
+    // Metrics to track batch processing
+    private var _batchesLoaded: Int = 0
+    private var _totalRowsProcessed: Long = 0L
+
+    if (context != null) {
+      context.addTaskCompletionListener[Unit] { _ =>
+        closeAll(resources.toSeq.reverse: _*)
+      }
+    }
+
+    // Public accessors for metrics
+    def batchesLoaded: Int = _batchesLoaded
+    def totalRowsProcessed: Long = _totalRowsProcessed
+
+    // Loads the next batch from the Arrow reader and returns true if it could
+    // be loaded, false otherwise.
+    private def loadNextBatch(): Boolean = {
+      if (reader.loadNextBatch()) {
+        rowIterator = vectorSchemaRootToIter(root)
+        _batchesLoaded += 1
+        true
+      } else {
+        false
+      }
+    }
+
+    override def hasNext: Boolean = {
+      if (rowIterator.hasNext) {
+        true
+      } else {
+        if (!loadNextBatch()) {
+          false
+        } else {
+          hasNext
+        }
+      }
+    }
+
+    override def next(): InternalRow = {
+      if (!hasNext) {
+        throw new NoSuchElementException("No more elements in iterator")
+      }
+      _totalRowsProcessed += 1
+      rowIterator.next()
+    }
+  }
+
   /**
    * An InternalRow iterator which parses data from serialized ArrowRecordBatches; subclasses should
    * implement [[nextBatch]] to parse data from binary records.
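The iteration pattern above (drain the current batch, then lazily pull the next one from the reader) can be illustrated independently of Arrow. The following standalone sketch, with hypothetical names, mirrors the hasNext/loadNextBatch logic.

// Wrap a sequence of "batches" behind a single flat Iterator, loading the next
// batch lazily only when the current one is exhausted. Purely illustrative.
class BatchChainIterator[T](batches: Iterator[Seq[T]]) extends Iterator[T] {
  private var current: Iterator[T] = Iterator.empty

  override def hasNext: Boolean = {
    if (current.hasNext) {
      true
    } else if (batches.hasNext) {
      // Advance to the next batch (skipping empty ones via the recursion).
      current = batches.next().iterator
      hasNext
    } else {
      false
    }
  }

  override def next(): T = {
    if (!hasNext) throw new NoSuchElementException("No more elements in iterator")
    current.next()
  }
}

For example, `new BatchChainIterator(Iterator(Seq(1), Seq(2, 3))).toList` yields `List(1, 2, 3)`, analogous to reading every record batch of an IPC stream instead of only the first.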
@@ -345,6 +448,23 @@ private[sql] object ArrowConverters extends Logging {
     (iterator, iterator.schema)
   }

+  /**
+   * Creates an iterator from a Byte array to deserialize an Arrow IPC stream with exactly
+   * one schema and a varying number of record batches. Returns an iterator over the
+   * created InternalRow instances.
+   */
+  private[sql] def fromIPCStream(input: Array[Byte], context: TaskContext):
+      (Iterator[InternalRow], StructType) = {
+    fromIPCStreamWithIterator(input, context)
+  }
+
+  // Variant used by tests to access the iterator with its metrics.
+  private[sql] def fromIPCStreamWithIterator(input: Array[Byte], context: TaskContext):
+      (InternalRowIteratorFromIPCStream, StructType) = {
+    val iterator = new InternalRowIteratorFromIPCStream(input, context)
+    (iterator, iterator.schema)
+  }
+
   /**
    * Convert an arrow batch container into an iterator of InternalRow.
    */
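A hedged, test-style sketch of how these entry points might be exercised. Both methods are `private[sql]`, so this only compiles from code inside the `org.apache.spark.sql` package; `ipcStreamBytes` is assumed to be the two-batch stream built in the first example, and the assertions reflect that assumption.

import org.apache.spark.TaskContext
import org.apache.spark.sql.execution.arrow.ArrowConverters

// Plain entry point: yields the rows of every batch in the stream.
val (rows, schema) = ArrowConverters.fromIPCStream(ipcStreamBytes, TaskContext.get())
assert(rows.size == 3)  // all three rows, not just the first batch's single row

// Test-facing variant: exposes the iterator so batch/row metrics can be checked.
// A null TaskContext is tolerated; the completion listener is simply not registered.
val (iter, _) = ArrowConverters.fromIPCStreamWithIterator(ipcStreamBytes, null)
val all = iter.toArray
assert(iter.batchesLoaded == 2)                // both record batches were loaded
assert(iter.totalRowsProcessed == all.length)  // row metric matches the rows returned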
