review comments

szehon-ho · szehon-ho · commit 667829e5e748 · 2025-07-25T14:27:58.000-07:00
diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/BatchWrite.java b/sql/catalyst/src/main/java/org/apache/spark/sql/connector/write/BatchWrite.java
@@ -87,9 +87,53 @@ default void onDataWriterCommit(WriterCommitMessage message) {}
    * disable this behavior by overriding {@link #useCommitCoordinator()}. If disabled, multiple
    * tasks may have committed successfully and one successful commit message per task will be
    * passed to this commit method. The remaining commit messages are ignored by Spark.
+   *
    */
   void commit(WriterCommitMessage[] messages);
 
+  /**
+   * Commits this writing job with a list of commit messages and operation metrics.
+   * <p>
+   * If this method fails (by throwing an exception), this writing job is considered to have
+   * failed, and {@link #abort(WriterCommitMessage[])} would be called. The state of the destination
+   * is undefined and {@link #abort(WriterCommitMessage[])} may not be able to deal with it.
+   * <p>
+   * Note that speculative execution may cause multiple tasks to run for a partition. By default,
+   * Spark uses the commit coordinator to allow at most one task to commit. Implementations can
+   * disable this behavior by overriding {@link #useCommitCoordinator()}. If disabled, multiple
+   * tasks may have committed successfully and one successful commit message per task will be
+   * passed to this commit method. The remaining commit messages are ignored by Spark.
+   * <p>
+   * @param messages a list of commit messages from successful data writers, produced by
+   *                 {@link DataWriter#commit()}.
+   * @param metrics a map of operation metrics collected from the query producing the write.
+   *                The keys will be prefixed by operation type, e.g. `merge`.
+   *                <p>
+   *                Currently supported metrics are:
+   *                <ul>
+   *                  <li>Operation Type = `merge`
+   *                    <ul>
+   *                      <li>`numTargetRowsCopied`: number of target rows copied unmodified because
+   *                      they did not match any action</li>
+   *                      <li>`numTargetRowsDeleted`: number of target rows deleted</li>
+   *                      <li>`numTargetRowsUpdated`: number of target rows updated</li>
+   *                      <li>`numTargetRowsInserted`: number of target rows inserted</li>
+   *                      <li>`numTargetRowsMatchedUpdated`: number of target rows updated by a
+   *                      matched clause</li>
+   *                      <li>`numTargetRowsMatchedDeleted`: number of target rows deleted by a
+   *                      matched clause</li>
+   *                      <li>`numTargetRowsNotMatchedBySourceUpdated`: number of target rows
+   *                      updated by a not matched by source clause</li>
+   *                      <li>`numTargetRowsNotMatchedBySourceDeleted`: number of target rows
+   *                      deleted by a not matched by source clause</li>
+   *                    </ul>
+   *                  </li>
+   *                </ul>
+   */
+  default void commit(WriterCommitMessage[] messages, Map<String, Long> metrics) {
+    commit(messages);
+  }
+
   /**
    * Aborts this writing job because some data writers are failed and keep failing when retry,
    * or the Spark job fails with some unknown reasons,
@@ -106,14 +150,4 @@ default void onDataWriterCommit(WriterCommitMessage message) {}
    * clean up the data left by data writers.
    */
   void abort(WriterCommitMessage[] messages);
-
-  /**
-   * Similar to {@link #commit(WriterCommitMessage[])}, but providing operation metrics to
-   * this batch write.
-   * @param metrics operation metrics.  The keys will be prefixed by operation type, eg `merge`
-   */
-  default void commitWithOperationMetrics(
-    WriterCommitMessage[] messages, Map<String, Long> metrics) {
-    commit(messages);
-  }
 }
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/connector/catalog/InMemoryRowLevelOperationTable.scala
@@ -116,7 +116,7 @@ class InMemoryRowLevelOperationTable(
 
   abstract class RowLevelOperationBatchWrite extends TestBatchWrite {
 
-    override def commitWithOperationMetrics(messages: Array[WriterCommitMessage],
+    override def commit(messages: Array[WriterCommitMessage],
                                             metrics: util.Map[String, lang.Long]): Unit = {
       metrics.asScala.map {
         case (key, value) => commitProperties += key -> String.valueOf(value)
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/WriteToDataSourceV2Exec.scala
@@ -17,7 +17,8 @@
 
 package org.apache.spark.sql.execution.datasources.v2
 
-import java.lang.{Long => JLong}
+import java.lang
+import java.util
 
 import scala.jdk.CollectionConverters._
 
@@ -454,13 +455,9 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode with AdaptiveSpa
         }
       )
 
-      val operationMetricOpt = getOperationMetrics(query)
+      val operationMetrics = getOperationMetrics(query)
       logInfo(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} is committing.")
-      operationMetricOpt match {
-        case Some(metrics) => batchWrite.commitWithOperationMetrics(messages,
-          metrics.map{ case (name, value) => name -> JLong.valueOf(value) }.asJava)
-        case None => batchWrite.commit(messages)
-      }
+      batchWrite.commit(messages, operationMetrics)
       logInfo(log"Data source write support ${MDC(LogKeys.BATCH_WRITE, batchWrite)} committed.")
       commitProgress = Some(StreamWriterCommitProgress(totalNumRowsAccumulator.value))
     } catch {
@@ -483,10 +480,10 @@ trait V2TableWriteExec extends V2CommandExec with UnaryExecNode with AdaptiveSpa
     Nil
   }
 
-  private def getOperationMetrics(query: SparkPlan): Option[Map[String, Long]] = {
+  private def getOperationMetrics(query: SparkPlan): util.Map[String, lang.Long] = {
     collectFirst(query) { case m: MergeRowsExec => m }.map{ n =>
-      n.metrics.map { case (name, metric) => s"merge.$name" -> metric.value }
-    }
+      n.metrics.map { case (name, metric) => s"merge.$name" -> lang.Long.valueOf(metric.value) }
+    }.getOrElse(Map.empty[String, lang.Long]).asJava
   }
 }