[SPARK-53349][SQL] Optimized XML parser can't handle corrupted files correctly

xiaonanyang-db · HyukjinKwon · commit f0a3a2ea1611 · 2025-08-26T15:08:00.000+09:00
### What changes were proposed in this pull request?

In #51287, we introduced an optimized XML parser, which is more memory-efficient. However, the new parser reads the input stream eagerly on initialization. If the file is corrupted, the resulting error is not caught and therefore is not handled according to the `ignoreCorruptFiles` option. This PR addresses the issue.

### Why are the changes needed?

Bug fix.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

New tests.

### Was this patch authored or co-authored using generative AI tooling?

Closes #52093 from xiaonanyang-db/SPARK-53349.

Authored-by: Xiaonan Yang <xiaonan.yang@databricks.com>
Signed-off-by: Hyukjin Kwon <gurwls223@apache.org>
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXMLRecordReader.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXMLRecordReader.scala
@@ -37,15 +37,15 @@ case class StaxXMLRecordReader(inputStream: () => InputStream, options: XmlOptio
     extends XMLEventReader
     with Logging {
   // Reader for the XML record parsing.
-  private val in1 = inputStream()
-  private val primaryEventReader = StaxXmlParserUtils.filteredReader(in1, options)
+  private lazy val in1 = inputStream()
+  private lazy val primaryEventReader = StaxXmlParserUtils.filteredReader(in1, options)
 
   private val xsdSchemaValidator = Option(options.rowValidationXSDPath)
     .map(path => ValidatorUtil.getSchema(path).newValidator())
   // Reader for the XSD validation, if an XSD schema is provided.
-  private val in2 = xsdSchemaValidator.map(_ => inputStream())
+  private lazy val in2 = xsdSchemaValidator.map(_ => inputStream())
   // An XMLStreamReader used by StAXSource for XSD validation.
-  private val xsdValidationStreamReader =
+  private lazy val xsdValidationStreamReader =
     in2.map(in => StaxXmlParserUtils.filteredStreamReader(in, options))
 
   final var hasMoreRecord: Boolean = true
@@ -108,11 +108,18 @@ case class StaxXMLRecordReader(inputStream: () => InputStream, options: XmlOptio
   }
 
   override def close(): Unit = {
-    primaryEventReader.close()
-    xsdValidationStreamReader.foreach(_.close())
-    in1.close()
-    in2.foreach(_.close())
     hasMoreRecord = false
+    try {
+      in1.close()
+      in2.foreach(_.close())
+      primaryEventReader.close()
+      xsdValidationStreamReader.foreach(_.close())
+    } catch {
+      case NonFatal(e) =>
+        // If the file is corrupted/missing, we won't be able to close the input streams. We do a
+        // best-effort to close the streams and log the error if closing fails.
+        logWarning("Error closing XML stream", e)
+    }
   }
 
   override def nextEvent(): XMLEvent = primaryEventReader.nextEvent()
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala
@@ -269,7 +269,7 @@ class StaxXmlParser(
             logWarning("Skipped missing file", e)
             parser.close()
             None
-          case _: IOException | _: RuntimeException | _: InternalError
+          case _: IOException | _: RuntimeException | _: InternalError | _: AssertionError
               if options.ignoreCorruptFiles =>
             logWarning("Skipped the rest of the content in the corrupted file", e)
             parser.close()
@@ -289,6 +289,7 @@ class StaxXmlParser(
               StaxXmlParserUtils.currentElementAsString(parser, options.rowTag, options).trim
             )
             throw BadRecordException(() => record, () => Array.empty, e)
+          case _ => throw e
         }
     }
   }
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/XmlInferSchema.scala
@@ -273,11 +273,15 @@ class XmlInferSchema(options: XmlOptions, caseSensitive: Boolean)
           case _: AccessControlException | _: BlockMissingException =>
             parser.close()
             throw e
-          case _: IOException | _: RuntimeException | _: InternalError
+          case _: IOException | _: RuntimeException | _: InternalError | _: AssertionError
               if options.ignoreCorruptFiles =>
             logWarning("Skipped the rest of the content in the corrupted file", e)
             parser.close()
             Some(StructType(Nil))
+          case _: IOException | _: RuntimeException | _: InternalError
+              if !options.ignoreCorruptFiles =>
+            parser.close()
+            throw e
           case _ =>
             logWarning("Failed to infer schema from XML record", e)
             handleXmlErrorsByParseMode(
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/xml/XmlDataSource.scala
@@ -235,26 +235,12 @@ object MultiLineXmlDataSource extends XmlDataSource {
 
     val xmlParserRdd: RDD[StaxXMLRecordReader] =
       xml.flatMap { portableDataStream =>
-        try {
-          val inputStream = () =>
-            CodecStreams.createInputStreamWithCloseResource(
-              portableDataStream.getConfiguration,
-              new Path(portableDataStream.getPath())
-            )
-          StaxXmlParser.convertStream(inputStream, parsedOptions)(identity)
-        } catch {
-          case e: FileNotFoundException if parsedOptions.ignoreMissingFiles =>
-            logWarning("Skipped missing file", e)
-            None
-          case NonFatal(e) =>
-            Utils.getRootCause(e) match {
-              case _: RuntimeException | _: IOException | _: InternalError
-                  if parsedOptions.ignoreCorruptFiles =>
-                logWarning("Skipped the rest of the content in the corrupted file", e)
-                None
-              case o => throw o
-            }
-        }
+        val inputStream = () =>
+          CodecStreams.createInputStreamWithCloseResource(
+            portableDataStream.getConfiguration,
+            new Path(portableDataStream.getPath())
+          )
+        StaxXmlParser.convertStream(inputStream, parsedOptions)(identity)
       }
 
     SQLExecution.withSQLConfPropagated(sparkSession) {
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/xml/XmlSuite.scala
@@ -3021,6 +3021,13 @@ class XmlSuite
            .xml(inputFile.toURI.toString)
            .collect()
         assert(result.isEmpty)
+
+        val result2 = spark.read
+          .option("rowTag", "ROW")
+          .option("multiLine", true)
+          .xml(inputFile.toURI.toString)
+          .collect()
+        assert(result2.isEmpty)
       }
     })
     withTempPath { dir =>

Original file line number	Diff line number	Diff line change
`@@ -269,7 +269,7 @@ class StaxXmlParser(`
`269`	`269`	`logWarning("Skipped missing file", e)`
`270`	`270`	`parser.close()`
`271`	`271`	`None`
`272`		`- case _: IOException \| _: RuntimeException \| _: InternalError`
	`272`	`+ case _: IOException \| _: RuntimeException \| _: InternalError \| _: AssertionError`
`273`	`273`	`if options.ignoreCorruptFiles =>`
`274`	`274`	`logWarning("Skipped the rest of the content in the corrupted file", e)`
`275`	`275`	`parser.close()`
`@@ -289,6 +289,7 @@ class StaxXmlParser(`
`289`	`289`	`StaxXmlParserUtils.currentElementAsString(parser, options.rowTag, options).trim`
`290`	`290`	`)`
`291`	`291`	`throw BadRecordException(() => record, () => Array.empty, e)`
	`292`	`+ case _ => throw e`
`292`	`293`	`}`
`293`	`294`	`}`
`294`	`295`	`}`