
Commit 549c30a

xiaonanyang-db authored and cloud-fan committed

[SPARK-52582][SQL] Improve the memory usage of XML parser
### What changes were proposed in this pull request?

Today, the XML parser is not memory efficient. It loads each XML record fully into memory before parsing, which causes OOMs when input XML records are large. This PR improves the parser to parse XML records token by token, avoiding copying entire XML records into memory ahead of time. The improved parser uses less memory when parsing large XML files than the legacy parser. However, it enforces stricter validation to ensure the XML is well-formed:

1. The legacy parser does not deterministically recover all valid records from a malformed file. The improved parser, by contrast, stops processing the file at the point where malformedness is detected.
2. The legacy parser could handle malformed XML files with multiple root tags, whereas the improved parser only reads the records under the first root tag.

The improved parser is enabled by default, but users can fall back to the legacy parser via the `spark.sql.xml.legacyParser.enabled` SQL conf.

### Why are the changes needed?

To solve the OOM issue in XML ingestion.

### Does this PR introduce _any_ user-facing change?

No. The new behavior is disabled by default for now.

### How was this patch tested?

New UTs.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #51287 from xiaonanyang-db/SPARK-52582.

Authored-by: Xiaonan Yang <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 594d26c commit 549c30a
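
For readers who want to try this out, here is a minimal usage sketch (not part of this commit): the conf name comes from the description above, while the row tag and input path are hypothetical. Since Spark 4.0 the XML source is built in, so reads like these should work out of the box:

// Default: the improved token-by-token parser.
val df = spark.read
  .option("rowTag", "book") // hypothetical row tag
  .xml("/tmp/books.xml")    // hypothetical input path

// Opt back into the legacy parser, which materializes each record in memory.
spark.conf.set("spark.sql.xml.legacyParser.enabled", "true")
val legacyDf = spark.read
  .option("rowTag", "book")
  .xml("/tmp/books.xml")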

File tree

21 files changed: +7217 −3721 lines changed
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXMLRecordReader.scala

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.spark.sql.catalyst.xml

import java.io.InputStream
import javax.xml.stream.{XMLEventReader, XMLStreamConstants, XMLStreamReader}
import javax.xml.stream.events.{EndDocument, StartElement, XMLEvent}
import javax.xml.transform.stax.StAXSource

import scala.util.control.NonFatal

import org.apache.hadoop.shaded.com.ctc.wstx.exc.WstxEOFException

import org.apache.spark.internal.Logging
import org.apache.spark.util.SparkErrorUtils

/**
 * XML record reader that reads the next XML record in the underlying XML stream. It can support
 * XSD schema validation by maintaining a separate XML reader and keeping it in sync with the
 * primary XML reader.
 */
case class StaxXMLRecordReader(inputStream: () => InputStream, options: XmlOptions)
  extends XMLEventReader
  with Logging {
  // Reader for the XML record parsing.
  private val in1 = inputStream()
  private val primaryEventReader = StaxXmlParserUtils.filteredReader(in1, options)

  private val xsdSchemaValidator = Option(options.rowValidationXSDPath)
    .map(path => ValidatorUtil.getSchema(path).newValidator())
  // Reader for the XSD validation, if an XSD schema is provided.
  private val in2 = xsdSchemaValidator.map(_ => inputStream())
  // An XMLStreamReader used by StAXSource for XSD validation.
  private val xsdValidationStreamReader =
    in2.map(in => StaxXmlParserUtils.filteredStreamReader(in, options))

  final var hasMoreRecord: Boolean = true

  /**
   * Skip through the XML stream until we find the next row start element.
   * Returns true if a row start element is found, false if the end of the stream is reached.
   */
  def skipToNextRecord(): Boolean = {
    hasMoreRecord = skipToNextRowStart()
    if (hasMoreRecord) {
      xsdValidationStreamReader.foreach(validateXSDSchema)
    } else {
      close()
    }
    hasMoreRecord
  }

  /**
   * Skip through the XML stream until we find the next row start element.
   */
  private def skipToNextRowStart(): Boolean = {
    val rowTagName = options.rowTag
    try {
      while (primaryEventReader.hasNext) {
        val event = primaryEventReader.peek()
        event match {
          case startElement: StartElement =>
            val elementName = StaxXmlParserUtils.getName(startElement.getName, options)
            if (elementName == rowTagName) {
              return true
            }
          case _: EndDocument =>
            return false
          case _ =>
            // Continue searching.
        }
        // If not the event we want, advance the reader.
        primaryEventReader.nextEvent()
      }
      false
    } catch {
      case NonFatal(e) if SparkErrorUtils.getRootCause(e).isInstanceOf[WstxEOFException] =>
        logWarning("Reached end of file while looking for next row start element.")
        false
    }
  }

  private def validateXSDSchema(streamReader: XMLStreamReader): Unit = {
    // StAXSource requires the stream reader to start with the START_DOCUMENT or START_ELEMENT
    // events.
    def rowTagStarted: Boolean =
      streamReader.getEventType == XMLStreamConstants.START_ELEMENT &&
        StaxXmlParserUtils.getName(streamReader.getName, options) == options.rowTag
    while (!rowTagStarted && streamReader.hasNext) {
      streamReader.next()
    }
    xsdSchemaValidator.get.reset()
    xsdSchemaValidator.get.validate(new StAXSource(streamReader))
  }

  override def close(): Unit = {
    primaryEventReader.close()
    xsdValidationStreamReader.foreach(_.close())
    in1.close()
    in2.foreach(_.close())
    hasMoreRecord = false
  }

  override def nextEvent(): XMLEvent = primaryEventReader.nextEvent()
  override def hasNext: Boolean = primaryEventReader.hasNext
  override def peek(): XMLEvent = primaryEventReader.peek()
  override def getElementText: String = primaryEventReader.getElementText
  override def nextTag(): XMLEvent = primaryEventReader.nextTag()
  override def getProperty(name: String): AnyRef = primaryEventReader.getProperty(name)
  override def next(): AnyRef = primaryEventReader.next()
}
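
To make the reader's contract concrete, here is a minimal consumption loop. It is a sketch only: convertStream in StaxXmlParser.scala below drives the reader this way, but the counting logic and the assumption that row tags are not nested are mine.

// Sketch: count records by advancing the reader until the stream is exhausted.
def countRecords(inputStream: () => InputStream, options: XmlOptions): Long = {
  val reader = StaxXMLRecordReader(inputStream, options)
  var n = 0L
  try {
    // skipToNextRecord() positions the reader at the next rowTag start element
    // (running XSD validation when configured) and returns false at end of stream.
    while (reader.skipToNextRecord()) {
      reader.nextEvent() // consume the row start element so the next scan advances
      n += 1
    }
  } finally {
    reader.close() // skipToNextRecord() also closes internally at end of stream
  }
  n
}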

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/xml/StaxXmlParser.scala

Lines changed: 132 additions & 7 deletions
@@ -33,6 +33,7 @@ import scala.util.control.Exception.allCatch
 import scala.util.control.NonFatal
 import scala.xml.SAXException

+import com.google.common.io.ByteStreams
 import org.apache.hadoop.hdfs.BlockMissingException
 import org.apache.hadoop.security.AccessControlException
@@ -50,7 +51,7 @@ import org.apache.spark.types.variant.{Variant, VariantBuilder}
 import org.apache.spark.types.variant.VariantBuilder.FieldEntry
 import org.apache.spark.types.variant.VariantUtil
 import org.apache.spark.unsafe.types.{UTF8String, VariantVal}
-import org.apache.spark.util.Utils
+import org.apache.spark.util.{SparkErrorUtils, Utils}

 class StaxXmlParser(
     schema: StructType,
@@ -127,12 +128,12 @@ class StaxXmlParser(
     // is not manually specified, then fall back to DROPMALFORMED, which will return
     // null column values where parsing fails.
     val parseMode =
-    if (options.parseMode == PermissiveMode &&
-      !schema.fields.exists(_.name == options.columnNameOfCorruptRecord)) {
-      DropMalformedMode
-    } else {
-      options.parseMode
-    }
+      if (options.parseMode == PermissiveMode &&
+        !schema.fields.exists(_.name == options.columnNameOfCorruptRecord)) {
+        DropMalformedMode
+      } else {
+        options.parseMode
+      }
     val xsdSchema = Option(options.rowValidationXSDPath).map(ValidatorUtil.getSchema)
     doParseColumn(xml, parseMode, xsdSchema).orNull
   }
@@ -188,6 +189,110 @@ class StaxXmlParser(
     }
   }

+  /**
+   * XML stream parser that reads XML records from the input file stream sequentially, without
+   * loading each individual XML record string into memory.
+   */
+  def parseStreamOptimized(
+      inputStream: () => InputStream,
+      schema: StructType): Iterator[InternalRow] = {
+    val streamLiteral = () =>
+      Utils.tryWithResource(inputStream()) { is =>
+        UTF8String.fromBytes(ByteStreams.toByteArray(is))
+      }
+    val safeParser = new FailureSafeParser[StaxXMLRecordReader](
+      input => doParseColumnOptimized(input, streamLiteral),
+      options.parseMode,
+      schema,
+      options.columnNameOfCorruptRecord)
+
+    convertStream(inputStream, options) { reader =>
+      safeParser.parse(reader)
+    }.flatten
+  }
+
+  /**
+   * Parse the next XML record from the XML event stream.
+   * Note that this method will **NOT** close the XML event stream, as there could be more XML
+   * records to parse. It is the caller's responsibility to close the stream.
+   *
+   * @param parser The XML event reader.
+   * @param xmlLiteral A function that returns the entire XML file content as a UTF8String. Used
+   *                   to create a BadRecordException in case of parsing errors.
+   *                   TODO: Only include the file content starting with the current record.
+   */
+  def doParseColumnOptimized(
+      parser: StaxXMLRecordReader,
+      xmlLiteral: () => UTF8String): Option[InternalRow] = {
+    try {
+      if (!parser.skipToNextRecord()) {
+        return None
+      }
+
+      options.singleVariantColumn match {
+        case Some(_) =>
+          // If singleVariantColumn is specified, parse the entire XML record as a Variant.
+          val v = StaxXmlParser.parseVariant(parser, options)
+          Some(InternalRow(v))
+        case _ =>
+          // Otherwise, parse the XML record as structs.
+          val rootAttributes = parser.nextEvent().asStartElement.getAttributes.asScala.toArray
+          Some(convertObject(parser, schema, rootAttributes))
+      }
+    } catch {
+      case e: SparkUpgradeException =>
+        parser.close()
+        throw e
+      case e: CharConversionException if options.charset.isEmpty =>
+        val msg =
+          """XML parser cannot handle a character in its input.
+            |Specifying encoding as an input option explicitly might help to resolve the issue.
+            |""".stripMargin + e.getMessage
+        val wrappedCharException = new CharConversionException(msg)
+        wrappedCharException.initCause(e)
+        throw BadRecordException(xmlLiteral, () => Array.empty, wrappedCharException)
+      case PartialResultException(row, cause) =>
+        throw BadRecordException(
+          record = xmlLiteral,
+          partialResults = () => Array(row),
+          cause)
+      case PartialResultArrayException(rows, cause) =>
+        throw BadRecordException(record = xmlLiteral, partialResults = () => rows, cause)
+      case e: Throwable =>
+        SparkErrorUtils.getRootCause(e) match {
+          case _: FileNotFoundException if options.ignoreMissingFiles =>
+            logWarning("Skipped missing file", e)
+            parser.close()
+            None
+          case _: IOException | _: RuntimeException | _: InternalError
+              if options.ignoreCorruptFiles =>
+            logWarning("Skipped the rest of the content in the corrupted file", e)
+            parser.close()
+            None
+          case _: XMLStreamException | _: MalformedInputException =>
+            // Skip the rest of the content in the parser and put the whole XML file in the
+            // BadRecordException.
+            parser.close()
+            // The XML parser currently doesn't support partial results for corrupted records.
+            // For such records, all fields other than the field configured by
+            // `columnNameOfCorruptRecord` are set to `null`.
+            throw BadRecordException(xmlLiteral, () => Array.empty, e)
+          case _: SAXException =>
+            // XSD validation failed: throw a bad record exception and continue parsing the
+            // remaining records.
+            val record = UTF8String.fromString(
+              StaxXmlParserUtils.currentElementAsString(parser, options.rowTag, options).trim)
+            throw BadRecordException(() => record, () => Array.empty, e)
+        }
+    }
+  }
+
   /**
    * Parse the current token (and related children) according to a desired schema
    */
@@ -929,6 +1034,20 @@ object StaxXmlParser {
     }
   }

+  def convertStream[T](inputStream: () => InputStream, options: XmlOptions)(
+      convert: StaxXMLRecordReader => T): Iterator[T] = new Iterator[T] {
+    private val reader = StaxXMLRecordReader(inputStream, options)
+
+    override def hasNext: Boolean = reader.hasMoreRecord
+
+    override def next(): T = {
+      if (!hasNext) {
+        throw QueryExecutionErrors.endOfStreamError()
+      }
+      convert(reader)
+    }
+  }
+
   /**
    * Parse the input XML string as a Variant value
    */
@@ -940,6 +1059,12 @@ object StaxXmlParser {
     v
   }

+  def parseVariant(parser: StaxXMLRecordReader, options: XmlOptions): VariantVal = {
+    val rootAttributes = parser.nextEvent().asStartElement.getAttributes.asScala.toArray
+    val v = convertVariant(parser, rootAttributes, options)
+    new VariantVal(v.getValue, v.getMetadata)
+  }
+
   /**
    * Parse an XML element from the XML event stream into a Variant.
   * This method transforms the XML element along with its attributes and child elements
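
The singleVariantColumn branch in doParseColumnOptimized above corresponds to reads like the following sketch. The option and rowTag names come from this diff and the XML source's options; the column name and input path are hypothetical:

// Parse each <book> record into a single VARIANT column instead of a struct schema.
val variantDf = spark.read
  .option("rowTag", "book")             // hypothetical row tag
  .option("singleVariantColumn", "var") // each record parsed via parseVariant
  .xml("/tmp/books.xml")                // hypothetical input path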
