[SPARK-44751][SQL] XML FileFormat Interface implementation #42462

Closed
wants to merge 12 commits into from
5 changes: 5 additions & 0 deletions common/utils/src/main/resources/error/error-classes.json
@@ -589,6 +589,11 @@
"<errors>"

This is great! I had been thinking about upgrading the XML reader to Data Source V2 before, but was put off by the refactoring work involved. Thanks for adding it to the Spark mainline to unify the interfaces and catch up with the main changes.

Contributor Author

Thanks for reviewing this PR.

You're welcome! This is great!

]
},
"INVALID_XML_MAP_KEY_TYPE" : {
"message" : [
"Input schema <schema> can only contain STRING as a key type for a MAP."
]
},
"IN_SUBQUERY_DATA_TYPE_MISMATCH" : {
"message" : [
"The data type of one or more elements in the left hand side of an IN subquery is not compatible with the data type of the output of the subquery. Mismatched columns: [<mismatchedColumns>], left side: [<leftType>], right side: [<rightType>]."
4 changes: 4 additions & 0 deletions docs/sql-error-conditions-datatype-mismatch-error-class.md
@@ -123,6 +123,10 @@ The `<functionName>` does not support ordering on type `<dataType>`.

`<errors>`

## INVALID_XML_MAP_KEY_TYPE

Input schema `<schema>` can only contain STRING as a key type for a MAP.

## IN_SUBQUERY_DATA_TYPE_MISMATCH

The data type of one or more elements in the left hand side of an IN subquery is not compatible with the data type of the output of the subquery. Mismatched columns: [`<mismatchedColumns>`], left side: [`<leftType>`], right side: [`<rightType>`].
@@ -117,4 +117,26 @@ object ExprUtils extends QueryErrorsBase {
TypeCheckSuccess
}
}

/**
* Check if the schema is valid for XML.
*
* @param schema The schema to check.
* @return
* `TypeCheckSuccess` if the schema is valid
* `DataTypeMismatch` with an error if the schema is not valid
*/
def checkXmlSchema(schema: DataType): TypeCheckResult = {
val isInvalid = schema.existsRecursively {
case MapType(keyType, _, _) if keyType != StringType => true
case _ => false
}
if (isInvalid) {
DataTypeMismatch(
errorSubClass = "INVALID_XML_MAP_KEY_TYPE",
messageParameters = Map("schema" -> toSQLType(schema)))
} else {
TypeCheckSuccess
}
}
}
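
As a rough illustration of the check above (not part of the diff): a schema is rejected as soon as any nested `MapType` uses a non-STRING key. The struct fields and types below are made up for the example; package locations follow the Spark source.

import org.apache.spark.sql.catalyst.expressions.ExprUtils
import org.apache.spark.sql.types._

// Passes: the map is keyed by STRING.
val validSchema = new StructType()
  .add("attrs", MapType(StringType, StringType))

// Fails with DATATYPE_MISMATCH.INVALID_XML_MAP_KEY_TYPE: the nested map is
// keyed by INT, and existsRecursively finds it inside the array element type.
val invalidSchema = new StructType()
  .add("counts", ArrayType(MapType(IntegerType, LongType)))

ExprUtils.checkXmlSchema(validSchema)   // TypeCheckSuccess
ExprUtils.checkXmlSchema(invalidSchema) // DataTypeMismatch(...)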
@@ -3277,6 +3277,12 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat
messageParameters = Map("jsonSchema" -> toSQLType(schema)))
}

def invalidXmlSchema(schema: DataType): Throwable = {
new AnalysisException(
errorClass = "INVALID_XML_SCHEMA_MAP_TYPE",
messageParameters = Map("xmlSchema" -> toSQLType(schema)))
}

def tableIndexNotSupportedError(errorMessage: String): Throwable = {
new AnalysisException(
errorClass = "_LEGACY_ERROR_TEMP_1332",
@@ -22,7 +22,7 @@ org.apache.spark.sql.execution.datasources.noop.NoopDataSource
org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2
org.apache.spark.sql.execution.datasources.v2.text.TextDataSourceV2
org.apache.spark.sql.execution.datasources.xml.DefaultSource
org.apache.spark.sql.execution.datasources.xml.XMLFileFormat
org.apache.spark.sql.execution.streaming.ConsoleSinkProvider
org.apache.spark.sql.execution.streaming.sources.RateStreamProvider
org.apache.spark.sql.execution.streaming.sources.TextSocketSourceProvider
82 changes: 82 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -39,6 +39,9 @@ import org.apache.spark.sql.execution.datasources.jdbc._
import org.apache.spark.sql.execution.datasources.json.JsonUtils.checkJsonSchema
import org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Utils
import org.apache.spark.sql.execution.datasources.xml.{TextInputXMLDataSource, XmlOptions}
import org.apache.spark.sql.execution.datasources.xml.parsers.StaxXmlParser
import org.apache.spark.sql.execution.datasources.xml.util.XmlUtils.checkXmlSchema
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -537,6 +540,85 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
@scala.annotation.varargs
def csv(paths: String*): DataFrame = format("csv").load(paths : _*)

/**
* Loads an XML file and returns the result as a `DataFrame`. See the documentation on the
* other overloaded `xml()` method for more details.
*/
def xml(path: String): DataFrame = {
// This method ensures that calls that explicitly need a single argument work, see SPARK-16009
xml(Seq(path): _*)
}

/**
* Loads XML files and returns the result as a `DataFrame`.
*
* This function will go through the input once to determine the input schema if `inferSchema`
* is enabled. To avoid going through the entire data once, disable `inferSchema` option or
* specify the schema explicitly using `schema`.
*
* You can find the XML-specific options for reading XML files in
* <a href="https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option">
* Data Source Option</a> in the version you use.
*
*/
@scala.annotation.varargs
def xml(paths: String*): DataFrame = format("xml").load(paths: _*)

/**
* Loads an `RDD[String]` storing XML objects and returns the result as a `DataFrame`.
*
* Unless the schema is specified using `schema` function, this function goes through the
* input once to determine the input schema.
*
* @param xmlRDD input RDD with one XML object per record
* @since
*/
// @deprecated("Use xml(Dataset[String]) instead.", "2.2.0")
def xml(xmlRDD: RDD[String]): DataFrame = {
xml(sparkSession.createDataset(xmlRDD)(Encoders.STRING))
}

/**
* Loads a `Dataset[String]` storing XML objects and returns the result as a `DataFrame`.
*
* If the schema is not specified using `schema` function and `inferSchema` option is enabled,
* this function goes through the input once to determine the input schema.
*
* @param xmlDataset input Dataset with one XML object per record
*/
def xml(xmlDataset: Dataset[String]): DataFrame = {
val parsedOptions: XmlOptions = new XmlOptions(
extraOptions.toMap,
sparkSession.sessionState.conf.sessionLocalTimeZone,
sparkSession.sessionState.conf.columnNameOfCorruptRecord)

userSpecifiedSchema.foreach(checkXmlSchema)

val schema = userSpecifiedSchema.map {
case s if !SQLConf.get.getConf(
SQLConf.LEGACY_RESPECT_NULLABILITY_IN_TEXT_DATASET_CONVERSION) => s.asNullable
case other => other
}.getOrElse {
TextInputXMLDataSource.inferFromDataset(xmlDataset, parsedOptions)
}

ExprUtils.verifyColumnNameOfCorruptRecord(schema, parsedOptions.columnNameOfCorruptRecord)
val actualSchema =
StructType(schema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))

val parsed = xmlDataset.rdd.mapPartitions { iter =>
val rawParser = new StaxXmlParser(actualSchema, parsedOptions)
val parser = new FailureSafeParser[String](
input => rawParser.parse(input),
parsedOptions.parseMode,
schema,
parsedOptions.columnNameOfCorruptRecord)
iter.flatMap(parser.parse)
}
sparkSession.internalCreateDataFrame(parsed, schema, isStreaming = xmlDataset.isStreaming)
}
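
A minimal usage sketch for the reader entry points added in this file (illustrative only; the SparkSession, file path, and sample records are assumptions, while `rowTag` is a standard option of the XML source):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

// File-based read; the schema is inferred with one pass over the input.
val fromFiles = spark.read
  .option("rowTag", "book")   // XML element that maps to a row
  .xml("/tmp/books.xml")      // hypothetical path

// Dataset[String]-based read: one XML object per record; malformed records
// are handled by FailureSafeParser according to the configured parse mode.
val records = Seq("<book><title>Spark</title></book>").toDS()
val fromDataset = spark.read.xml(records)
fromDataset.printSchema()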

/**
* Loads a Parquet file, returning the result as a `DataFrame`. See the documentation
* on the other overloaded `parquet()` method for more details.
32 changes: 32 additions & 0 deletions sql/core/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala
@@ -850,6 +850,38 @@ final class DataFrameWriter[T] private[sql](ds: Dataset[T]) {
format("csv").save(path)
}

/**
* Saves the content of the `DataFrame` in XML format at the specified path.
* This is equivalent to:
* {{{
* format("xml").save(path)
* }}}
*
* Note that writing an XML file from a [[DataFrame]] having a field of [[ArrayType]] with
* its element as [[ArrayType]] would have an additional nested field for the element.
* For example, the [[DataFrame]] having a field below,
*
* fieldA [[data1, data2]]
*
* would produce an XML file below.
*
* <fieldA>
* <item>data1</item>
* </fieldA>
* <fieldA>
* <item>data2</item>
* </fieldA>
*
* Namely, a round trip of writing and reading can end up with a different schema structure.
*
* You can find the XML-specific options for writing XML files in
* <a href="https://spark.apache.org/docs/latest/sql-data-sources-xml.html#data-source-option">
* Data Source Option</a> in the version you use.
*/
def xml(path: String): Unit = {
format("xml").save(path)
}
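
For symmetry, a sketch of the writer side (assumes an existing DataFrame `df` and a scratch path; `rowTag` and `rootTag` are the usual XML source options):

df.write
  .option("rowTag", "book")    // element name emitted per row
  .option("rootTag", "books")  // enclosing root element of each file
  .mode("overwrite")
  .xml("/tmp/books_out")       // hypothetical path

// As the scaladoc above notes, array-of-array fields gain a nested <item>
// element, so reading the output back may yield a slightly different schema.
val roundTripped = spark.read.option("rowTag", "book").xml("/tmp/books_out")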

/**
* Wrap a DataFrameWriter action to track the QueryExecution and time cost, then report to the
* user-registered callback functions.
@@ -44,6 +44,7 @@ import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2
import org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2
import org.apache.spark.sql.execution.datasources.xml.XMLFileFormat
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.sources.{RateStreamProvider, TextSocketSourceProvider}
import org.apache.spark.sql.internal.SQLConf
@@ -567,6 +568,7 @@ object DataSource extends Logging {
private val backwardCompatibilityMap: Map[String, String] = {
val jdbc = classOf[JdbcRelationProvider].getCanonicalName
val json = classOf[JsonFileFormat].getCanonicalName
val xml = classOf[XMLFileFormat].getCanonicalName
val parquet = classOf[ParquetFileFormat].getCanonicalName
val csv = classOf[CSVFileFormat].getCanonicalName
val libsvm = "org.apache.spark.ml.source.libsvm.LibSVMFileFormat"
@@ -595,6 +597,8 @@
"org.apache.spark.ml.source.libsvm.DefaultSource" -> libsvm,
"org.apache.spark.ml.source.libsvm" -> libsvm,
"com.databricks.spark.csv" -> csv,
"com.databricks.spark.xml" -> xml,
"org.apache.spark.sql.execution.datasources.xml" -> xml,
"org.apache.spark.sql.execution.streaming.TextSocketSourceProvider" -> socket,
"org.apache.spark.sql.execution.streaming.RateSourceProvider" -> rate
)
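
The two map entries above mean the legacy package names keep resolving to the built-in format, so a sketch like the following (assuming a SparkSession `spark` and a hypothetical path) loads through `XMLFileFormat` without code changes:

val legacyStyle = spark.read
  .format("com.databricks.spark.xml")   // rewritten to XMLFileFormat
  .option("rowTag", "book")
  .load("/tmp/books.xml")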
@@ -188,7 +188,7 @@ object MultiLineCSVDataSource extends CSVDataSource {
sparkSession: SparkSession,
inputPaths: Seq[FileStatus],
parsedOptions: CSVOptions): StructType = {
val csv = createBaseRdd(sparkSession, inputPaths, parsedOptions)
csv.flatMap { lines =>
val path = new Path(lines.getPath())
UnivocityParser.tokenizeStream(

This file was deleted.
