| |
| |
| |
| |
| <!DOCTYPE html> |
| <html class="no-js" lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| |
| <title>XML Files - Spark 4.1.0-preview1 Documentation</title> |
| |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <!-- Google Fonts: the css2 API needs a separate "family=" parameter for every requested family. --> |
| <link rel="preconnect" href="https://fonts.googleapis.com"> |
| <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> |
| <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&amp;family=Courier+Prime:wght@400;700&amp;display=swap" rel="stylesheet"> |
| <link href="css/custom.css" rel="stylesheet"> |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| <link rel="stylesheet" href="css/docsearch.min.css" /> |
| <link rel="stylesheet" href="css/docsearch.css"> |
| |
| |
| <!-- Matomo analytics: queue the tracker commands, then load the tracker script asynchronously. --> |
| <script> |
| var _paq = window._paq = window._paq || []; |
| /* Commands run in queue order; tracker methods like "setCustomDimension" must be pushed before "trackPageView". */ |
| _paq.push(["disableCookies"]); |
| _paq.push(["trackPageView"]); |
| _paq.push(["enableLinkTracking"]); |
| (function () { |
|   var baseUrl = "https://analytics.apache.org/"; |
|   _paq.push(["setTrackerUrl", baseUrl + "matomo.php"]); |
|   _paq.push(["setSiteId", "40"]); |
|   var firstScript = document.getElementsByTagName("script")[0]; |
|   var tracker = document.createElement("script"); |
|   tracker.async = true; |
|   tracker.src = baseUrl + "matomo.js"; |
|   // Insert the loader ahead of the first script element found in the document. |
|   firstScript.parentNode.insertBefore(tracker, firstScript); |
| })(); |
| </script> |
| <!-- End Matomo Code --> |
| |
| |
| </head> |
| <body class="global"> |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> |
| <!-- The logo is the only content of the home link, so its alt text is the link's accessible name. --> |
| <div class="navbar-brand"><a href="index.html"> |
| <img src="https://spark.apache.org/images/spark-logo-rev.svg" alt="Apache Spark" width="141" height="72"></a><span class="version">4.1.0-preview1</span> |
| </div> |
| <button class="navbar-toggler" type="button" data-toggle="collapse" |
| data-target="#navbarCollapse" aria-controls="navbarCollapse" |
| aria-expanded="false" aria-label="Toggle navigation"> |
| <span class="navbar-toggler-icon"></span> |
| </button> |
| <div class="collapse navbar-collapse" id="navbarCollapse"> |
| <ul class="navbar-nav me-auto"> |
| <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> |
| <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> |
| <a class="dropdown-item" href="quick-start.html">Quick Start</a> |
| <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a> |
| <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> |
| <a class="dropdown-item" href="streaming/index.html">Structured Streaming</a> |
| <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> |
| <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> |
| <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> |
| <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> |
| <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> |
| <a class="dropdown-item" href="declarative-pipelines-programming-guide.html">Declarative Pipelines</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> |
| <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> |
| <a class="dropdown-item" href="api/python/index.html">Python</a> |
| <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> |
| <a class="dropdown-item" href="api/java/index.html">Java</a> |
| <a class="dropdown-item" href="api/R/index.html">R</a> |
| <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> |
| <div class="dropdown-menu" aria-labelledby="navbarDeploying"> |
| <a class="dropdown-item" href="cluster-overview.html">Overview</a> |
| <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> |
| <a class="dropdown-item" href="running-on-yarn.html">YARN</a> |
| <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> |
| </div> |
| </li> |
| |
| <li class="nav-item dropdown"> |
| <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> |
| <div class="dropdown-menu" aria-labelledby="navbarMore"> |
| <a class="dropdown-item" href="configuration.html">Configuration</a> |
| <a class="dropdown-item" href="monitoring.html">Monitoring</a> |
| <a class="dropdown-item" href="tuning.html">Tuning Guide</a> |
| <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> |
| <a class="dropdown-item" href="security.html">Security</a> |
| <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> |
| <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> |
| <div class="dropdown-divider"></div> |
| <a class="dropdown-item" href="building-spark.html">Building Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> |
| <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> |
| </div> |
| </li> |
| |
| <li class="nav-item"> |
| <!-- A placeholder is not an accessible name, so the field is labelled explicitly. --> |
| <input type="text" id="docsearch-input" placeholder="Search the docs…" aria-label="Search the docs"> |
| </li> |
| </ul> |
| <!--<span class="navbar-text navbar-right"><span class="version-text">v4.1.0-preview1</span></span>--> |
| </div> |
| </nav> |
| |
| |
| |
| <div class="container"> |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="sql-getting-started.html"> |
| |
| Getting Started |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources.html">Data Sources</a> |
| <!-- Sub-pages are nested inside the parent item: a list may only have list items as direct children. --> |
| <ul> |
| <li><a href="sql-data-sources-load-save-functions.html">Generic Load/Save Functions</a></li> |
| <li><a href="sql-data-sources-generic-options.html">Generic File Source Options</a></li> |
| <li><a href="sql-data-sources-parquet.html">Parquet Files</a></li> |
| <li><a href="sql-data-sources-orc.html">ORC Files</a></li> |
| <li><a href="sql-data-sources-json.html">JSON Files</a></li> |
| <li><a href="sql-data-sources-csv.html">CSV Files</a></li> |
| <li><a href="sql-data-sources-text.html">Text Files</a></li> |
| <li><a href="sql-data-sources-xml.html">XML Files</a></li> |
| <li><a href="sql-data-sources-hive-tables.html">Hive Tables</a></li> |
| <li><a href="sql-data-sources-jdbc.html">JDBC To Other Databases</a></li> |
| <li><a href="sql-data-sources-avro.html">Avro Files</a></li> |
| <li><a href="sql-data-sources-protobuf.html">Protobuf data</a></li> |
| <li><a href="sql-data-sources-binaryFile.html">Whole Binary Files</a></li> |
| <li><a href="sql-data-sources-troubleshooting.html">Troubleshooting</a></li> |
| </ul> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-performance-tuning.html"> |
| |
| Performance Tuning |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-distributed-sql-engine.html"> |
| |
| Distributed SQL Engine |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-pyspark-pandas-with-arrow.html"> |
| |
| PySpark Usage Guide for Pandas with Apache Arrow |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-migration-guide.html"> |
| |
| Migration Guide |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref.html"> |
| |
| SQL Reference |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-error-conditions.html"> |
| |
| Error Conditions |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar mr-3" id="content"> |
| |
| <h1 class="title">XML Files</h1> |
| |
| |
| <p>Spark SQL provides <code class="language-plaintext highlighter-rouge">spark.read().xml("file_1_path","file_2_path")</code> to read a file or directory of files in XML format into a Spark DataFrame, and <code class="language-plaintext highlighter-rouge">dataframe.write().xml("path")</code> to write to an XML file. The <code class="language-plaintext highlighter-rouge">rowTag</code> option must be specified to indicate the XML element that maps to a <code class="language-plaintext highlighter-rouge">DataFrame row</code>. The <code class="language-plaintext highlighter-rouge">option()</code> function can be used to customize the behavior of reading or writing, such as controlling the handling of XML attributes, XSD validation, compression, and so on.</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="python"> |
| <div class="highlight"><pre class="codehilite"><code><span class="c1"># Primitive types (Int, String, etc) and Product types (case classes) encoders are |
| # supported by importing this when creating a Dataset. |
| # An XML dataset is pointed to by path. |
| # The path can be either a single xml file or more xml files |
| </span><span class="n">path</span> <span class="o">=</span> <span class="sh">"</span><span class="s">examples/src/main/resources/people.xml</span><span class="sh">"</span> |
| <span class="n">peopleDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span><span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">rowTag</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">person</span><span class="sh">"</span><span class="p">).</span><span class="nf">format</span><span class="p">(</span><span class="sh">"</span><span class="s">xml</span><span class="sh">"</span><span class="p">).</span><span class="nf">load</span><span class="p">(</span><span class="n">path</span><span class="p">)</span> |
| |
| <span class="c1"># The inferred schema can be visualized using the printSchema() method |
| </span><span class="n">peopleDF</span><span class="p">.</span><span class="nf">printSchema</span><span class="p">()</span> |
| <span class="c1"># root |
| # |-- age: long (nullable = true) |
| # |-- name: string (nullable = true) |
| </span> |
| <span class="c1"># Creates a temporary view using the DataFrame |
| </span><span class="n">peopleDF</span><span class="p">.</span><span class="nf">createOrReplaceTempView</span><span class="p">(</span><span class="sh">"</span><span class="s">people</span><span class="sh">"</span><span class="p">)</span> |
| |
| <span class="c1"># SQL statements can be run by using the sql methods provided by spark |
| </span><span class="n">teenagerNamesDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="nf">sql</span><span class="p">(</span><span class="sh">"</span><span class="s">SELECT name FROM people WHERE age BETWEEN 13 AND 19</span><span class="sh">"</span><span class="p">)</span> |
| <span class="n">teenagerNamesDF</span><span class="p">.</span><span class="nf">show</span><span class="p">()</span> |
| <span class="c1"># +------+ |
| # | name| |
| # +------+ |
| # |Justin| |
| # +------+ |
| </span> |
| <span class="c1"># Alternatively, a DataFrame can be created for an XML dataset represented by a Dataset[String] |
| </span><span class="n">xmlStrings</span> <span class="o">=</span> <span class="p">[</span><span class="sh">"""</span><span class="s"> |
| &lt;person&gt; |
| &lt;name&gt;laglangyue&lt;/name&gt; |
| &lt;job&gt;Developer&lt;/job&gt; |
| &lt;age&gt;28&lt;/age&gt; |
| &lt;/person&gt; |
| </span><span class="sh">"""</span><span class="p">]</span> |
| <span class="n">xmlRDD</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sparkContext</span><span class="p">.</span><span class="nf">parallelize</span><span class="p">(</span><span class="n">xmlStrings</span><span class="p">)</span> |
| <span class="n">otherPeople</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">read</span> \ |
| <span class="p">.</span><span class="nf">option</span><span class="p">(</span><span class="sh">"</span><span class="s">rowTag</span><span class="sh">"</span><span class="p">,</span> <span class="sh">"</span><span class="s">person</span><span class="sh">"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="nf">xml</span><span class="p">(</span><span class="n">xmlRDD</span><span class="p">)</span> |
| <span class="n">otherPeople</span><span class="p">.</span><span class="nf">show</span><span class="p">()</span> |
| <span class="c1"># +---+---------+----------+ |
| # |age| job| name| |
| # +---+---------+----------+ |
| # | 28|Developer|laglangyue| |
| # +---+---------+----------+</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/sql/datasource.py" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="scala"> |
| <div class="highlight"><pre class="codehilite"><code><span class="c1">// Primitive types (Int, String, etc) and Product types (case classes) encoders are</span> |
| <span class="c1">// supported by importing this when creating a Dataset.</span> |
| <span class="k">import</span> <span class="nn">spark.implicits._</span> |
| <span class="c1">// An XML dataset is pointed to by path.</span> |
| <span class="c1">// The path can be either a single xml file or more xml files</span> |
| <span class="k">val</span> <span class="nv">path</span> <span class="k">=</span> <span class="s">"examples/src/main/resources/people.xml"</span> |
| <span class="k">val</span> <span class="nv">peopleDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span><span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"rowTag"</span><span class="o">,</span> <span class="s">"person"</span><span class="o">).</span><span class="py">xml</span><span class="o">(</span><span class="n">path</span><span class="o">)</span> |
| |
| <span class="c1">// The inferred schema can be visualized using the printSchema() method</span> |
| <span class="nv">peopleDF</span><span class="o">.</span><span class="py">printSchema</span><span class="o">()</span> |
| <span class="c1">// root</span> |
| <span class="c1">// |-- age: long (nullable = true)</span> |
| <span class="c1">// |-- name: string (nullable = true)</span> |
| |
| <span class="c1">// Creates a temporary view using the DataFrame</span> |
| <span class="nv">peopleDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">)</span> |
| |
| <span class="c1">// SQL statements can be run by using the sql methods provided by spark</span> |
| <span class="k">val</span> <span class="nv">teenagerNamesDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">sql</span><span class="o">(</span><span class="s">"SELECT name FROM people WHERE age BETWEEN 13 AND 19"</span><span class="o">)</span> |
| <span class="nv">teenagerNamesDF</span><span class="o">.</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +------+</span> |
| <span class="c1">// | name|</span> |
| <span class="c1">// +------+</span> |
| <span class="c1">// |Justin|</span> |
| <span class="c1">// +------+</span> |
| |
| <span class="c1">// Alternatively, a DataFrame can be created for an XML dataset represented by a Dataset[String]</span> |
| <span class="k">val</span> <span class="nv">otherPeopleDataset</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">createDataset</span><span class="o">(</span> |
| <span class="s">""" |
| |&lt;person&gt; |
| | &lt;name&gt;laglangyue&lt;/name&gt; |
| | &lt;job&gt;Developer&lt;/job&gt; |
| | &lt;age&gt;28&lt;/age&gt; |
| |&lt;/person&gt; |
| |"""</span><span class="o">.</span><span class="py">stripMargin</span> <span class="o">::</span> <span class="nc">Nil</span><span class="o">)</span> |
| <span class="k">val</span> <span class="nv">otherPeople</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">read</span> |
| <span class="o">.</span><span class="py">option</span><span class="o">(</span><span class="s">"rowTag"</span><span class="o">,</span> <span class="s">"person"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">xml</span><span class="o">(</span><span class="n">otherPeopleDataset</span><span class="o">)</span> |
| <span class="nv">otherPeople</span><span class="o">.</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +---+---------+----------+</span> |
| <span class="c1">// |age| job| name|</span> |
| <span class="c1">// +---+---------+----------+</span> |
| <span class="c1">// | 28|Developer|laglangyue|</span> |
| <span class="c1">// +---+---------+----------+</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/SQLDataSourceExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| <div class="highlight"><pre class="codehilite"><code><span class="c1">// Primitive types (Int, String, etc) and Product types (case classes) encoders are</span> |
| <span class="c1">// supported by importing this when creating a Dataset.</span> |
| |
| <span class="c1">// An XML dataset is pointed to by path.</span> |
| <span class="c1">// The path can be either a single xml file or more xml files</span> |
| <span class="nc">String</span> <span class="n">path</span> <span class="o">=</span> <span class="s">"examples/src/main/resources/people.xml"</span><span class="o">;</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">peopleDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">().</span><span class="na">option</span><span class="o">(</span><span class="s">"rowTag"</span><span class="o">,</span> <span class="s">"person"</span><span class="o">).</span><span class="na">xml</span><span class="o">(</span><span class="n">path</span><span class="o">);</span> |
| |
| <span class="c1">// The inferred schema can be visualized using the printSchema() method</span> |
| <span class="n">peopleDF</span><span class="o">.</span><span class="na">printSchema</span><span class="o">();</span> |
| <span class="c1">// root</span> |
| <span class="c1">// |-- age: long (nullable = true)</span> |
| <span class="c1">// |-- name: string (nullable = true)</span> |
| |
| <span class="c1">// Creates a temporary view using the DataFrame</span> |
| <span class="n">peopleDF</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"people"</span><span class="o">);</span> |
| |
| <span class="c1">// SQL statements can be run by using the sql methods provided by spark</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">teenagerNamesDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span> |
| <span class="s">"SELECT name FROM people WHERE age BETWEEN 13 AND 19"</span><span class="o">);</span> |
| <span class="n">teenagerNamesDF</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +------+</span> |
| <span class="c1">// | name|</span> |
| <span class="c1">// +------+</span> |
| <span class="c1">// |Justin|</span> |
| <span class="c1">// +------+</span> |
| |
| <span class="c1">// Alternatively, a DataFrame can be created for an XML dataset represented by a Dataset[String]</span> |
| <span class="nc">List</span><span class="o">&lt;</span><span class="nc">String</span><span class="o">&gt;</span> <span class="n">xmlData</span> <span class="o">=</span> <span class="nc">Collections</span><span class="o">.</span><span class="na">singletonList</span><span class="o">(</span> |
| <span class="s">"&lt;person&gt;"</span> <span class="o">+</span> |
| <span class="s">"&lt;name&gt;laglangyue&lt;/name&gt;&lt;job&gt;Developer&lt;/job&gt;&lt;age&gt;28&lt;/age&gt;"</span> <span class="o">+</span> |
| <span class="s">"&lt;/person&gt;"</span><span class="o">);</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">String</span><span class="o">></span> <span class="n">otherPeopleDataset</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataset</span><span class="o">(</span><span class="nc">Lists</span><span class="o">.</span><span class="na">newArrayList</span><span class="o">(</span><span class="n">xmlData</span><span class="o">),</span> |
| <span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">());</span> |
| |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">otherPeople</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">read</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">option</span><span class="o">(</span><span class="s">"rowTag"</span><span class="o">,</span> <span class="s">"person"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">xml</span><span class="o">(</span><span class="n">otherPeopleDataset</span><span class="o">);</span> |
| <span class="n">otherPeople</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +---+---------+----------+</span> |
| <span class="c1">// |age| job| name|</span> |
| <span class="c1">// +---+---------+----------+</span> |
| <span class="c1">// | 28|Developer|laglangyue|</span> |
| <span class="c1">// +---+---------+----------+</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/JavaSQLDataSourceExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| </div> |
| |
| <h2 id="data-source-option">Data Source Option</h2> |
| |
| <p>Data source options of XML can be set via:</p> |
| |
| <ul> |
| <li>the <code class="language-plaintext highlighter-rouge">.option</code>/<code class="language-plaintext highlighter-rouge">.options</code> methods of |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">DataFrameReader</code></li> |
| <li><code class="language-plaintext highlighter-rouge">DataFrameWriter</code></li> |
| <li><code class="language-plaintext highlighter-rouge">DataStreamReader</code></li> |
| <li><code class="language-plaintext highlighter-rouge">DataStreamWriter</code></li> |
| </ul> |
| </li> |
| <li>the built-in functions below |
| <ul> |
| <li><code class="language-plaintext highlighter-rouge">from_xml</code></li> |
| <li><code class="language-plaintext highlighter-rouge">to_xml</code></li> |
| <li><code class="language-plaintext highlighter-rouge">schema_of_xml</code></li> |
| </ul> |
| </li> |
| <li><code class="language-plaintext highlighter-rouge">OPTIONS</code> clause at <a href="sql-ref-syntax-ddl-create-table-datasource.html">CREATE TABLE USING DATA_SOURCE</a></li> |
| </ul> |
| |
| <table> |
| <thead><tr><th scope="col"><b>Property Name</b></th><th scope="col"><b>Default</b></th><th scope="col"><b>Meaning</b></th><th scope="col"><b>Scope</b></th></tr></thead> |
| <!-- The sample XML is entity-escaped rather than wrapped in the obsolete xmp element. --> |
| <tr> |
| <td><code>rowTag</code></td> |
| <td></td> |
| <td>The row tag of your xml files to treat as a row. For example, in this xml: |
| <code>&lt;books&gt;&lt;book&gt;&lt;/book&gt;...&lt;/books&gt;</code> |
| the appropriate value would be book. This is a required option for both read and write. |
| </td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>samplingRatio</code></td> |
| <td><code>1.0</code></td> |
| <td>Defines fraction of rows used for schema inferring. XML built-in functions ignore this option.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>excludeAttribute</code></td> |
| <td><code>false</code></td> |
| <td>Whether to exclude attributes in elements.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>mode</code></td> |
| <td><code>PERMISSIVE</code></td> |
| <td>Allows a mode for dealing with corrupt records during parsing.<br /> |
| <ul> |
| <li><code>PERMISSIVE</code>: when it meets a corrupted record, puts the malformed string into a field configured by columnNameOfCorruptRecord, and sets malformed fields to null. To keep corrupt records, a user can set a string type field named columnNameOfCorruptRecord in a user-defined schema. If a schema does not have the field, it drops corrupt records during parsing. When inferring a schema, it implicitly adds a columnNameOfCorruptRecord field in an output schema.</li> |
| <li><code>DROPMALFORMED</code>: ignores the whole corrupted records. This mode is unsupported in the XML built-in functions.</li> |
| <li><code>FAILFAST</code>: throws an exception when it meets corrupted records.</li> |
| </ul> |
| </td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>inferSchema</code></td> |
| <td><code>true</code></td> |
| <td>If true, attempts to infer an appropriate type for each resulting DataFrame column. If false, all resulting columns are of string type.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>columnNameOfCorruptRecord</code></td> |
| <td><code>spark.sql.columnNameOfCorruptRecord</code></td> |
| <td>Allows renaming the new field having a malformed string created by PERMISSIVE mode.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>attributePrefix</code></td> |
| <td><code>_</code></td> |
| <td>The prefix for attributes to differentiate attributes from elements. This will be the prefix for field names. Can be empty for reading XML, but not for writing.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>valueTag</code></td> |
| <td><code>_VALUE</code></td> |
| <td>The tag used for the value when there are attributes in the element having no child.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>encoding</code></td> |
| <td><code>UTF-8</code></td> |
| <td>For reading, decodes the XML files by the given encoding type. For writing, specifies encoding (charset) of saved XML files. XML built-in functions ignore this option. </td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>ignoreSurroundingSpaces</code></td> |
| <td><code>true</code></td> |
| <td>Defines whether surrounding whitespaces from values being read should be skipped.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>rowValidationXSDPath</code></td> |
| <td><code>null</code></td> |
| <td>Path to an optional XSD file that is used to validate the XML for each row individually. Rows that fail to validate are treated like parse errors as above. The XSD does not otherwise affect the schema provided, or inferred.</td> |
| <td>read</td> |
| </tr> |
| |
| <!-- Example tag names are entity-escaped so they render as text instead of being parsed as elements. --> |
| <tr> |
| <td><code>ignoreNamespace</code></td> |
| <td><code>false</code></td> |
| <td>If true, namespace prefixes on XML elements and attributes are ignored. Tags <code>&lt;abc:author&gt;</code> and <code>&lt;def:author&gt;</code> would, for example, be treated as if both are just <code>&lt;author&gt;</code>. Note that, at the moment, namespaces cannot be ignored on the rowTag element, only its children. Note that XML parsing is in general not namespace-aware even if false.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>timeZone</code></td> |
| <td>(value of <code>spark.sql.session.timeZone</code> configuration)</td> |
| <td>Sets the string that indicates a time zone ID to be used to format timestamps in the XML datasources or partition values. The following formats of timeZone are supported:<br /> |
| <ul> |
| <li>Region-based zone ID: It should have the form 'area/city', such as 'America/Los_Angeles'.</li> |
| <li>Zone offset: It should be in the format '(+|-)HH:mm', for example '-08:00' or '+01:00', also 'UTC' and 'Z' are supported as aliases of '+00:00'.</li> |
| </ul> |
| Other short names like 'CST' are not recommended to use because they can be ambiguous. |
| </td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>timestampFormat</code></td> |
| <td><code>yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]</code></td> |
| <td>Sets the string that indicates a timestamp format. Custom date formats follow the formats at <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>. This applies to timestamp type.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>timestampNTZFormat</code></td> |
| <td><code>yyyy-MM-dd'T'HH:mm:ss[.SSS]</code></td> |
| <td>Sets the string that indicates a timestamp without timezone format. Custom date formats follow the formats at <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>. This applies to timestamp without timezone type, note that zone-offset and time-zone components are not supported when writing or reading this data type.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>dateFormat</code></td> |
| <td><code>yyyy-MM-dd</code></td> |
| <td>Sets the string that indicates a date format. Custom date formats follow the formats at <a href="https://spark.apache.org/docs/latest/sql-ref-datetime-pattern.html">Datetime Patterns</a>. This applies to date type.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>locale</code></td> |
| <td><code>en-US</code></td> |
| <td>Sets a locale as a language tag in IETF BCP 47 format. For instance, locale is used while parsing dates and timestamps. </td> |
| <td>read/write</td> |
| </tr> |
| |
| <!-- Sample XML and the attribute example are entity-escaped text inside code elements (no obsolete xmp). --> |
| <tr> |
| <td><code>rootTag</code></td> |
| <td><code>ROWS</code></td> |
| <td>Root tag of the xml files. For example, in this xml: |
| <code>&lt;books&gt;&lt;book&gt;&lt;/book&gt;...&lt;/books&gt;</code> |
| the appropriate value would be books. It can include basic attributes by specifying a value like <code>books foo="bar"</code>. |
| </td> |
| <td>write</td> |
| </tr> |
| |
| <!-- The sample declaration is entity-escaped; unescaped, it is parsed as a bogus comment and not displayed. --> |
| <tr> |
| <td><code>declaration</code></td> |
| <td><code>version="1.0" encoding="UTF-8" standalone="yes"</code></td> |
| <td>Content of XML declaration to write at the start of every output XML file, before the rootTag. For example, a value of <code>foo</code> causes <code>&lt;?xml foo?&gt;</code> to be written. Set to empty string to suppress.</td> |
| <td>write</td> |
| </tr> |
| |
| <tr> |
| <td><code>arrayElementName</code></td> |
| <td><code>item</code></td> |
| <td>Name of XML element that encloses each element of an array-valued column when writing.</td> |
| <td>write</td> |
| </tr> |
| |
| <tr> |
| <td><code>nullValue</code></td> |
| <td><code>null</code></td> |
| <td>Sets the string representation of a null value. Default is string null. When this is null, it does not write attributes and elements for fields.</td> |
| <td>read/write</td> |
| </tr> |
| |
| <tr> |
| <td><code>wildcardColName</code></td> |
| <td><code>xs_any</code></td> |
| <td>Name of a column existing in the provided schema which is interpreted as a 'wildcard'. It must have type string or array of strings. It will match any XML child element that is not otherwise matched by the schema. The XML of the child becomes the string value of the column. If an array, then all unmatched elements will be returned as an array of strings. As its name implies, it is meant to emulate XSD's xs:any type.</td> |
| <td>read</td> |
| </tr> |
| |
| <tr> |
| <td><code>compression</code></td> |
| <td><code>none</code></td> |
| <td>Compression codec to use when saving to file. This can be one of the known case-insensitive shortened names (none, bzip2, gzip, lz4, snappy and deflate). XML built-in functions ignore this option.</td> |
| <td>write</td> |
| </tr> |
| |
| <tr> |
| <td><code>validateName</code></td> |
| <td><code>true</code></td> |
| <td>If true, throws error on XML element name validation failure. For example, SQL field names can have spaces, but XML element names cannot.</td> |
| <td>write</td> |
| </tr> |
| |
| </table> |
| <p>Other generic options can be found in <a href="https://spark.apache.org/docs/latest/sql-data-sources-generic-options.html">Generic File Source Options</a>.</p> |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <!-- Vendor libraries and the site script. None of these tags use async/defer, so they load in document order. --> |
| <script src="js/vendor/jquery-3.5.1.min.js"></script> |
| <script src="js/vendor/bootstrap.bundle.min.js"></script> |
| |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- DocSearch library; presumably defines the docsearch() function called by the inline script that follows (verify). --> |
| <script type="text/javascript" src="js/vendor/docsearch.min.js"></script> |
| <script type="text/javascript"> |
| // DocSearch is entirely free and automated. It is built in two parts: |
| //   1. A crawler, run on our own infrastructure every 24 hours, that follows every link |
| //      in the website, extracts the content of every page it traverses, and pushes that |
| //      content to an Algolia index. |
| //   2. This JavaScript snippet, which binds the Algolia index to the search input and |
| //      displays its results in a dropdown UI. |
| // For more details on how DocSearch works, check the DocSearch docs. |
| (function () { |
|   var searchOptions = { |
|     apiKey: 'd62f962a82bc9abb53471cb7b89da35e', |
|     appId: 'RAI69RXRSK', |
|     indexName: 'apache_spark', |
|     inputSelector: '#docsearch-input', |
|     enhancedSearchInput: true, |
|     algoliaOptions: { |
|       facetFilters: ["version:4.1.0-preview1"] |
|     }, |
|     // Set debug to true if you want to inspect the dropdown. |
|     debug: false |
|   }; |
|   docsearch(searchOptions); |
| })(); |
| </script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| // Set the TeX equation auto-numbering mode to "AMS". |
| MathJax.Hub.Config({ |
|   TeX: { |
|     equationNumbers: { |
|       autoNumber: "AMS" |
|     } |
|   } |
| }); |
| </script> |
| <script> |
| // Load MathJax asynchronously from the CDN by appending a <script> element to <head>. |
| // An absolute "https://" URL is used instead of a protocol-relative "//cdn.mathjax..." one: |
| // it works for pages opened from a local file (file://) as well as over HTTP and HTTPS, |
| // and the script is never fetched over plain HTTP. |
| (function (d) { |
|   var script = d.createElement('script'); |
|   script.async = true; |
|   // Apply the tex2jax settings once MathJax.js itself has finished loading. |
|   script.onload = function () { |
|     MathJax.Hub.Config({ |
|       tex2jax: { |
|         // NOTE(review): "\\\\(" evaluates to a delimiter with two backslashes, whereas |
|         // displayMath below uses a single backslash ("\\[") - confirm this is intended. |
|         inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
|         displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
|         processEscapes: true, |
|         skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
|       } |
|     }); |
|   }; |
|   script.src = 'https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
|     '?config=TeX-AMS-MML_HTMLorMML'; |
|   d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |