| <!DOCTYPE html> | 
 | <html class="no-js"> | 
 |     <head> | 
 |         <meta charset="utf-8"> | 
 |         <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> | 
 |         <meta name="viewport" content="width=device-width, initial-scale=1.0"> | 
 |  | 
 |         <title>Hive Tables - Spark 4.0.0-preview1 Documentation</title> | 
 |          | 
 |  | 
 |          | 
 |  | 
 |  | 
 |         <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/css/bootstrap.min.css" rel="stylesheet" | 
 |         integrity="sha384-EVSTQN3/azprG1Anm3QDgpJLIm9Nao0Yz1ztcQTwFspd3yD65VohhpuuCOmLASjC" crossorigin="anonymous"> | 
 |         <link rel="preconnect" href="https://fonts.googleapis.com"> | 
 |         <link rel="preconnect" href="https://fonts.gstatic.com" crossorigin> | 
        <link href="https://fonts.googleapis.com/css2?family=DM+Sans:ital,wght@0,400;0,500;0,700;1,400;1,500;1,700&family=Courier+Prime:wght@400;700&display=swap" rel="stylesheet">
 |         <link href="css/custom.css" rel="stylesheet"> | 
 |         <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> | 
 |  | 
 |         <link rel="stylesheet" href="css/pygments-default.css"> | 
 |         <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" /> | 
 |         <link rel="stylesheet" href="css/docsearch.css"> | 
 |  | 
 |          | 
 |         <!-- Matomo --> | 
 |         <script> | 
 |             var _paq = window._paq = window._paq || []; | 
 |             /* tracker methods like "setCustomDimension" should be called before "trackPageView" */ | 
 |             _paq.push(["disableCookies"]); | 
 |             _paq.push(['trackPageView']); | 
 |             _paq.push(['enableLinkTracking']); | 
 |             (function() { | 
 |               var u="https://analytics.apache.org/"; | 
 |               _paq.push(['setTrackerUrl', u+'matomo.php']); | 
 |               _paq.push(['setSiteId', '40']); | 
 |               var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0]; | 
 |               g.async=true; g.src=u+'matomo.js'; s.parentNode.insertBefore(g,s); | 
 |             })(); | 
 |         </script> | 
 |         <!-- End Matomo Code --> | 
 |          | 
 |  | 
 |     </head> | 
 |     <body class="global"> | 
 |         <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> | 
 |         <nav class="navbar navbar-expand-lg navbar-dark p-0 px-4 fixed-top" style="background: #1d6890;" id="topbar"> | 
 |             <div class="navbar-brand"><a href="index.html"> | 
 |                 <img src="img/spark-logo-rev.svg" width="141" height="72"/></a><span class="version">4.0.0-preview1</span> | 
 |             </div> | 
 |             <button class="navbar-toggler" type="button" data-toggle="collapse" | 
 |                     data-target="#navbarCollapse" aria-controls="navbarCollapse" | 
 |                     aria-expanded="false" aria-label="Toggle navigation"> | 
 |                 <span class="navbar-toggler-icon"></span> | 
 |             </button> | 
 |             <div class="collapse navbar-collapse" id="navbarCollapse"> | 
 |                 <ul class="navbar-nav me-auto"> | 
 |                     <li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarQuickStart"> | 
 |                             <a class="dropdown-item" href="quick-start.html">Quick Start</a> | 
                            <a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcast Vars</a>
 |                             <a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a> | 
 |                             <a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a> | 
 |                             <a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a> | 
 |                             <a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a> | 
 |                             <a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a> | 
 |                             <a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a> | 
 |                             <a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarAPIDocs"> | 
 |                             <a class="dropdown-item" href="api/python/index.html">Python</a> | 
 |                             <a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a> | 
 |                             <a class="dropdown-item" href="api/java/index.html">Java</a> | 
 |                             <a class="dropdown-item" href="api/R/index.html">R</a> | 
 |                             <a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarDeploying"> | 
 |                             <a class="dropdown-item" href="cluster-overview.html">Overview</a> | 
 |                             <a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a> | 
 |                             <div class="dropdown-divider"></div> | 
 |                             <a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a> | 
 |                             <a class="dropdown-item" href="running-on-yarn.html">YARN</a> | 
 |                             <a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item dropdown"> | 
 |                         <a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a> | 
 |                         <div class="dropdown-menu" aria-labelledby="navbarMore"> | 
 |                             <a class="dropdown-item" href="configuration.html">Configuration</a> | 
 |                             <a class="dropdown-item" href="monitoring.html">Monitoring</a> | 
 |                             <a class="dropdown-item" href="tuning.html">Tuning Guide</a> | 
 |                             <a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a> | 
 |                             <a class="dropdown-item" href="security.html">Security</a> | 
 |                             <a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a> | 
 |                             <a class="dropdown-item" href="migration-guide.html">Migration Guide</a> | 
 |                             <div class="dropdown-divider"></div> | 
 |                             <a class="dropdown-item" href="building-spark.html">Building Spark</a> | 
 |                             <a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a> | 
 |                             <a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a> | 
 |                         </div> | 
 |                     </li> | 
 |  | 
 |                     <li class="nav-item"> | 
 |                         <input type="text" id="docsearch-input" placeholder="Search the docs…"> | 
 |                     </li> | 
 |                 </ul> | 
 |                 <!--<span class="navbar-text navbar-right"><span class="version-text">v4.0.0-preview1</span></span>--> | 
 |             </div> | 
 |         </nav> | 
 |  | 
 |          | 
 |  | 
 |         <div class="container"> | 
 |  | 
 |              | 
 |                  | 
 |                     <div class="left-menu-wrapper"> | 
 |     <div class="left-menu"> | 
 |         <h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3> | 
 |          | 
 | <ul> | 
 |  | 
 |     <li> | 
 |         <a href="sql-getting-started.html"> | 
 |              | 
 |                 Getting Started | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources.html"> | 
 |              | 
 |                 Data Sources | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |          | 
 | <ul> | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-load-save-functions.html"> | 
 |              | 
 |                 Generic Load/Save Functions | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-generic-options.html"> | 
 |              | 
 |                 Generic File Source Options | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-parquet.html"> | 
 |              | 
 |                 Parquet Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-orc.html"> | 
 |              | 
 |                 ORC Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-json.html"> | 
 |              | 
 |                 JSON Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-csv.html"> | 
 |              | 
 |                 CSV Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-text.html"> | 
 |              | 
 |                 Text Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-xml.html"> | 
 |              | 
 |                 XML Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-hive-tables.html"> | 
 |              | 
 |                 Hive Tables | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-jdbc.html"> | 
 |              | 
 |                 JDBC To Other Databases | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-avro.html"> | 
 |              | 
 |                 Avro Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-protobuf.html"> | 
 |              | 
 |                 Protobuf data | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-binaryFile.html"> | 
 |              | 
 |                 Whole Binary Files | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-data-sources-troubleshooting.html"> | 
 |              | 
 |                 Troubleshooting | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 | </ul> | 
 |  | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-performance-tuning.html"> | 
 |              | 
 |                 Performance Tuning | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-distributed-sql-engine.html"> | 
 |              | 
 |                 Distributed SQL Engine | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-pyspark-pandas-with-arrow.html"> | 
 |              | 
 |                 PySpark Usage Guide for Pandas with Apache Arrow | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-migration-guide.html"> | 
 |              | 
 |                 Migration Guide | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-ref.html"> | 
 |              | 
 |                 SQL Reference | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 |     <li> | 
 |         <a href="sql-error-conditions.html"> | 
 |              | 
 |                 Error Conditions | 
 |              | 
 |         </a> | 
 |     </li> | 
 |      | 
 |      | 
 |  | 
 | </ul> | 
 |  | 
 |     </div> | 
 | </div> | 
 |                  | 
 |                 <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> | 
 |                 <label for="nav-trigger"></label> | 
 |                 <div class="content-with-sidebar mr-3" id="content"> | 
 |                      | 
 |                         <h1 class="title">Hive Tables</h1> | 
 |                      | 
 |  | 
 |                     <ul id="markdown-toc"> | 
 |   <li><a href="#specifying-storage-format-for-hive-tables" id="markdown-toc-specifying-storage-format-for-hive-tables">Specifying storage format for Hive tables</a></li> | 
 |   <li><a href="#interacting-with-different-versions-of-hive-metastore" id="markdown-toc-interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</a></li> | 
 | </ul> | 
 |  | 
 | <p>Spark SQL also supports reading and writing data stored in <a href="http://hive.apache.org/">Apache Hive</a>. | 
 | However, since Hive has a large number of dependencies, these dependencies are not included in the | 
 | default Spark distribution. If Hive dependencies can be found on the classpath, Spark will load them | 
 | automatically. Note that these Hive dependencies must also be present on all of the worker nodes, as | 
 | they will need access to the Hive serialization and deserialization libraries (SerDes) in order to | 
 | access data stored in Hive.</p> | 
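
<p>For example, with a Scala build one common way to put these classes on the classpath is to depend on the
<code class="language-plaintext highlighter-rouge">spark-hive</code> module. A minimal sketch of an sbt dependency follows; the version string is illustrative and
should match your Spark distribution:</p>

<div class="highlight"><pre class="codehilite"><code>// build.sbt (sketch): pull in Spark's Hive integration classes.
// "provided" assumes the cluster's Spark distribution supplies them at runtime.
libraryDependencies += "org.apache.spark" %% "spark-hive" % "4.0.0-preview1" % "provided"</code></pre></div>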
 |  | 
<p>Configuration of Hive is done by placing your <code class="language-plaintext highlighter-rouge">hive-site.xml</code>, <code class="language-plaintext highlighter-rouge">core-site.xml</code> (for security configuration),
and <code class="language-plaintext highlighter-rouge">hdfs-site.xml</code> (for HDFS configuration) files in <code class="language-plaintext highlighter-rouge">conf/</code>.</p>
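
<p>Equivalently, these settings can be supplied programmatically when the session is built. The sketch below
assumes an existing metastore at <code class="language-plaintext highlighter-rouge">thrift://metastore-host:9083</code> (an illustrative address, not a default)
and mirrors the <code class="language-plaintext highlighter-rouge">hive.metastore.uris</code> property of <code class="language-plaintext highlighter-rouge">hive-site.xml</code>:</p>

<div class="highlight"><pre class="codehilite"><code>import org.apache.spark.sql.SparkSession

// A minimal sketch: connect to an existing Hive metastore without a
// hive-site.xml. Host and port are assumptions for illustration.
val spark = SparkSession.builder()
  .appName("Hive metastore connection sketch")
  .config("hive.metastore.uris", "thrift://metastore-host:9083")
  .enableHiveSupport()
  .getOrCreate()</code></pre></div>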
 |  | 
<p>When working with Hive, one must instantiate <code class="language-plaintext highlighter-rouge">SparkSession</code> with Hive support, including
connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions.
Users who do not have an existing Hive deployment can still enable Hive support. When not configured
by <code class="language-plaintext highlighter-rouge">hive-site.xml</code>, the context automatically creates <code class="language-plaintext highlighter-rouge">metastore_db</code> in the current directory and
creates a directory configured by <code class="language-plaintext highlighter-rouge">spark.sql.warehouse.dir</code>, which defaults to the directory
<code class="language-plaintext highlighter-rouge">spark-warehouse</code> in the current directory in which the Spark application is started. Note that
the <code class="language-plaintext highlighter-rouge">hive.metastore.warehouse.dir</code> property in <code class="language-plaintext highlighter-rouge">hive-site.xml</code> has been deprecated since Spark 2.0.0.
Instead, use <code class="language-plaintext highlighter-rouge">spark.sql.warehouse.dir</code> to specify the default location of databases in the warehouse.
You may need to grant write privileges to the user who starts the Spark application.</p>
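
<p>As a quick sanity check, the effective warehouse location can be read back from the runtime configuration
once the session exists; a one-line sketch:</p>

<div class="highlight"><pre class="codehilite"><code>// Prints the warehouse directory actually in effect for this session.
println(spark.conf.get("spark.sql.warehouse.dir"))</code></pre></div>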
 |  | 
 | <div class="codetabs"> | 
 |  | 
 | <div data-lang="python"> | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">os.path</span> <span class="kn">import</span> <span class="n">abspath</span> | 
 |  | 
 | <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> | 
 | <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span> | 
 |  | 
 | <span class="c1"># warehouse_location points to the default location for managed databases and tables | 
 | </span><span class="n">warehouse_location</span> <span class="o">=</span> <span class="n">abspath</span><span class="p">(</span><span class="s">'spark-warehouse'</span><span class="p">)</span> | 
 |  | 
 | <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span> \ | 
 |     <span class="p">.</span><span class="n">builder</span> \ | 
 |     <span class="p">.</span><span class="n">appName</span><span class="p">(</span><span class="s">"Python Spark SQL Hive integration example"</span><span class="p">)</span> \ | 
 |     <span class="p">.</span><span class="n">config</span><span class="p">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="p">,</span> <span class="n">warehouse_location</span><span class="p">)</span> \ | 
 |     <span class="p">.</span><span class="n">enableHiveSupport</span><span class="p">()</span> \ | 
 |     <span class="p">.</span><span class="n">getOrCreate</span><span class="p">()</span> | 
 |  | 
 | <span class="c1"># spark is an existing SparkSession | 
 | </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="p">)</span> | 
 | <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Queries are expressed in HiveQL | 
 | </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM src"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> | 
 | <span class="c1"># +---+-------+ | 
 | # |key|  value| | 
 | # +---+-------+ | 
 | # |238|val_238| | 
 | # | 86| val_86| | 
 | # |311|val_311| | 
 | # ... | 
 | </span> | 
 | <span class="c1"># Aggregation queries are also supported. | 
 | </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> | 
 | <span class="c1"># +--------+ | 
 | # |count(1)| | 
 | # +--------+ | 
 | # |    500 | | 
 | # +--------+ | 
 | </span> | 
 | <span class="c1"># The results of SQL queries are themselves DataFrames and support all normal functions. | 
 | </span><span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># The items in DataFrames are of type Row, which allows you to access each column by ordinal. | 
 | </span><span class="n">stringsDS</span> <span class="o">=</span> <span class="n">sqlDF</span><span class="p">.</span><span class="n">rdd</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="s">"Key: %d, Value: %s"</span> <span class="o">%</span> <span class="p">(</span><span class="n">row</span><span class="p">.</span><span class="n">key</span><span class="p">,</span> <span class="n">row</span><span class="p">.</span><span class="n">value</span><span class="p">))</span> | 
 | <span class="k">for</span> <span class="n">record</span> <span class="ow">in</span> <span class="n">stringsDS</span><span class="p">.</span><span class="n">collect</span><span class="p">():</span> | 
 |     <span class="k">print</span><span class="p">(</span><span class="n">record</span><span class="p">)</span> | 
 | <span class="c1"># Key: 0, Value: val_0 | 
 | # Key: 0, Value: val_0 | 
 | # Key: 0, Value: val_0 | 
 | # ... | 
 | </span> | 
 | <span class="c1"># You can also use DataFrames to create temporary views within a SparkSession. | 
 | </span><span class="n">Record</span> <span class="o">=</span> <span class="n">Row</span><span class="p">(</span><span class="s">"key"</span><span class="p">,</span> <span class="s">"value"</span><span class="p">)</span> | 
 | <span class="n">recordsDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">([</span><span class="n">Record</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="s">"val_"</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">101</span><span class="p">)])</span> | 
 | <span class="n">recordsDF</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"records"</span><span class="p">)</span> | 
 |  | 
 | <span class="c1"># Queries can then join DataFrame data with data stored in Hive. | 
 | </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> | 
 | <span class="c1"># +---+------+---+------+ | 
 | # |key| value|key| value| | 
 | # +---+------+---+------+ | 
 | # |  2| val_2|  2| val_2| | 
 | # |  4| val_4|  4| val_4| | 
 | # |  5| val_5|  5| val_5| | 
 | # ...</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/python/sql/hive.py" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | <div data-lang="scala"> | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">java.io.File</span> | 
 |  | 
 | <span class="k">import</span> <span class="nn">org.apache.spark.sql.</span><span class="o">{</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">SaveMode</span><span class="o">,</span> <span class="nc">SparkSession</span><span class="o">}</span> | 
 |  | 
 | <span class="k">case</span> <span class="k">class</span> <span class="nc">Record</span><span class="o">(</span><span class="n">key</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> <span class="n">value</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// warehouseLocation points to the default location for managed databases and tables</span> | 
 | <span class="k">val</span> <span class="nv">warehouseLocation</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">File</span><span class="o">(</span><span class="s">"spark-warehouse"</span><span class="o">).</span><span class="py">getAbsolutePath</span> | 
 |  | 
 | <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nc">SparkSession</span> | 
 |   <span class="o">.</span><span class="py">builder</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="py">appName</span><span class="o">(</span><span class="s">"Spark Hive Example"</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="py">config</span><span class="o">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="o">,</span> <span class="n">warehouseLocation</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="py">enableHiveSupport</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="py">getOrCreate</span><span class="o">()</span> | 
 |  | 
 | <span class="k">import</span> <span class="nn">spark.implicits._</span> | 
 | <span class="k">import</span> <span class="nn">spark.sql</span> | 
 |  | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="o">)</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Queries are expressed in HiveQL</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM src"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |key|  value|</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |238|val_238|</span> | 
 | <span class="c1">// | 86| val_86|</span> | 
 | <span class="c1">// |311|val_311|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// Aggregation queries are also supported.</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +--------+</span> | 
 | <span class="c1">// |count(1)|</span> | 
 | <span class="c1">// +--------+</span> | 
 | <span class="c1">// |    500 |</span> | 
 | <span class="c1">// +--------+</span> | 
 |  | 
 | <span class="c1">// The results of SQL queries are themselves DataFrames and support all normal functions.</span> | 
 | <span class="k">val</span> <span class="nv">sqlDF</span> <span class="k">=</span> <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// The items in DataFrames are of type Row, which allows you to access each column by ordinal.</span> | 
 | <span class="k">val</span> <span class="nv">stringsDS</span> <span class="k">=</span> <span class="nv">sqlDF</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> | 
 |   <span class="k">case</span> <span class="nc">Row</span><span class="o">(</span><span class="n">key</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> <span class="n">value</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> <span class="k">=></span> <span class="n">s</span><span class="s">"Key: $key, Value: $value"</span> | 
 | <span class="o">}</span> | 
 | <span class="nv">stringsDS</span><span class="o">.</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +--------------------+</span> | 
 | <span class="c1">// |               value|</span> | 
 | <span class="c1">// +--------------------+</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// You can also use DataFrames to create temporary views within a SparkSession.</span> | 
 | <span class="k">val</span> <span class="nv">recordsDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">createDataFrame</span><span class="o">((</span><span class="mi">1</span> <span class="n">to</span> <span class="mi">100</span><span class="o">).</span><span class="py">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=></span> <span class="nc">Record</span><span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">s</span><span class="s">"val_$i"</span><span class="o">)))</span> | 
 | <span class="nv">recordsDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"records"</span><span class="o">)</span> | 
 |  | 
 | <span class="c1">// Queries can then join DataFrame data with data stored in Hive.</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +---+------+---+------+</span> | 
 | <span class="c1">// |key| value|key| value|</span> | 
 | <span class="c1">// +---+------+---+------+</span> | 
 | <span class="c1">// |  2| val_2|  2| val_2|</span> | 
 | <span class="c1">// |  4| val_4|  4| val_4|</span> | 
 | <span class="c1">// |  5| val_5|  5| val_5|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// Create a Hive managed Parquet table, with HQL syntax instead of the Spark SQL native syntax</span> | 
 | <span class="c1">// `USING hive`</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"CREATE TABLE hive_records(key int, value string) STORED AS PARQUET"</span><span class="o">)</span> | 
 | <span class="c1">// Save DataFrame to the Hive managed table</span> | 
 | <span class="k">val</span> <span class="nv">df</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">table</span><span class="o">(</span><span class="s">"src"</span><span class="o">)</span> | 
 | <span class="nv">df</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">mode</span><span class="o">(</span><span class="nv">SaveMode</span><span class="o">.</span><span class="py">Overwrite</span><span class="o">).</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"hive_records"</span><span class="o">)</span> | 
 | <span class="c1">// After insertion, the Hive managed table has data now</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_records"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |key|  value|</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |238|val_238|</span> | 
 | <span class="c1">// | 86| val_86|</span> | 
 | <span class="c1">// |311|val_311|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// Prepare a Parquet data directory</span> | 
 | <span class="k">val</span> <span class="nv">dataDir</span> <span class="k">=</span> <span class="s">"/tmp/parquet_data"</span> | 
 | <span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="py">write</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="n">dataDir</span><span class="o">)</span> | 
 | <span class="c1">// Create a Hive external Parquet table</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="n">s</span><span class="s">"CREATE EXTERNAL TABLE hive_bigints(id bigint) STORED AS PARQUET LOCATION '$dataDir'"</span><span class="o">)</span> | 
 | <span class="c1">// The Hive external table should already have data</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_bigints"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +---+</span> | 
 | <span class="c1">// | id|</span> | 
 | <span class="c1">// +---+</span> | 
 | <span class="c1">// |  0|</span> | 
 | <span class="c1">// |  1|</span> | 
 | <span class="c1">// |  2|</span> | 
 | <span class="c1">// ... Order may vary, as spark processes the partitions in parallel.</span> | 
 |  | 
 | <span class="c1">// Turn on flag for Hive Dynamic Partitioning</span> | 
 | <span class="nv">spark</span><span class="o">.</span><span class="py">sqlContext</span><span class="o">.</span><span class="py">setConf</span><span class="o">(</span><span class="s">"hive.exec.dynamic.partition"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span> | 
 | <span class="nv">spark</span><span class="o">.</span><span class="py">sqlContext</span><span class="o">.</span><span class="py">setConf</span><span class="o">(</span><span class="s">"hive.exec.dynamic.partition.mode"</span><span class="o">,</span> <span class="s">"nonstrict"</span><span class="o">)</span> | 
 | <span class="c1">// Create a Hive partitioned table using DataFrame API</span> | 
 | <span class="nv">df</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">partitionBy</span><span class="o">(</span><span class="s">"key"</span><span class="o">).</span><span class="py">format</span><span class="o">(</span><span class="s">"hive"</span><span class="o">).</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"hive_part_tbl"</span><span class="o">)</span> | 
 | <span class="c1">// Partitioned column `key` will be moved to the end of the schema.</span> | 
 | <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_part_tbl"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> | 
 | <span class="c1">// +-------+---+</span> | 
 | <span class="c1">// |  value|key|</span> | 
 | <span class="c1">// +-------+---+</span> | 
 | <span class="c1">// |val_238|238|</span> | 
 | <span class="c1">// | val_86| 86|</span> | 
 | <span class="c1">// |val_311|311|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="nv">spark</span><span class="o">.</span><span class="py">stop</span><span class="o">()</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | <div data-lang="java"> | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.io.File</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">java.io.Serializable</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> | 
 |  | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.MapFunction</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoders</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> | 
 | <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> | 
 |  | 
 | <span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Record</span> <span class="kd">implements</span> <span class="nc">Serializable</span> <span class="o">{</span> | 
 |   <span class="kd">private</span> <span class="kt">int</span> <span class="n">key</span><span class="o">;</span> | 
 |   <span class="kd">private</span> <span class="nc">String</span> <span class="n">value</span><span class="o">;</span> | 
 |  | 
 |   <span class="kd">public</span> <span class="kt">int</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> | 
 |     <span class="k">return</span> <span class="n">key</span><span class="o">;</span> | 
 |   <span class="o">}</span> | 
 |  | 
 |   <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setKey</span><span class="o">(</span><span class="kt">int</span> <span class="n">key</span><span class="o">)</span> <span class="o">{</span> | 
 |     <span class="k">this</span><span class="o">.</span><span class="na">key</span> <span class="o">=</span> <span class="n">key</span><span class="o">;</span> | 
 |   <span class="o">}</span> | 
 |  | 
 |   <span class="kd">public</span> <span class="nc">String</span> <span class="nf">getValue</span><span class="o">()</span> <span class="o">{</span> | 
 |     <span class="k">return</span> <span class="n">value</span><span class="o">;</span> | 
 |   <span class="o">}</span> | 
 |  | 
 |   <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setValue</span><span class="o">(</span><span class="nc">String</span> <span class="n">value</span><span class="o">)</span> <span class="o">{</span> | 
 |     <span class="k">this</span><span class="o">.</span><span class="na">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">;</span> | 
 |   <span class="o">}</span> | 
 | <span class="o">}</span> | 
 |  | 
 | <span class="c1">// warehouseLocation points to the default location for managed databases and tables</span> | 
 | <span class="nc">String</span> <span class="n">warehouseLocation</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">File</span><span class="o">(</span><span class="s">"spark-warehouse"</span><span class="o">).</span><span class="na">getAbsolutePath</span><span class="o">();</span> | 
 | <span class="nc">SparkSession</span> <span class="n">spark</span> <span class="o">=</span> <span class="nc">SparkSession</span> | 
 |   <span class="o">.</span><span class="na">builder</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="na">appName</span><span class="o">(</span><span class="s">"Java Spark Hive Example"</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="na">config</span><span class="o">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="o">,</span> <span class="n">warehouseLocation</span><span class="o">)</span> | 
 |   <span class="o">.</span><span class="na">enableHiveSupport</span><span class="o">()</span> | 
 |   <span class="o">.</span><span class="na">getOrCreate</span><span class="o">();</span> | 
 |  | 
 | <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="o">);</span> | 
 | <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Queries are expressed in HiveQL</span> | 
 | <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM src"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |key|  value|</span> | 
 | <span class="c1">// +---+-------+</span> | 
 | <span class="c1">// |238|val_238|</span> | 
 | <span class="c1">// | 86| val_86|</span> | 
 | <span class="c1">// |311|val_311|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// Aggregation queries are also supported.</span> | 
 | <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> | 
 | <span class="c1">// +--------+</span> | 
 | <span class="c1">// |count(1)|</span> | 
 | <span class="c1">// +--------+</span> | 
 | <span class="c1">// |    500 |</span> | 
 | <span class="c1">// +--------+</span> | 
 |  | 
 | <span class="c1">// The results of SQL queries are themselves DataFrames and support all normal functions.</span> | 
 | <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="o">);</span> | 
 |  | 
<span class="c1">// The items in DataFrames are of type Row, which lets you access each column by ordinal.</span>
 | <span class="nc">Dataset</span><span class="o"><</span><span class="nc">String</span><span class="o">></span> <span class="n">stringsDS</span> <span class="o">=</span> <span class="n">sqlDF</span><span class="o">.</span><span class="na">map</span><span class="o">(</span> | 
 |     <span class="o">(</span><span class="nc">MapFunction</span><span class="o"><</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">>)</span> <span class="n">row</span> <span class="o">-></span> <span class="s">"Key: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">0</span><span class="o">)</span> <span class="o">+</span> <span class="s">", Value: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">1</span><span class="o">),</span> | 
 |     <span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">());</span> | 
 | <span class="n">stringsDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> | 
 | <span class="c1">// +--------------------+</span> | 
 | <span class="c1">// |               value|</span> | 
 | <span class="c1">// +--------------------+</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// |Key: 0, Value: val_0|</span> | 
 | <span class="c1">// ...</span> | 
 |  | 
 | <span class="c1">// You can also use DataFrames to create temporary views within a SparkSession.</span> | 
 | <span class="nc">List</span><span class="o"><</span><span class="nc">Record</span><span class="o">></span> <span class="n">records</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ArrayList</span><span class="o"><>();</span> | 
 | <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">key</span> <span class="o">=</span> <span class="mi">1</span><span class="o">;</span> <span class="n">key</span> <span class="o"><</span> <span class="mi">100</span><span class="o">;</span> <span class="n">key</span><span class="o">++)</span> <span class="o">{</span> | 
 |   <span class="nc">Record</span> <span class="n">record</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Record</span><span class="o">();</span> | 
 |   <span class="n">record</span><span class="o">.</span><span class="na">setKey</span><span class="o">(</span><span class="n">key</span><span class="o">);</span> | 
 |   <span class="n">record</span><span class="o">.</span><span class="na">setValue</span><span class="o">(</span><span class="s">"val_"</span> <span class="o">+</span> <span class="n">key</span><span class="o">);</span> | 
 |   <span class="n">records</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">record</span><span class="o">);</span> | 
 | <span class="o">}</span> | 
 | <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">recordsDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">records</span><span class="o">,</span> <span class="nc">Record</span><span class="o">.</span><span class="na">class</span><span class="o">);</span> | 
 | <span class="n">recordsDF</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"records"</span><span class="o">);</span> | 
 |  | 
 | <span class="c1">// Queries can then join DataFrames data with data stored in Hive.</span> | 
 | <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> | 
 | <span class="c1">// +---+------+---+------+</span> | 
 | <span class="c1">// |key| value|key| value|</span> | 
 | <span class="c1">// +---+------+---+------+</span> | 
 | <span class="c1">// |  2| val_2|  2| val_2|</span> | 
 | <span class="c1">// |  2| val_2|  2| val_2|</span> | 
 | <span class="c1">// |  4| val_4|  4| val_4|</span> | 
 | <span class="c1">// ...</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java" in the Spark repo.</small></div> | 
 |   </div> | 
 |  | 
 | <div data-lang="r"> | 
 |  | 
    <p>When working with Hive, one must instantiate <code class="language-plaintext highlighter-rouge">SparkSession</code> with Hive support. This
adds support for finding tables in the MetaStore and writing queries using HiveQL.</p>
 |  | 
 |     <div class="highlight"><pre class="codehilite"><code><span class="c1"># enableHiveSupport defaults to TRUE</span><span class="w"> | 
 | </span><span class="n">sparkR.session</span><span class="p">(</span><span class="n">enableHiveSupport</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="kc">TRUE</span><span class="p">)</span><span class="w"> | 
 | </span><span class="n">sql</span><span class="p">(</span><span class="s2">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="p">)</span><span class="w"> | 
 | </span><span class="n">sql</span><span class="p">(</span><span class="s2">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="p">)</span><span class="w"> | 
 |  | 
 | </span><span class="c1"># Queries can be expressed in HiveQL.</span><span class="w"> | 
 | </span><span class="n">results</span><span class="w"> </span><span class="o"><-</span><span class="w"> </span><span class="n">collect</span><span class="p">(</span><span class="n">sql</span><span class="p">(</span><span class="s2">"FROM src SELECT key, value"</span><span class="p">))</span></code></pre></div> | 
 |     <div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div> | 
 |  | 
 |   </div> | 
 | </div> | 
 |  | 
 | <h3 id="specifying-storage-format-for-hive-tables">Specifying storage format for Hive tables</h3> | 
 |  | 
<p>When you create a Hive table, you need to define how this table should read/write data from/to the file system,
i.e. the “input format” and “output format”. You also need to define how this table should deserialize the data
to rows, or serialize rows to data, i.e. the “serde”. The following options can be used to specify the storage
format (“serde”, “input format”, “output format”), e.g. <code class="language-plaintext highlighter-rouge">CREATE TABLE src(id int) USING hive OPTIONS(fileFormat 'parquet')</code>.
By default, we will read the table files as plain text. Note that Hive storage handlers are not yet supported when
creating a table; you can create a table using a storage handler on the Hive side, and use Spark SQL to read it.</p>
 |  | 
 | <table> | 
 |   <thead><tr><th>Property Name</th><th>Meaning</th></tr></thead> | 
 |   <tr> | 
 |     <td><code>fileFormat</code></td> | 
 |     <td> | 
      A fileFormat is a package of storage format specifications, including "serde", "input format" and
      "output format". Currently we support six fileFormats: 'sequencefile', 'rcfile', 'orc', 'parquet', 'textfile' and 'avro'.
 |     </td> | 
 |   </tr> | 
 |  | 
 |   <tr> | 
 |     <td><code>inputFormat, outputFormat</code></td> | 
 |     <td> | 
      These two options specify the name of a corresponding <code>InputFormat</code> and <code>OutputFormat</code> class as a string literal,
      e.g. <code>org.apache.hadoop.hive.ql.io.orc.OrcInputFormat</code>. These two options must appear in a pair, and you cannot
      specify them if you have already specified the <code>fileFormat</code> option.
 |     </td> | 
 |   </tr> | 
 |  | 
 |   <tr> | 
 |     <td><code>serde</code></td> | 
 |     <td> | 
      This option specifies the name of a serde class. When the <code>fileFormat</code> option is specified, do not specify this option
      if the given <code>fileFormat</code> already includes the serde information. Currently "sequencefile", "textfile" and "rcfile"
      don't include the serde information, so you can use this option with these three fileFormats.
 |     </td> | 
 |   </tr> | 
 |  | 
 |   <tr> | 
 |     <td><code>fieldDelim, escapeDelim, collectionDelim, mapkeyDelim, lineDelim</code></td> | 
 |     <td> | 
      These options can only be used with the "textfile" fileFormat. They define how to read delimited files into rows.
 |     </td> | 
 |   </tr> | 
 | </table> | 
 |  | 
 | <p>All other properties defined with <code class="language-plaintext highlighter-rouge">OPTIONS</code> will be regarded as Hive serde properties.</p> | 
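
<p>Putting the options above together, here is a sketch of creating a comma-delimited text table; the table
name and delimiter are illustrative:</p>

<div class="highlight"><pre class="codehilite"><code>// A sketch using the options documented above: a Hive text table whose fields
// are comma-delimited. Any other OPTIONS entries would be passed through as
// Hive serde properties.
spark.sql("""
  CREATE TABLE hive_csv_src(id INT, name STRING)
  USING hive
  OPTIONS(fileFormat 'textfile', fieldDelim ',')
""")</code></pre></div>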
 |  | 
 | <h3 id="interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</h3> | 
 |  | 
<p>One of the most important pieces of Spark SQL’s Hive support is interaction with the Hive metastore,
which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary
build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below.
Note that, independent of the version of Hive being used to talk to the metastore, Spark SQL is internally
compiled against the built-in Hive and uses those classes for internal execution (serdes, UDFs, UDAFs, etc.).</p>
 |  | 
 | <p>The following options can be used to configure the version of Hive that is used to retrieve metadata:</p> | 
 |  | 
 | <table class="spark-config"> | 
 |   <thead><tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr></thead> | 
 |   <tr> | 
 |     <td><code>spark.sql.hive.metastore.version</code></td> | 
 |     <td><code>2.3.10</code></td> | 
 |     <td> | 
 |       Version of the Hive metastore. Available | 
 |       options are <code>2.0.0</code> through <code>2.3.10</code> and <code>3.0.0</code> through <code>3.1.3</code>. | 
 |     </td> | 
 |     <td>1.4.0</td> | 
 |   </tr> | 
 |   <tr> | 
 |     <td><code>spark.sql.hive.metastore.jars</code></td> | 
 |     <td><code>builtin</code></td> | 
 |     <td> | 
 |       Location of the jars that should be used to instantiate the HiveMetastoreClient. This | 
 |       property can be one of four options: | 
      <ol>
        <li><code>builtin</code>: Use Hive 2.3.10, which is bundled with the Spark assembly when <code>-Phive</code> is
        enabled. When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be
        either <code>2.3.10</code> or not defined.</li>
        <li><code>maven</code>: Use Hive jars of the specified version, downloaded from Maven repositories. This configuration
        is not generally recommended for production deployments.</li>
        <li><code>path</code>: Use Hive jars configured by <code>spark.sql.hive.metastore.jars.path</code>
        in comma-separated format. Supports both local and remote paths. The provided jars should be
        the same version as <code>spark.sql.hive.metastore.version</code>.</li>
        <li>A classpath in the standard format for the JVM. This classpath must include all of Hive
        and its dependencies, including the correct version of Hadoop. The provided jars should be
        the same version as <code>spark.sql.hive.metastore.version</code>. These jars only need to be present on the
        driver, but if you are running in YARN cluster mode then you must ensure they are packaged
        with your application.</li>
      </ol>
 |     </td> | 
 |     <td>1.4.0</td> | 
 |   </tr> | 
 |   <tr> | 
 |     <td><code>spark.sql.hive.metastore.jars.path</code></td> | 
 |     <td><code>(empty)</code></td> | 
 |     <td> | 
      Comma-separated paths of the jars that are used to instantiate the HiveMetastoreClient.
 |       This configuration is useful only when <code>spark.sql.hive.metastore.jars</code> is set as <code>path</code>. | 
 |       <br /> | 
      The paths can be in any of the following formats:
 |       <ol> | 
 |         <li><code>file://path/to/jar/foo.jar</code></li> | 
 |         <li><code>hdfs://nameservice/path/to/jar/foo.jar</code></li> | 
        <li><code>/path/to/jar/</code> (a path without a URI scheme follows the URI scheme of the <code>fs.defaultFS</code> configuration)</li>
 |         <li><code>[http/https/ftp]://path/to/jar/foo.jar</code></li> | 
 |       </ol> | 
      Note that formats 1, 2, and 3 support wildcards. For example:
 |       <ol> | 
 |         <li><code>file://path/to/jar/*,file://path2/to/jar/*/*.jar</code></li> | 
 |         <li><code>hdfs://nameservice/path/to/jar/*,hdfs://nameservice2/path/to/jar/*/*.jar</code></li> | 
 |       </ol> | 
 |     </td> | 
 |     <td>3.1.0</td> | 
 |   </tr> | 
 |   <tr> | 
 |     <td><code>spark.sql.hive.metastore.sharedPrefixes</code></td> | 
 |     <td><code>com.mysql.jdbc,<br />org.postgresql,<br />com.microsoft.sqlserver,<br />oracle.jdbc</code></td> | 
 |     <td> | 
 |       <p> | 
 |         A comma-separated list of class prefixes that should be loaded using the classloader that is | 
 |         shared between Spark SQL and a specific version of Hive. An example of classes that should | 
 |         be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need | 
 |         to be shared are those that interact with classes that are already shared. For example, | 
 |         custom appenders that are used by log4j. | 
 |       </p> | 
 |     </td> | 
 |     <td>1.4.0</td> | 
 |   </tr> | 
 |   <tr> | 
 |     <td><code>spark.sql.hive.metastore.barrierPrefixes</code></td> | 
 |     <td><code>(empty)</code></td> | 
 |     <td> | 
 |       <p> | 
        A comma-separated list of class prefixes that should explicitly be reloaded for each version
 |         of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a | 
 |         prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>). | 
 |       </p> | 
 |     </td> | 
 |     <td>1.4.0</td> | 
 |   </tr> | 
 | </table> | 
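
<p>These are static configurations, so they must be set before the first <code class="language-plaintext highlighter-rouge">SparkSession</code> is created. A minimal
sketch of combining them; the Hive version and jar directory below are illustrative assumptions:</p>

<div class="highlight"><pre class="codehilite"><code>import org.apache.spark.sql.SparkSession

// A sketch: talk to a Hive 3.1.3 metastore using client jars from a local
// directory. /opt/hive-3.1.3/lib is an assumed location, not a default.
val spark = SparkSession.builder()
  .appName("Custom Hive metastore sketch")
  .config("spark.sql.hive.metastore.version", "3.1.3")
  .config("spark.sql.hive.metastore.jars", "path")
  .config("spark.sql.hive.metastore.jars.path", "file:///opt/hive-3.1.3/lib/*")
  .enableHiveSupport()
  .getOrCreate()</code></pre></div>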
 |  | 
 |  | 
 |                 </div> | 
 |              | 
 |              <!-- /container --> | 
 |         </div> | 
 |  | 
 |         <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.0.2/dist/js/bootstrap.bundle.min.js" | 
 |         integrity="sha384-MrcW6ZMFYlzcLA8Nl+NtUVF0sA7MsXsP1UyJoMp4YLEuNSfAP+JcXn/tWtIaxVXM" | 
 |         crossorigin="anonymous"></script> | 
 |         <script src="https://code.jquery.com/jquery.js"></script> | 
 |  | 
 |         <script src="js/vendor/anchor.min.js"></script> | 
 |         <script src="js/main.js"></script> | 
 |  | 
 |         <script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script> | 
 |         <script type="text/javascript"> | 
            // DocSearch is entirely free and automated. DocSearch is built in two parts:
            // 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
            //    in your website and extracts content from every page it traverses. It then pushes this
            //    content to an Algolia index.
            // 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
            //    to your search input and display its results in a dropdown UI. If you want more
            //    details on how DocSearch works, check the DocSearch docs.
 |             docsearch({ | 
 |     apiKey: 'd62f962a82bc9abb53471cb7b89da35e', | 
 |     appId: 'RAI69RXRSK', | 
 |     indexName: 'apache_spark', | 
 |     inputSelector: '#docsearch-input', | 
 |     enhancedSearchInput: true, | 
 |     algoliaOptions: { | 
 |       'facetFilters': ["version:4.0.0-preview1"] | 
 |     }, | 
 |     debug: false // Set debug to true if you want to inspect the dropdown | 
 | }); | 
 |  | 
 |         </script> | 
 |  | 
 |         <!-- MathJax Section --> | 
 |         <script type="text/x-mathjax-config"> | 
 |             MathJax.Hub.Config({ | 
 |                 TeX: { equationNumbers: { autoNumber: "AMS" } } | 
 |             }); | 
 |         </script> | 
 |         <script> | 
 |             // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. | 
 |             // We could use "//cdn.mathjax...", but that won't support "file://". | 
 |             (function(d, script) { | 
 |                 script = d.createElement('script'); | 
 |                 script.type = 'text/javascript'; | 
 |                 script.async = true; | 
 |                 script.onload = function(){ | 
 |                     MathJax.Hub.Config({ | 
 |                         tex2jax: { | 
 |                             inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], | 
 |                             displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], | 
 |                             processEscapes: true, | 
 |                             skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] | 
 |                         } | 
 |                     }); | 
 |                 }; | 
 |                 script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + | 
 |                     'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + | 
 |                     '?config=TeX-AMS-MML_HTMLorMML'; | 
 |                 d.getElementsByTagName('head')[0].appendChild(script); | 
 |             }(document)); | 
 |         </script> | 
 |     </body> | 
 | </html> |