| |
| <!DOCTYPE html> |
| <!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]--> |
| <!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]--> |
| <!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]--> |
| <!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]--> |
| <head> |
| <meta charset="utf-8"> |
| <meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"> |
| <title>Hive Tables - Spark 3.0.1 Documentation</title> |
| |
| |
| |
| |
| <link rel="stylesheet" href="css/bootstrap.min.css"> |
| <style> |
| body { |
| padding-top: 60px; |
| padding-bottom: 40px; |
| } |
| </style> |
| <meta name="viewport" content="width=device-width"> |
| <link rel="stylesheet" href="css/bootstrap-responsive.min.css"> |
| <link rel="stylesheet" href="css/main.css"> |
| |
| <script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script> |
| |
| <link rel="stylesheet" href="css/pygments-default.css"> |
| |
| |
| <!-- Google analytics script --> |
| <script type="text/javascript"> |
| // Legacy Google Analytics (ga.js) asynchronous tracking snippet. |
| // _gaq is a command queue: commands pushed before ga.js finishes |
| // loading are buffered and replayed by the library once it arrives. |
| var _gaq = _gaq || []; |
| _gaq.push(['_setAccount', 'UA-32518208-2']); |
| _gaq.push(['_trackPageview']); |
| |
| (function() { |
| // Inject the ga.js loader asynchronously so it does not block page rendering. |
| var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true; |
| // Choose the host that matches the page protocol ('ssl' subdomain for HTTPS pages). |
| ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js'; |
| // Insert before the first existing <script> so the snippet works regardless of where it appears. |
| var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s); |
| })(); |
| </script> |
| |
| |
| </head> |
| <body> |
| <!--[if lt IE 7]> |
| <p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p> |
| <![endif]--> |
| |
| <!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html --> |
| |
| <div class="navbar navbar-fixed-top" id="topbar"> |
| <div class="navbar-inner"> |
| <div class="container"> |
| <div class="brand"><a href="index.html"> |
| <img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.0.1</span> |
| </div> |
| <ul class="nav"> |
| <!--TODO(andyk): Add class="active" attribute to li some how.--> |
| <li><a href="index.html">Overview</a></li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Programming Guides<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="quick-start.html">Quick Start</a></li> |
| <li><a href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a></li> |
| <li><a href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a></li> |
| <li><a href="structured-streaming-programming-guide.html">Structured Streaming</a></li> |
| <li><a href="streaming-programming-guide.html">Spark Streaming (DStreams)</a></li> |
| <li><a href="ml-guide.html">MLlib (Machine Learning)</a></li> |
| <li><a href="graphx-programming-guide.html">GraphX (Graph Processing)</a></li> |
| <li><a href="sparkr.html">SparkR (R on Spark)</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">API Docs<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="api/scala/org/apache/spark/index.html">Scala</a></li> |
| <li><a href="api/java/index.html">Java</a></li> |
| <li><a href="api/python/index.html">Python</a></li> |
| <li><a href="api/R/index.html">R</a></li> |
| <li><a href="api/sql/index.html">SQL, Built-in Functions</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="#" class="dropdown-toggle" data-toggle="dropdown">Deploying<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="cluster-overview.html">Overview</a></li> |
| <li><a href="submitting-applications.html">Submitting Applications</a></li> |
| <li class="divider"></li> |
| <li><a href="spark-standalone.html">Spark Standalone</a></li> |
| <li><a href="running-on-mesos.html">Mesos</a></li> |
| <li><a href="running-on-yarn.html">YARN</a></li> |
| <li><a href="running-on-kubernetes.html">Kubernetes</a></li> |
| </ul> |
| </li> |
| |
| <li class="dropdown"> |
| <a href="api.html" class="dropdown-toggle" data-toggle="dropdown">More<b class="caret"></b></a> |
| <ul class="dropdown-menu"> |
| <li><a href="configuration.html">Configuration</a></li> |
| <li><a href="monitoring.html">Monitoring</a></li> |
| <li><a href="tuning.html">Tuning Guide</a></li> |
| <li><a href="job-scheduling.html">Job Scheduling</a></li> |
| <li><a href="security.html">Security</a></li> |
| <li><a href="hardware-provisioning.html">Hardware Provisioning</a></li> |
| <li><a href="migration-guide.html">Migration Guide</a></li> |
| <li class="divider"></li> |
| <li><a href="building-spark.html">Building Spark</a></li> |
| <li><a href="https://spark.apache.org/contributing.html">Contributing to Spark</a></li> |
| <li><a href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a></li> |
| </ul> |
| </li> |
| </ul> |
| <!--<p class="navbar-text pull-right"><span class="version-text">v3.0.1</span></p>--> |
| </div> |
| </div> |
| </div> |
| |
| <div class="container-wrapper"> |
| |
| |
| |
| <div class="left-menu-wrapper"> |
| <div class="left-menu"> |
| <h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3> |
| |
| <ul> |
| |
| <li> |
| <a href="sql-getting-started.html"> |
| |
| Getting Started |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources.html"> |
| |
| Data Sources |
| |
| </a> |
| </li> |
| |
| |
| |
| <ul> |
| |
| <li> |
| <a href="sql-data-sources-load-save-functions.html"> |
| |
| Generic Load/Save Functions |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-generic-options.html"> |
| |
| Generic File Source Options |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-parquet.html"> |
| |
| Parquet Files |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-orc.html"> |
| |
| ORC Files |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-json.html"> |
| |
| JSON Files |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-hive-tables.html"> |
| |
| <b>Hive Tables</b> |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-jdbc.html"> |
| |
| JDBC To Other Databases |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-avro.html"> |
| |
| Avro Files |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-binaryFile.html"> |
| |
| Whole Binary Files |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-data-sources-troubleshooting.html"> |
| |
| Troubleshooting |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| |
| |
| <li> |
| <a href="sql-performance-tuning.html"> |
| |
| Performance Tuning |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-distributed-sql-engine.html"> |
| |
| Distributed SQL Engine |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-pyspark-pandas-with-arrow.html"> |
| |
| PySpark Usage Guide for Pandas with Apache Arrow |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-migration-old.html"> |
| |
| Migration Guide |
| |
| </a> |
| </li> |
| |
| |
| |
| <li> |
| <a href="sql-ref.html"> |
| |
| SQL Reference |
| |
| </a> |
| </li> |
| |
| |
| |
| </ul> |
| |
| </div> |
| </div> |
| |
| <input id="nav-trigger" class="nav-trigger" checked type="checkbox"> |
| <label for="nav-trigger"></label> |
| <div class="content-with-sidebar" id="content"> |
| |
| <h1 class="title">Hive Tables</h1> |
| |
| |
| <ul id="markdown-toc"> |
| <li><a href="#specifying-storage-format-for-hive-tables" id="markdown-toc-specifying-storage-format-for-hive-tables">Specifying storage format for Hive tables</a></li> |
| <li><a href="#interacting-with-different-versions-of-hive-metastore" id="markdown-toc-interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</a></li> |
| </ul> |
| |
| <p>Spark SQL also supports reading and writing data stored in <a href="https://hive.apache.org/">Apache Hive</a>. |
| However, since Hive has a large number of dependencies, these dependencies are not included in the |
| default Spark distribution. If Hive dependencies can be found on the classpath, Spark will load them |
| automatically. Note that these Hive dependencies must also be present on all of the worker nodes, as |
| they will need access to the Hive serialization and deserialization libraries (SerDes) in order to |
| access data stored in Hive.</p> |
| |
| <p>Configuration of Hive is done by placing your <code class="highlighter-rouge">hive-site.xml</code>, <code class="highlighter-rouge">core-site.xml</code> (for security configuration), |
| and <code class="highlighter-rouge">hdfs-site.xml</code> (for HDFS configuration) file in <code class="highlighter-rouge">conf/</code>.</p> |
| |
| <p>When working with Hive, one must instantiate <code class="highlighter-rouge">SparkSession</code> with Hive support, including |
| connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined functions. |
| Users who do not have an existing Hive deployment can still enable Hive support. When not configured |
| by the <code class="highlighter-rouge">hive-site.xml</code>, the context automatically creates <code class="highlighter-rouge">metastore_db</code> in the current directory and |
| creates a directory configured by <code class="highlighter-rouge">spark.sql.warehouse.dir</code>, which defaults to the directory |
| <code class="highlighter-rouge">spark-warehouse</code> in the current directory in which the Spark application is started. Note that |
| the <code class="highlighter-rouge">hive.metastore.warehouse.dir</code> property in <code class="highlighter-rouge">hive-site.xml</code> is deprecated since Spark 2.0.0. |
| Instead, use <code class="highlighter-rouge">spark.sql.warehouse.dir</code> to specify the default location of databases in the warehouse. |
| You may need to grant write privilege to the user who starts the Spark application.</p> |
| |
| <div class="codetabs"> |
| |
| <div data-lang="scala"> |
| <div class="highlight"><pre class="codehilite"><code><span class="k">import</span> <span class="nn">java.io.File</span> |
| |
| <span class="k">import</span> <span class="nn">org.apache.spark.sql.</span><span class="o">{</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">SaveMode</span><span class="o">,</span> <span class="nc">SparkSession</span><span class="o">}</span> |
| |
| <span class="k">case</span> <span class="k">class</span> <span class="nc">Record</span><span class="o">(</span><span class="n">key</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> <span class="n">value</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> |
| |
| <span class="c1">// warehouseLocation points to the default location for managed databases and tables</span> |
| <span class="k">val</span> <span class="nv">warehouseLocation</span> <span class="k">=</span> <span class="k">new</span> <span class="nc">File</span><span class="o">(</span><span class="s">"spark-warehouse"</span><span class="o">).</span><span class="py">getAbsolutePath</span> |
| |
| <span class="k">val</span> <span class="nv">spark</span> <span class="k">=</span> <span class="nc">SparkSession</span> |
| <span class="o">.</span><span class="py">builder</span><span class="o">()</span> |
| <span class="o">.</span><span class="py">appName</span><span class="o">(</span><span class="s">"Spark Hive Example"</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">config</span><span class="o">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="o">,</span> <span class="n">warehouseLocation</span><span class="o">)</span> |
| <span class="o">.</span><span class="py">enableHiveSupport</span><span class="o">()</span> |
| <span class="o">.</span><span class="py">getOrCreate</span><span class="o">()</span> |
| |
| <span class="k">import</span> <span class="nn">spark.implicits._</span> |
| <span class="k">import</span> <span class="nn">spark.sql</span> |
| |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="o">)</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="o">)</span> |
| |
| <span class="c1">// Queries are expressed in HiveQL</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM src"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |key| value|</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |238|val_238|</span> |
| <span class="c1">// | 86| val_86|</span> |
| <span class="c1">// |311|val_311|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// Aggregation queries are also supported.</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +--------+</span> |
| <span class="c1">// |count(1)|</span> |
| <span class="c1">// +--------+</span> |
| <span class="c1">// | 500 |</span> |
| <span class="c1">// +--------+</span> |
| |
| <span class="c1">// The results of SQL queries are themselves DataFrames and support all normal functions.</span> |
| <span class="k">val</span> <span class="nv">sqlDF</span> <span class="k">=</span> <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="o">)</span> |
| |
| <span class="c1">// The items in DataFrames are of type Row, which allows you to access each column by ordinal.</span> |
| <span class="k">val</span> <span class="nv">stringsDS</span> <span class="k">=</span> <span class="nv">sqlDF</span><span class="o">.</span><span class="py">map</span> <span class="o">{</span> |
| <span class="k">case</span> <span class="nc">Row</span><span class="o">(</span><span class="n">key</span><span class="k">:</span> <span class="kt">Int</span><span class="o">,</span> <span class="n">value</span><span class="k">:</span> <span class="kt">String</span><span class="o">)</span> <span class="k">=></span> <span class="n">s</span><span class="s">"Key: $key, Value: $value"</span> |
| <span class="o">}</span> |
| <span class="nv">stringsDS</span><span class="o">.</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +--------------------+</span> |
| <span class="c1">// | value|</span> |
| <span class="c1">// +--------------------+</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// You can also use DataFrames to create temporary views within a SparkSession.</span> |
| <span class="k">val</span> <span class="nv">recordsDF</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">createDataFrame</span><span class="o">((</span><span class="mi">1</span> <span class="n">to</span> <span class="mi">100</span><span class="o">).</span><span class="py">map</span><span class="o">(</span><span class="n">i</span> <span class="k">=></span> <span class="nc">Record</span><span class="o">(</span><span class="n">i</span><span class="o">,</span> <span class="n">s</span><span class="s">"val_$i"</span><span class="o">)))</span> |
| <span class="nv">recordsDF</span><span class="o">.</span><span class="py">createOrReplaceTempView</span><span class="o">(</span><span class="s">"records"</span><span class="o">)</span> |
| |
| <span class="c1">// Queries can then join DataFrame data with data stored in Hive.</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +---+------+---+------+</span> |
| <span class="c1">// |key| value|key| value|</span> |
| <span class="c1">// +---+------+---+------+</span> |
| <span class="c1">// | 2| val_2| 2| val_2|</span> |
| <span class="c1">// | 4| val_4| 4| val_4|</span> |
| <span class="c1">// | 5| val_5| 5| val_5|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// Create a Hive managed Parquet table, with HQL syntax instead of the Spark SQL native syntax</span> |
| <span class="c1">// `USING hive`</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"CREATE TABLE hive_records(key int, value string) STORED AS PARQUET"</span><span class="o">)</span> |
| <span class="c1">// Save DataFrame to the Hive managed table</span> |
| <span class="k">val</span> <span class="nv">df</span> <span class="k">=</span> <span class="nv">spark</span><span class="o">.</span><span class="py">table</span><span class="o">(</span><span class="s">"src"</span><span class="o">)</span> |
| <span class="nv">df</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">mode</span><span class="o">(</span><span class="nv">SaveMode</span><span class="o">.</span><span class="py">Overwrite</span><span class="o">).</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"hive_records"</span><span class="o">)</span> |
| <span class="c1">// After insertion, the Hive managed table has data now</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_records"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |key| value|</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |238|val_238|</span> |
| <span class="c1">// | 86| val_86|</span> |
| <span class="c1">// |311|val_311|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// Prepare a Parquet data directory</span> |
| <span class="k">val</span> <span class="nv">dataDir</span> <span class="k">=</span> <span class="s">"/tmp/parquet_data"</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">range</span><span class="o">(</span><span class="mi">10</span><span class="o">).</span><span class="py">write</span><span class="o">.</span><span class="py">parquet</span><span class="o">(</span><span class="n">dataDir</span><span class="o">)</span> |
| <span class="c1">// Create a Hive external Parquet table</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="n">s</span><span class="s">"CREATE EXTERNAL TABLE hive_bigints(id bigint) STORED AS PARQUET LOCATION '$dataDir'"</span><span class="o">)</span> |
| <span class="c1">// The Hive external table should already have data</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_bigints"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +---+</span> |
| <span class="c1">// | id|</span> |
| <span class="c1">// +---+</span> |
| <span class="c1">// | 0|</span> |
| <span class="c1">// | 1|</span> |
| <span class="c1">// | 2|</span> |
| <span class="c1">// ... Order may vary, as spark processes the partitions in parallel.</span> |
| |
| <span class="c1">// Turn on flag for Hive Dynamic Partitioning</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">sqlContext</span><span class="o">.</span><span class="py">setConf</span><span class="o">(</span><span class="s">"hive.exec.dynamic.partition"</span><span class="o">,</span> <span class="s">"true"</span><span class="o">)</span> |
| <span class="nv">spark</span><span class="o">.</span><span class="py">sqlContext</span><span class="o">.</span><span class="py">setConf</span><span class="o">(</span><span class="s">"hive.exec.dynamic.partition.mode"</span><span class="o">,</span> <span class="s">"nonstrict"</span><span class="o">)</span> |
| <span class="c1">// Create a Hive partitioned table using DataFrame API</span> |
| <span class="nv">df</span><span class="o">.</span><span class="py">write</span><span class="o">.</span><span class="py">partitionBy</span><span class="o">(</span><span class="s">"key"</span><span class="o">).</span><span class="py">format</span><span class="o">(</span><span class="s">"hive"</span><span class="o">).</span><span class="py">saveAsTable</span><span class="o">(</span><span class="s">"hive_part_tbl"</span><span class="o">)</span> |
| <span class="c1">// Partitioned column `key` will be moved to the end of the schema.</span> |
| <span class="nf">sql</span><span class="o">(</span><span class="s">"SELECT * FROM hive_part_tbl"</span><span class="o">).</span><span class="py">show</span><span class="o">()</span> |
| <span class="c1">// +-------+---+</span> |
| <span class="c1">// | value|key|</span> |
| <span class="c1">// +-------+---+</span> |
| <span class="c1">// |val_238|238|</span> |
| <span class="c1">// | val_86| 86|</span> |
| <span class="c1">// |val_311|311|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="nv">spark</span><span class="o">.</span><span class="py">stop</span><span class="o">()</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/scala/org/apache/spark/examples/sql/hive/SparkHiveExample.scala" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="java"> |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">import</span> <span class="nn">java.io.File</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.io.Serializable</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.ArrayList</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">java.util.List</span><span class="o">;</span> |
| |
| <span class="kn">import</span> <span class="nn">org.apache.spark.api.java.function.MapFunction</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Dataset</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Encoders</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.Row</span><span class="o">;</span> |
| <span class="kn">import</span> <span class="nn">org.apache.spark.sql.SparkSession</span><span class="o">;</span> |
| |
| <span class="kd">public</span> <span class="kd">static</span> <span class="kd">class</span> <span class="nc">Record</span> <span class="kd">implements</span> <span class="nc">Serializable</span> <span class="o">{</span> |
| <span class="kd">private</span> <span class="kt">int</span> <span class="n">key</span><span class="o">;</span> |
| <span class="kd">private</span> <span class="nc">String</span> <span class="n">value</span><span class="o">;</span> |
| |
| <span class="kd">public</span> <span class="kt">int</span> <span class="nf">getKey</span><span class="o">()</span> <span class="o">{</span> |
| <span class="k">return</span> <span class="n">key</span><span class="o">;</span> |
| <span class="o">}</span> |
| |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setKey</span><span class="o">(</span><span class="kt">int</span> <span class="n">key</span><span class="o">)</span> <span class="o">{</span> |
| <span class="k">this</span><span class="o">.</span><span class="na">key</span> <span class="o">=</span> <span class="n">key</span><span class="o">;</span> |
| <span class="o">}</span> |
| |
| <span class="kd">public</span> <span class="nc">String</span> <span class="nf">getValue</span><span class="o">()</span> <span class="o">{</span> |
| <span class="k">return</span> <span class="n">value</span><span class="o">;</span> |
| <span class="o">}</span> |
| |
| <span class="kd">public</span> <span class="kt">void</span> <span class="nf">setValue</span><span class="o">(</span><span class="nc">String</span> <span class="n">value</span><span class="o">)</span> <span class="o">{</span> |
| <span class="k">this</span><span class="o">.</span><span class="na">value</span> <span class="o">=</span> <span class="n">value</span><span class="o">;</span> |
| <span class="o">}</span> |
| <span class="o">}</span> |
| |
| <span class="c1">// warehouseLocation points to the default location for managed databases and tables</span> |
| <span class="nc">String</span> <span class="n">warehouseLocation</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">File</span><span class="o">(</span><span class="s">"spark-warehouse"</span><span class="o">).</span><span class="na">getAbsolutePath</span><span class="o">();</span> |
| <span class="nc">SparkSession</span> <span class="n">spark</span> <span class="o">=</span> <span class="nc">SparkSession</span> |
| <span class="o">.</span><span class="na">builder</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">appName</span><span class="o">(</span><span class="s">"Java Spark Hive Example"</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">config</span><span class="o">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="o">,</span> <span class="n">warehouseLocation</span><span class="o">)</span> |
| <span class="o">.</span><span class="na">enableHiveSupport</span><span class="o">()</span> |
| <span class="o">.</span><span class="na">getOrCreate</span><span class="o">();</span> |
| |
| <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="o">);</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="o">);</span> |
| |
| <span class="c1">// Queries are expressed in HiveQL</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM src"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |key| value|</span> |
| <span class="c1">// +---+-------+</span> |
| <span class="c1">// |238|val_238|</span> |
| <span class="c1">// | 86| val_86|</span> |
| <span class="c1">// |311|val_311|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// Aggregation queries are also supported.</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +--------+</span> |
| <span class="c1">// |count(1)|</span> |
| <span class="c1">// +--------+</span> |
| <span class="c1">// | 500 |</span> |
| <span class="c1">// +--------+</span> |
| |
| <span class="c1">// The results of SQL queries are themselves DataFrames and support all normal functions.</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="o">);</span> |
| |
| <span class="c1">// The items in DataFrames are of type Row, which lets you access each column by ordinal.</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">String</span><span class="o">></span> <span class="n">stringsDS</span> <span class="o">=</span> <span class="n">sqlDF</span><span class="o">.</span><span class="na">map</span><span class="o">(</span> |
| <span class="o">(</span><span class="nc">MapFunction</span><span class="o"><</span><span class="nc">Row</span><span class="o">,</span> <span class="nc">String</span><span class="o">>)</span> <span class="n">row</span> <span class="o">-></span> <span class="s">"Key: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">0</span><span class="o">)</span> <span class="o">+</span> <span class="s">", Value: "</span> <span class="o">+</span> <span class="n">row</span><span class="o">.</span><span class="na">get</span><span class="o">(</span><span class="mi">1</span><span class="o">),</span> |
| <span class="nc">Encoders</span><span class="o">.</span><span class="na">STRING</span><span class="o">());</span> |
| <span class="n">stringsDS</span><span class="o">.</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +--------------------+</span> |
| <span class="c1">// | value|</span> |
| <span class="c1">// +--------------------+</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// |Key: 0, Value: val_0|</span> |
| <span class="c1">// ...</span> |
| |
| <span class="c1">// You can also use DataFrames to create temporary views within a SparkSession.</span> |
| <span class="nc">List</span><span class="o"><</span><span class="nc">Record</span><span class="o">></span> <span class="n">records</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">ArrayList</span><span class="o"><>();</span> |
| <span class="k">for</span> <span class="o">(</span><span class="kt">int</span> <span class="n">key</span> <span class="o">=</span> <span class="mi">1</span><span class="o">;</span> <span class="n">key</span> <span class="o"><</span> <span class="mi">100</span><span class="o">;</span> <span class="n">key</span><span class="o">++)</span> <span class="o">{</span> |
| <span class="nc">Record</span> <span class="n">record</span> <span class="o">=</span> <span class="k">new</span> <span class="nc">Record</span><span class="o">();</span> |
| <span class="n">record</span><span class="o">.</span><span class="na">setKey</span><span class="o">(</span><span class="n">key</span><span class="o">);</span> |
| <span class="n">record</span><span class="o">.</span><span class="na">setValue</span><span class="o">(</span><span class="s">"val_"</span> <span class="o">+</span> <span class="n">key</span><span class="o">);</span> |
| <span class="n">records</span><span class="o">.</span><span class="na">add</span><span class="o">(</span><span class="n">record</span><span class="o">);</span> |
| <span class="o">}</span> |
| <span class="nc">Dataset</span><span class="o"><</span><span class="nc">Row</span><span class="o">></span> <span class="n">recordsDF</span> <span class="o">=</span> <span class="n">spark</span><span class="o">.</span><span class="na">createDataFrame</span><span class="o">(</span><span class="n">records</span><span class="o">,</span> <span class="nc">Record</span><span class="o">.</span><span class="na">class</span><span class="o">);</span> |
| <span class="n">recordsDF</span><span class="o">.</span><span class="na">createOrReplaceTempView</span><span class="o">(</span><span class="s">"records"</span><span class="o">);</span> |
| |
| <span class="c1">// Queries can then join DataFrame data with data stored in Hive.</span> |
| <span class="n">spark</span><span class="o">.</span><span class="na">sql</span><span class="o">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="o">).</span><span class="na">show</span><span class="o">();</span> |
| <span class="c1">// +---+------+---+------+</span> |
| <span class="c1">// |key| value|key| value|</span> |
| <span class="c1">// +---+------+---+------+</span> |
| <span class="c1">// | 2| val_2| 2| val_2|</span> |
| <span class="c1">// | 2| val_2| 2| val_2|</span> |
| <span class="c1">// | 4| val_4| 4| val_4|</span> |
| <span class="c1">// ...</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/java/org/apache/spark/examples/sql/hive/JavaSparkHiveExample.java" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="python"> |
| <div class="highlight"><pre class="codehilite"><code><span class="kn">from</span> <span class="nn">os.path</span> <span class="kn">import</span> <span class="n">join</span><span class="p">,</span> <span class="n">abspath</span> |
| |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">SparkSession</span> |
| <span class="kn">from</span> <span class="nn">pyspark.sql</span> <span class="kn">import</span> <span class="n">Row</span> |
| |
| <span class="c1"># warehouse_location points to the default location for managed databases and tables |
| </span><span class="n">warehouse_location</span> <span class="o">=</span> <span class="n">abspath</span><span class="p">(</span><span class="s">'spark-warehouse'</span><span class="p">)</span> |
| |
| <span class="n">spark</span> <span class="o">=</span> <span class="n">SparkSession</span> \ |
| <span class="p">.</span><span class="n">builder</span> \ |
| <span class="p">.</span><span class="n">appName</span><span class="p">(</span><span class="s">"Python Spark SQL Hive integration example"</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="n">config</span><span class="p">(</span><span class="s">"spark.sql.warehouse.dir"</span><span class="p">,</span> <span class="n">warehouse_location</span><span class="p">)</span> \ |
| <span class="p">.</span><span class="n">enableHiveSupport</span><span class="p">()</span> \ |
| <span class="p">.</span><span class="n">getOrCreate</span><span class="p">()</span> |
| |
| <span class="c1"># spark is an existing SparkSession |
| </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="p">)</span> |
| <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="p">)</span> |
| |
| <span class="c1"># Queries are expressed in HiveQL |
| </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM src"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> |
| <span class="c1"># +---+-------+ |
| # |key| value| |
| # +---+-------+ |
| # |238|val_238| |
| # | 86| val_86| |
| # |311|val_311| |
| # ... |
| </span> |
| <span class="c1"># Aggregation queries are also supported. |
| </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT COUNT(*) FROM src"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> |
| <span class="c1"># +--------+ |
| # |count(1)| |
| # +--------+ |
| # | 500 | |
| # +--------+ |
| </span> |
| <span class="c1"># The results of SQL queries are themselves DataFrames and support all normal functions. |
| </span><span class="n">sqlDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT key, value FROM src WHERE key < 10 ORDER BY key"</span><span class="p">)</span> |
| |
| <span class="c1"># The items in DataFrames are of type Row, which allows you to access each column by ordinal. |
| </span><span class="n">stringsDS</span> <span class="o">=</span> <span class="n">sqlDF</span><span class="p">.</span><span class="n">rdd</span><span class="p">.</span><span class="nb">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">row</span><span class="p">:</span> <span class="s">"Key: %d, Value: %s"</span> <span class="o">%</span> <span class="p">(</span><span class="n">row</span><span class="p">.</span><span class="n">key</span><span class="p">,</span> <span class="n">row</span><span class="p">.</span><span class="n">value</span><span class="p">))</span> |
| <span class="k">for</span> <span class="n">record</span> <span class="ow">in</span> <span class="n">stringsDS</span><span class="p">.</span><span class="n">collect</span><span class="p">():</span> |
| <span class="k">print</span><span class="p">(</span><span class="n">record</span><span class="p">)</span> |
| <span class="c1"># Key: 0, Value: val_0 |
| # Key: 0, Value: val_0 |
| # Key: 0, Value: val_0 |
| # ... |
| </span> |
| <span class="c1"># You can also use DataFrames to create temporary views within a SparkSession. |
| </span><span class="n">Record</span> <span class="o">=</span> <span class="n">Row</span><span class="p">(</span><span class="s">"key"</span><span class="p">,</span> <span class="s">"value"</span><span class="p">)</span> |
| <span class="n">recordsDF</span> <span class="o">=</span> <span class="n">spark</span><span class="p">.</span><span class="n">createDataFrame</span><span class="p">([</span><span class="n">Record</span><span class="p">(</span><span class="n">i</span><span class="p">,</span> <span class="s">"val_"</span> <span class="o">+</span> <span class="nb">str</span><span class="p">(</span><span class="n">i</span><span class="p">))</span> <span class="k">for</span> <span class="n">i</span> <span class="ow">in</span> <span class="nb">range</span><span class="p">(</span><span class="mi">1</span><span class="p">,</span> <span class="mi">101</span><span class="p">)])</span> |
| <span class="n">recordsDF</span><span class="p">.</span><span class="n">createOrReplaceTempView</span><span class="p">(</span><span class="s">"records"</span><span class="p">)</span> |
| |
| <span class="c1"># Queries can then join DataFrame data with data stored in Hive. |
| </span><span class="n">spark</span><span class="p">.</span><span class="n">sql</span><span class="p">(</span><span class="s">"SELECT * FROM records r JOIN src s ON r.key = s.key"</span><span class="p">).</span><span class="n">show</span><span class="p">()</span> |
| <span class="c1"># +---+------+---+------+ |
| # |key| value|key| value| |
| # +---+------+---+------+ |
| # | 2| val_2| 2| val_2| |
| # | 4| val_4| 4| val_4| |
| # | 5| val_5| 5| val_5| |
| # ...</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/python/sql/hive.py" in the Spark repo.</small></div> |
| </div> |
| |
| <div data-lang="r"> |
| |
| <p>When working with Hive one must instantiate <code class="highlighter-rouge">SparkSession</code> with Hive support. This |
| adds support for finding tables in the MetaStore and writing queries using HiveQL.</p> |
| |
| <div class="highlight"><pre class="codehilite"><code><span class="c1"># enableHiveSupport defaults to TRUE</span><span class="w"> |
| </span><span class="n">sparkR.session</span><span class="p">(</span><span class="n">enableHiveSupport</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="kc">TRUE</span><span class="p">)</span><span class="w"> |
| </span><span class="n">sql</span><span class="p">(</span><span class="s2">"CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive"</span><span class="p">)</span><span class="w"> |
| </span><span class="n">sql</span><span class="p">(</span><span class="s2">"LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src"</span><span class="p">)</span><span class="w"> |
| |
| </span><span class="c1"># Queries can be expressed in HiveQL.</span><span class="w"> |
| </span><span class="n">results</span><span class="w"> </span><span class="o"><-</span><span class="w"> </span><span class="n">collect</span><span class="p">(</span><span class="n">sql</span><span class="p">(</span><span class="s2">"FROM src SELECT key, value"</span><span class="p">))</span></code></pre></div> |
| <div><small>Find full example code at "examples/src/main/r/RSparkSQLExample.R" in the Spark repo.</small></div> |
| |
| </div> |
| </div> |
| |
| <h3 id="specifying-storage-format-for-hive-tables">Specifying storage format for Hive tables</h3> |
| |
| <p>When you create a Hive table, you need to define how this table should read/write data from/to file system, |
| i.e. the “input format” and “output format”. You also need to define how this table should deserialize the data |
| to rows, or serialize rows to data, i.e. the “serde”. The following options can be used to specify the storage |
| format (“serde”, “input format”, “output format”), e.g. <code class="highlighter-rouge">CREATE TABLE src(id int) USING hive OPTIONS(fileFormat 'parquet')</code>. |
| By default, we will read the table files as plain text. Note that the Hive storage handler is not supported yet when |
| creating a table; you can create a table using a storage handler on the Hive side, and use Spark SQL to read it.</p> |
| |
| <table class="table"> |
| <tr><th>Property Name</th><th>Meaning</th></tr> |
| <tr> |
| <td><code>fileFormat</code></td> |
| <td> |
| A fileFormat is kind of a package of storage format specifications, including "serde", "input format" and |
| "output format". Currently we support 6 fileFormats: 'sequencefile', 'rcfile', 'orc', 'parquet', 'textfile' and 'avro'. |
| </td> |
| </tr> |
| |
| <tr> |
| <td><code>inputFormat, outputFormat</code></td> |
| <td> |
| These 2 options specify the name of a corresponding <code>InputFormat</code> and <code>OutputFormat</code> class as a string literal, |
| e.g. <code>org.apache.hadoop.hive.ql.io.orc.OrcInputFormat</code>. These 2 options must be specified as a pair, and you cannot |
| specify them if you have already specified the <code>fileFormat</code> option. |
| </td> |
| </tr> |
| |
| <tr> |
| <td><code>serde</code></td> |
| <td> |
| This option specifies the name of a serde class. When the <code>fileFormat</code> option is specified, do not specify this option |
| if the given <code>fileFormat</code> already includes the serde information. Currently "sequencefile", "textfile" and "rcfile" |
| don't include the serde information, so you can use this option with these 3 fileFormats. |
| </td> |
| </tr> |
| |
| <tr> |
| <td><code>fieldDelim, escapeDelim, collectionDelim, mapkeyDelim, lineDelim</code></td> |
| <td> |
| These options can only be used with "textfile" fileFormat. They define how to read delimited files into rows. |
| </td> |
| </tr> |
| </table> |
| |
| <p>All other properties defined with <code class="highlighter-rouge">OPTIONS</code> will be regarded as Hive serde properties.</p> |
| |
| <h3 id="interacting-with-different-versions-of-hive-metastore">Interacting with Different Versions of Hive Metastore</h3> |
| |
| <p>One of the most important pieces of Spark SQL’s Hive support is interaction with Hive metastore, |
| which enables Spark SQL to access metadata of Hive tables. Starting from Spark 1.4.0, a single binary |
| build of Spark SQL can be used to query different versions of Hive metastores, using the configuration described below. |
| Note that independent of the version of Hive that is being used to talk to the metastore, internally Spark SQL |
| will compile against built-in Hive and use those classes for internal execution (serdes, UDFs, UDAFs, etc).</p> |
| |
| <p>The following options can be used to configure the version of Hive that is used to retrieve metadata:</p> |
| |
| <table class="table"> |
| <tr><th>Property Name</th><th>Default</th><th>Meaning</th><th>Since Version</th></tr> |
| <tr> |
| <td><code>spark.sql.hive.metastore.version</code></td> |
| <td><code>2.3.7</code></td> |
| <td> |
| Version of the Hive metastore. Available |
| options are <code>0.12.0</code> through <code>2.3.7</code> and <code>3.0.0</code> through <code>3.1.2</code>. |
| </td> |
| <td>1.4.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.sql.hive.metastore.jars</code></td> |
| <td><code>builtin</code></td> |
| <td> |
| Location of the jars that should be used to instantiate the HiveMetastoreClient. This |
| property can be one of three options: |
| <ol> |
| <li><code>builtin</code>: |
| Use Hive 2.3.7, which is bundled with the Spark assembly when <code>-Phive</code> is |
| enabled. When this option is chosen, <code>spark.sql.hive.metastore.version</code> must be |
| either <code>2.3.7</code> or not defined.</li> |
| <li><code>maven</code>: |
| Use Hive jars of specified version downloaded from Maven repositories. This configuration |
| is not generally recommended for production deployments.</li> |
| <li>A classpath in the standard format for the JVM. This classpath must include all of Hive |
| and its dependencies, including the correct version of Hadoop. These jars only need to be |
| present on the driver, but if you are running in yarn cluster mode then you must ensure |
| they are packaged with your application.</li> |
| </ol> |
| </td> |
| <td>1.4.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.sql.hive.metastore.sharedPrefixes</code></td> |
| <td><code>com.mysql.jdbc,<br />org.postgresql,<br />com.microsoft.sqlserver,<br />oracle.jdbc</code></td> |
| <td> |
| <p> |
| A comma-separated list of class prefixes that should be loaded using the classloader that is |
| shared between Spark SQL and a specific version of Hive. An example of classes that should |
| be shared is JDBC drivers that are needed to talk to the metastore. Other classes that need |
| to be shared are those that interact with classes that are already shared. For example, |
| custom appenders that are used by log4j. |
| </p> |
| </td> |
| <td>1.4.0</td> |
| </tr> |
| <tr> |
| <td><code>spark.sql.hive.metastore.barrierPrefixes</code></td> |
| <td><code>(empty)</code></td> |
| <td> |
| <p> |
| A comma-separated list of class prefixes that should explicitly be reloaded for each version |
| of Hive that Spark SQL is communicating with. For example, Hive UDFs that are declared in a |
| prefix that typically would be shared (i.e. <code>org.apache.spark.*</code>). |
| </p> |
| </td> |
| <td>1.4.0</td> |
| </tr> |
| </table> |
| |
| |
| </div> |
| |
| <!-- /container --> |
| </div> |
| |
| <script src="js/vendor/jquery-3.4.1.min.js"></script> |
| <script src="js/vendor/bootstrap.min.js"></script> |
| <script src="js/vendor/anchor.min.js"></script> |
| <script src="js/main.js"></script> |
| |
| <!-- MathJax Section --> |
| <script type="text/x-mathjax-config"> |
| // Pre-load MathJax configuration: enable automatic AMS-style equation numbering. |
| // MathJax reads this block when it initializes, before any typesetting runs. |
| MathJax.Hub.Config({ |
| TeX: { equationNumbers: { autoNumber: "AMS" } } |
| }); |
| </script> |
| <script> |
| // Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS. |
| // We could use "//cdn.mathjax...", but that won't support "file://". |
| (function(d, script) { |
| // Build the <script> element at runtime so the CDN URL's scheme can match the page's. |
| script = d.createElement('script'); |
| script.type = 'text/javascript'; |
| script.async = true; |
| // Apply tex2jax options only after the library has loaded; MathJax.Hub does not exist before then. |
| script.onload = function(){ |
| MathJax.Hub.Config({ |
| tex2jax: { |
| inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ], |
| displayMath: [ ["$$","$$"], ["\\[", "\\]"] ], |
| processEscapes: true, |
| // Skip typesetting inside code and preformatted blocks (e.g. the highlighted examples above). |
| skipTags: ['script', 'noscript', 'style', 'textarea', 'pre'] |
| } |
| }); |
| }; |
| script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') + |
| 'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' + |
| '?config=TeX-AMS-MML_HTMLorMML'; |
| d.getElementsByTagName('head')[0].appendChild(script); |
| }(document)); |
| </script> |
| </body> |
| </html> |