blob: 4bff2aadb56e419a4f163aea3ba051c6b3b15f5e [file] [log] [blame]
<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7"> <![endif]-->
<!--[if IE 7]> <html class="no-js lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]> <html class="no-js lt-ie9"> <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js"> <!--<![endif]-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">
<title>CREATE DATASOURCE TABLE - Spark 3.2.3 Documentation</title>
<link rel="stylesheet" href="css/bootstrap.min.css">
<style>
body {
padding-top: 60px;
padding-bottom: 40px;
}
</style>
<meta name="viewport" content="width=device-width">
<link rel="stylesheet" href="css/main.css">
<script src="js/vendor/modernizr-2.6.1-respond-1.1.0.min.js"></script>
<link rel="stylesheet" href="css/pygments-default.css">
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.css" />
<link rel="stylesheet" href="css/docsearch.css">
</head>
<body>
<!--[if lt IE 7]>
<p class="chromeframe">You are using an outdated browser. <a href="https://browsehappy.com/">Upgrade your browser today</a> or <a href="http://www.google.com/chromeframe/?redirect=true">install Google Chrome Frame</a> to better experience this site.</p>
<![endif]-->
<!-- This code is taken from http://twitter.github.com/bootstrap/examples/hero.html -->
<nav class="navbar fixed-top navbar-expand-md navbar-light bg-light" id="topbar">
<div class="container">
<div class="navbar-header">
<div class="navbar-brand"><a href="index.html">
<img src="img/spark-logo-hd.png" style="height:50px;"/></a><span class="version">3.2.3</span>
</div>
</div>
<button class="navbar-toggler" type="button" data-toggle="collapse"
data-target="#navbarCollapse" aria-controls="navbarCollapse"
aria-expanded="false" aria-label="Toggle navigation">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarCollapse">
<ul class="navbar-nav">
<!--TODO(andyk): Add class="active" attribute to li some how.-->
<li class="nav-item"><a href="index.html" class="nav-link">Overview</a></li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarQuickStart" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Programming Guides</a>
<div class="dropdown-menu" aria-labelledby="navbarQuickStart">
<a class="dropdown-item" href="quick-start.html">Quick Start</a>
<a class="dropdown-item" href="rdd-programming-guide.html">RDDs, Accumulators, Broadcasts Vars</a>
<a class="dropdown-item" href="sql-programming-guide.html">SQL, DataFrames, and Datasets</a>
<a class="dropdown-item" href="structured-streaming-programming-guide.html">Structured Streaming</a>
<a class="dropdown-item" href="streaming-programming-guide.html">Spark Streaming (DStreams)</a>
<a class="dropdown-item" href="ml-guide.html">MLlib (Machine Learning)</a>
<a class="dropdown-item" href="graphx-programming-guide.html">GraphX (Graph Processing)</a>
<a class="dropdown-item" href="sparkr.html">SparkR (R on Spark)</a>
<a class="dropdown-item" href="api/python/getting_started/index.html">PySpark (Python on Spark)</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarAPIDocs" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">API Docs</a>
<div class="dropdown-menu" aria-labelledby="navbarAPIDocs">
<a class="dropdown-item" href="api/scala/org/apache/spark/index.html">Scala</a>
<a class="dropdown-item" href="api/java/index.html">Java</a>
<a class="dropdown-item" href="api/python/index.html">Python</a>
<a class="dropdown-item" href="api/R/index.html">R</a>
<a class="dropdown-item" href="api/sql/index.html">SQL, Built-in Functions</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarDeploying" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">Deploying</a>
<div class="dropdown-menu" aria-labelledby="navbarDeploying">
<a class="dropdown-item" href="cluster-overview.html">Overview</a>
<a class="dropdown-item" href="submitting-applications.html">Submitting Applications</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="spark-standalone.html">Spark Standalone</a>
<a class="dropdown-item" href="running-on-mesos.html">Mesos</a>
<a class="dropdown-item" href="running-on-yarn.html">YARN</a>
<a class="dropdown-item" href="running-on-kubernetes.html">Kubernetes</a>
</div>
</li>
<li class="nav-item dropdown">
<a href="#" class="nav-link dropdown-toggle" id="navbarMore" role="button" data-toggle="dropdown" aria-haspopup="true" aria-expanded="false">More</a>
<div class="dropdown-menu" aria-labelledby="navbarMore">
<a class="dropdown-item" href="configuration.html">Configuration</a>
<a class="dropdown-item" href="monitoring.html">Monitoring</a>
<a class="dropdown-item" href="tuning.html">Tuning Guide</a>
<a class="dropdown-item" href="job-scheduling.html">Job Scheduling</a>
<a class="dropdown-item" href="security.html">Security</a>
<a class="dropdown-item" href="hardware-provisioning.html">Hardware Provisioning</a>
<a class="dropdown-item" href="migration-guide.html">Migration Guide</a>
<div class="dropdown-divider"></div>
<a class="dropdown-item" href="building-spark.html">Building Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/contributing.html">Contributing to Spark</a>
<a class="dropdown-item" href="https://spark.apache.org/third-party-projects.html">Third Party Projects</a>
</div>
</li>
<li class="nav-item">
<input type="text" id="docsearch-input" placeholder="Search the docs…">
</li>
</ul>
<!--<span class="navbar-text navbar-right"><span class="version-text">v3.2.3</span></span>-->
</div>
</div>
</nav>
<div class="container-wrapper">
<div class="left-menu-wrapper">
<div class="left-menu">
<h3><a href="sql-programming-guide.html">Spark SQL Guide</a></h3>
<ul>
<li>
<a href="sql-getting-started.html">
Getting Started
</a>
</li>
<li>
<a href="sql-data-sources.html">
Data Sources
</a>
</li>
<li>
<a href="sql-performance-tuning.html">
Performance Tuning
</a>
</li>
<li>
<a href="sql-distributed-sql-engine.html">
Distributed SQL Engine
</a>
</li>
<li>
<a href="sql-pyspark-pandas-with-arrow.html">
PySpark Usage Guide for Pandas with Apache Arrow
</a>
</li>
<li>
<a href="sql-migration-old.html">
Migration Guide
</a>
</li>
<li>
<a href="sql-ref.html">
SQL Reference
</a>
</li>
<ul>
<li>
<a href="sql-ref-ansi-compliance.html">
ANSI Compliance
</a>
</li>
<li>
<a href="sql-ref-datatypes.html">
Data Types
</a>
</li>
<li>
<a href="sql-ref-datetime-pattern.html">
Datetime Pattern
</a>
</li>
<li>
<a href="sql-ref-functions.html">
Functions
</a>
</li>
<li>
<a href="sql-ref-identifier.html">
Identifiers
</a>
</li>
<li>
<a href="sql-ref-literals.html">
Literals
</a>
</li>
<li>
<a href="sql-ref-null-semantics.html">
Null Semantics
</a>
</li>
<li>
<a href="sql-ref-syntax.html">
SQL Syntax
</a>
</li>
<ul>
<li>
<a href="sql-ref-syntax.html#ddl-statements">
Data Definition Statements
</a>
</li>
<li>
<a href="sql-ref-syntax.html#dml-statements">
Data Manipulation Statements
</a>
</li>
<li>
<a href="sql-ref-syntax.html#data-retrieval-statements">
Data Retrieval(Queries)
</a>
</li>
<li>
<a href="sql-ref-syntax.html#auxiliary-statements">
Auxiliary Statements
</a>
</li>
</ul>
</ul>
</ul>
</div>
</div>
<input id="nav-trigger" class="nav-trigger" checked type="checkbox">
<label for="nav-trigger"></label>
<div class="content-with-sidebar mr-3" id="content">
<h1 class="title">CREATE DATASOURCE TABLE</h1>
<h3 id="description">Description</h3>
<p>The <code class="language-plaintext highlighter-rouge">CREATE TABLE</code> statement defines a new table using a Data Source.</p>
<h3 id="syntax">Syntax</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code><span class="k">CREATE</span> <span class="k">TABLE</span> <span class="p">[</span> <span class="n">IF</span> <span class="k">NOT</span> <span class="k">EXISTS</span> <span class="p">]</span> <span class="n">table_identifier</span>
<span class="p">[</span> <span class="p">(</span> <span class="n">col_name1</span> <span class="n">col_type1</span> <span class="p">[</span> <span class="k">COMMENT</span> <span class="n">col_comment1</span> <span class="p">],</span> <span class="p">...</span> <span class="p">)</span> <span class="p">]</span>
<span class="k">USING</span> <span class="n">data_source</span>
<span class="p">[</span> <span class="k">OPTIONS</span> <span class="p">(</span> <span class="n">key1</span><span class="o">=</span><span class="n">val1</span><span class="p">,</span> <span class="n">key2</span><span class="o">=</span><span class="n">val2</span><span class="p">,</span> <span class="p">...</span> <span class="p">)</span> <span class="p">]</span>
<span class="p">[</span> <span class="n">PARTITIONED</span> <span class="k">BY</span> <span class="p">(</span> <span class="n">col_name1</span><span class="p">,</span> <span class="n">col_name2</span><span class="p">,</span> <span class="p">...</span> <span class="p">)</span> <span class="p">]</span>
<span class="p">[</span> <span class="n">CLUSTERED</span> <span class="k">BY</span> <span class="p">(</span> <span class="n">col_name3</span><span class="p">,</span> <span class="n">col_name4</span><span class="p">,</span> <span class="p">...</span> <span class="p">)</span>
<span class="p">[</span> <span class="n">SORTED</span> <span class="k">BY</span> <span class="p">(</span> <span class="n">col_name</span> <span class="p">[</span> <span class="k">ASC</span> <span class="o">|</span> <span class="k">DESC</span> <span class="p">],</span> <span class="p">...</span> <span class="p">)</span> <span class="p">]</span>
<span class="k">INTO</span> <span class="n">num_buckets</span> <span class="n">BUCKETS</span> <span class="p">]</span>
<span class="p">[</span> <span class="k">LOCATION</span> <span class="n">path</span> <span class="p">]</span>
<span class="p">[</span> <span class="k">COMMENT</span> <span class="n">table_comment</span> <span class="p">]</span>
<span class="p">[</span> <span class="n">TBLPROPERTIES</span> <span class="p">(</span> <span class="n">key1</span><span class="o">=</span><span class="n">val1</span><span class="p">,</span> <span class="n">key2</span><span class="o">=</span><span class="n">val2</span><span class="p">,</span> <span class="p">...</span> <span class="p">)</span> <span class="p">]</span>
<span class="p">[</span> <span class="k">AS</span> <span class="n">select_statement</span> <span class="p">]</span>
</code></pre></div></div>
<p>Note that, the clauses between the USING clause and the AS SELECT clause can come in
as any order. For example, you can write COMMENT table_comment after TBLPROPERTIES.</p>
<h3 id="parameters">Parameters</h3>
<ul>
<li>
<p><strong>table_identifier</strong></p>
<p>Specifies a table name, which may be optionally qualified with a database name.</p>
<p><strong>Syntax:</strong> <code class="language-plaintext highlighter-rouge">[ database_name. ] table_name</code></p>
</li>
<li>
<p><strong>USING data_source</strong></p>
<p>Data Source is the input format used to create the table. Data source can be CSV, TXT, ORC, JDBC, PARQUET, etc.</p>
</li>
<li>
<p><strong>PARTITIONED BY</strong></p>
<p>Partitions are created on the table, based on the columns specified.</p>
</li>
<li>
<p><strong>CLUSTERED BY</strong></p>
<p>Partitions created on the table will be bucketed into fixed buckets based on the column specified for bucketing.</p>
<p><strong>NOTE:</strong> Bucketing is an optimization technique that uses buckets (and bucketing columns) to determine data partitioning and avoid data shuffle.</p>
</li>
<li>
<p><strong>SORTED BY</strong></p>
<p>Specifies an ordering of bucket columns. Optionally, one can use ASC for an ascending order or DESC for a descending order after any column names in the SORTED BY clause.
If not specified, ASC is assumed by default.</p>
</li>
<li>
<p><strong>INTO num_buckets BUCKETS</strong></p>
<p>Specifies buckets numbers, which is used in <code class="language-plaintext highlighter-rouge">CLUSTERED BY</code> clause.</p>
</li>
<li>
<p><strong>LOCATION</strong></p>
<p>Path to the directory where table data is stored, which could be a path on distributed storage like HDFS, etc.</p>
</li>
<li>
<p><strong>COMMENT</strong></p>
<p>A string literal to describe the table.</p>
</li>
<li>
<p><strong>TBLPROPERTIES</strong></p>
<p>A list of key-value pairs that is used to tag the table definition.</p>
</li>
<li>
<p><strong>AS select_statement</strong></p>
<p>The table is populated using the data from the select statement.</p>
</li>
</ul>
<h3 id="data-source-interaction">Data Source Interaction</h3>
<p>A Data Source table acts like a pointer to the underlying data source. For example, you can create
a table &#8220;foo&#8221; in Spark which points to a table &#8220;bar&#8221; in MySQL using JDBC Data Source. When you
read/write table &#8220;foo&#8221;, you actually read/write table &#8220;bar&#8221;.</p>
<p>In general CREATE TABLE is creating a &#8220;pointer&#8221;, and you need to make sure it points to something
existing. An exception is file source such as parquet, json. If you don&#8217;t specify the LOCATION,
Spark will create a default table location for you.</p>
<p>For CREATE TABLE AS SELECT, Spark will overwrite the underlying data source with the data of the
input query, to make sure the table gets created contains exactly the same data as the input query.</p>
<h3 id="examples">Examples</h3>
<div class="language-sql highlighter-rouge"><div class="highlight"><pre class="highlight"><code>
<span class="c1">--Use data source</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student</span> <span class="p">(</span><span class="n">id</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">)</span> <span class="k">USING</span> <span class="n">CSV</span><span class="p">;</span>
<span class="c1">--Use data from another table</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student_copy</span> <span class="k">USING</span> <span class="n">CSV</span>
<span class="k">AS</span> <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">student</span><span class="p">;</span>
<span class="c1">--Omit the USING clause, which uses the default data source (parquet by default)</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student</span> <span class="p">(</span><span class="n">id</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">);</span>
<span class="c1">--Specify table comment and properties</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student</span> <span class="p">(</span><span class="n">id</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">)</span> <span class="k">USING</span> <span class="n">CSV</span>
<span class="k">COMMENT</span> <span class="s1">'this is a comment'</span>
<span class="n">TBLPROPERTIES</span> <span class="p">(</span><span class="s1">'foo'</span><span class="o">=</span><span class="s1">'bar'</span><span class="p">);</span>
<span class="c1">--Specify table comment and properties with different clauses order</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student</span> <span class="p">(</span><span class="n">id</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">)</span> <span class="k">USING</span> <span class="n">CSV</span>
<span class="n">TBLPROPERTIES</span> <span class="p">(</span><span class="s1">'foo'</span><span class="o">=</span><span class="s1">'bar'</span><span class="p">)</span>
<span class="k">COMMENT</span> <span class="s1">'this is a comment'</span><span class="p">;</span>
<span class="c1">--Create partitioned and bucketed table</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student</span> <span class="p">(</span><span class="n">id</span> <span class="nb">INT</span><span class="p">,</span> <span class="n">name</span> <span class="n">STRING</span><span class="p">,</span> <span class="n">age</span> <span class="nb">INT</span><span class="p">)</span>
<span class="k">USING</span> <span class="n">CSV</span>
<span class="n">PARTITIONED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">age</span><span class="p">)</span>
<span class="n">CLUSTERED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">Id</span><span class="p">)</span> <span class="k">INTO</span> <span class="mi">4</span> <span class="n">buckets</span><span class="p">;</span>
<span class="c1">--Create partitioned and bucketed table through CTAS</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student_partition_bucket</span>
<span class="k">USING</span> <span class="n">parquet</span>
<span class="n">PARTITIONED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">age</span><span class="p">)</span>
<span class="n">CLUSTERED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">id</span><span class="p">)</span> <span class="k">INTO</span> <span class="mi">4</span> <span class="n">buckets</span>
<span class="k">AS</span> <span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">student</span><span class="p">;</span>
<span class="c1">--Create bucketed table through CTAS and CTE</span>
<span class="k">CREATE</span> <span class="k">TABLE</span> <span class="n">student_bucket</span>
<span class="k">USING</span> <span class="n">parquet</span>
<span class="n">CLUSTERED</span> <span class="k">BY</span> <span class="p">(</span><span class="n">id</span><span class="p">)</span> <span class="k">INTO</span> <span class="mi">4</span> <span class="n">buckets</span> <span class="p">(</span>
<span class="k">WITH</span> <span class="n">tmpTable</span> <span class="k">AS</span> <span class="p">(</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">student</span> <span class="k">WHERE</span> <span class="n">id</span> <span class="o">&gt;</span> <span class="mi">100</span>
<span class="p">)</span>
<span class="k">SELECT</span> <span class="o">*</span> <span class="k">FROM</span> <span class="n">tmpTable</span>
<span class="p">);</span>
</code></pre></div></div>
<h3 id="related-statements">Related Statements</h3>
<ul>
<li><a href="sql-ref-syntax-ddl-create-table-hiveformat.html">CREATE TABLE USING HIVE FORMAT</a></li>
<li><a href="sql-ref-syntax-ddl-create-table-like.html">CREATE TABLE LIKE</a></li>
</ul>
</div>
<!-- /container -->
</div>
<script src="js/vendor/jquery-3.5.1.min.js"></script>
<script src="js/vendor/bootstrap.bundle.min.js"></script>
<script src="js/vendor/anchor.min.js"></script>
<script src="js/main.js"></script>
<script type="text/javascript" src="https://cdn.jsdelivr.net/npm/docsearch.js@2/dist/cdn/docsearch.min.js"></script>
<script type="text/javascript">
// DocSearch is entirely free and automated. DocSearch is built in two parts:
// 1. a crawler which we run on our own infrastructure every 24 hours. It follows every link
// in your website and extract content from every page it traverses. It then pushes this
// content to an Algolia index.
// 2. a JavaScript snippet to be inserted in your website that will bind this Algolia index
// to your search input and display its results in a dropdown UI. If you want to find more
// details on how works DocSearch, check the docs of DocSearch.
docsearch({
apiKey: 'd62f962a82bc9abb53471cb7b89da35e',
appId: 'RAI69RXRSK',
indexName: 'apache_spark',
inputSelector: '#docsearch-input',
enhancedSearchInput: true,
algoliaOptions: {
'facetFilters': ["version:3.2.3"]
},
debug: false // Set debug to true if you want to inspect the dropdown
});
</script>
<!-- MathJax Section -->
<script type="text/x-mathjax-config">
MathJax.Hub.Config({
TeX: { equationNumbers: { autoNumber: "AMS" } }
});
</script>
<script>
// Note that we load MathJax this way to work with local file (file://), HTTP and HTTPS.
// We could use "//cdn.mathjax...", but that won't support "file://".
(function(d, script) {
script = d.createElement('script');
script.type = 'text/javascript';
script.async = true;
script.onload = function(){
MathJax.Hub.Config({
tex2jax: {
inlineMath: [ ["$", "$"], ["\\\\(","\\\\)"] ],
displayMath: [ ["$$","$$"], ["\\[", "\\]"] ],
processEscapes: true,
skipTags: ['script', 'noscript', 'style', 'textarea', 'pre']
}
});
};
script.src = ('https:' == document.location.protocol ? 'https://' : 'http://') +
'cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js' +
'?config=TeX-AMS-MML_HTMLorMML';
d.getElementsByTagName('head')[0].appendChild(script);
}(document));
</script>
</body>
</html>