docs/topics/impala_create_table.xml - impala - Git at Google

 <?xml version="1.0" encoding="UTF-8"?>
 <!--
 Licensed to the Apache Software Foundation (ASF) under one
 or more contributor license agreements.  See the NOTICE file
 distributed with this work for additional information
 regarding copyright ownership.  The ASF licenses this file
 to you under the Apache License, Version 2.0 (the
 "License"); you may not use this file except in compliance
 with the License.  You may obtain a copy of the License at

   http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing,
 software distributed under the License is distributed on an
 "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 KIND, either express or implied.  See the License for the
 specific language governing permissions and limitations
 under the License.
 -->
 <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
 <concept id="create_table" outputclass="impala sql_statement">

   <title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title>

   <titlealts audience="PDF">

     <navtitle>CREATE TABLE</navtitle>

   </titlealts>

   <prolog>
     <metadata>
       <data name="Category" value="Impala"/>
       <data name="Category" value="SQL"/>
       <data name="Category" value="DDL"/>
       <data name="Category" value="Impala Data Types"/>
       <data name="Category" value="Developers"/>
       <data name="Category" value="Data Analysts"/>
       <data name="Category" value="HDFS Caching"/>
       <data name="Category" value="Tables"/>
       <data name="Category" value="Schemas"/>
       <data name="Category" value="S3"/>
       <data name="Category" value="Kudu"/>
     </metadata>
   </prolog>

   <conbody>

     <p>
       Creates a new table and specifies its characteristics. While creating a table, you
       optionally specify aspects such as:
     </p>

     <ul>
       <li>
         Whether the table is internal or external.
       </li>

       <li>
         The columns and associated data types.
       </li>

       <li>
         The columns used for physically partitioning the data.
       </li>

       <li>
         The file format for data files.
       </li>

       <li>
         The HDFS directory where the data files are located.
       </li>
     </ul>

     <p conref="../shared/impala_common.xml#common/syntax_blurb"/>

     <p>
       The general syntax for creating a table and specifying its columns is as follows:
     </p>

     <p>
       <b>Explicit column definitions:</b>
     </p>

 <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   (<varname>col_name</varname> <varname>data_type</varname>
     [<varname>constraint_specification</varname>]
     [COMMENT '<varname>col_comment</varname>']
     [, ...]
   )
   [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
   <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
   [COMMENT '<varname>table_comment</varname>']
   [ROW FORMAT <varname>row_format</varname>]
   [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
   [STORED AS <varname>file_format</varname>]
   [LOCATION '<varname>hdfs_path</varname>']
   <ph rev="1.4.0">[CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
   [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
 </codeblock>

     <p>
       <b>CREATE TABLE AS SELECT:</b>
     </p>

 <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname>
   <ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph>
   <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
   [COMMENT '<varname>table_comment</varname>']
   [ROW FORMAT <varname>row_format</varname>]
   [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
   <ph rev="">[STORED AS <varname>ctas_file_format</varname>]</ph>
   [LOCATION '<varname>hdfs_path</varname>']
   <ph rev="1.4.0">  [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
   [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
 AS
   <varname>select_statement</varname></codeblock>

 <codeblock>primitive_type:
     TINYINT
   | SMALLINT
   | INT
   | BIGINT
   | BOOLEAN
   | FLOAT
   | DOUBLE
   <ph rev="1.4.0">| DECIMAL</ph>
   | STRING
   <ph rev="2.0.0">| CHAR</ph>
   <ph rev="2.0.0">| VARCHAR</ph>
   | TIMESTAMP

 <ph rev="2.3.0">complex_type:
     struct_type
   | array_type
   | map_type

 struct_type: STRUCT &lt; <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... &gt;

 array_type: ARRAY &lt; <varname>primitive_or_complex_type</varname> &gt;

 map_type: MAP &lt; <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> &gt;
 </ph>

 constraint_specification:
   PRIMARY KEY (<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY], [<varname>foreign_key_specification</varname>, ...]

 foreign_key_specification:
   FOREIGN KEY (<varname>col_name</varname>, ...) REFERENCES table_name(<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY]

 row_format:
   DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']]
   [LINES TERMINATED BY '<varname>char</varname>']

 file_format:
     PARQUET
   | TEXTFILE
   | AVRO
   | SEQUENCEFILE
   | RCFILE

 <ph rev="">ctas_file_format:
     PARQUET
   | TEXTFILE</ph>
 </codeblock>

     <p>
       <b>Column definitions inferred from data file:</b>
     </p>

 <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'
   [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)]
   <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph>
   [COMMENT '<varname>table_comment</varname>']
   [ROW FORMAT <varname>row_format</varname>]
   [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
   [STORED AS <varname>file_format</varname>]
   [LOCATION '<varname>hdfs_path</varname>']
 <ph rev="1.4.0">  [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED]
   [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
 data_type:
     <varname>primitive_type</varname>
   | array_type
   | map_type
   | struct_type
 </codeblock>

     <p>
       <b>Internal Kudu tables:</b>
     </p>

 <codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   (<varname>col_name</varname> <varname>data_type</varname>
     <ph rev="kudu IMPALA-3719">[<varname>kudu_column_attribute</varname> ...]</ph>
     [COMMENT '<varname>col_comment</varname>']
     [, ...]
     [PRIMARY KEY (<varname>col_name</varname>[, ...])]
   )
   <ph rev="kudu">[PARTITION BY <varname>kudu_partition_clause</varname>]</ph>
   [COMMENT '<varname>table_comment</varname>']
   STORED AS KUDU
   [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
 </codeblock>

     <p rev="kudu IMPALA-3719">
       <b>Kudu column attributes:</b>
 <codeblock rev="kudu">
   PRIMARY KEY
 | [NOT] NULL
 | ENCODING <varname>codec</varname>
 | COMPRESSION <varname>algorithm</varname>
 | DEFAULT <varname>constant</varname>
 | BLOCK_SIZE <varname>number</varname>
 </codeblock>
     </p>

     <p rev="kudu IMPALA-3719">
       <b>kudu_partition_clause:</b>
 <codeblock rev="kudu">
 kudu_partition_clause ::= [ <varname>hash_clause</varname> [, ...]] [, <varname>range_clause</varname> ]

 hash_clause ::=
   HASH [ (<varname>pk_col</varname> [, ...]) ]
     PARTITIONS <varname>n</varname>

 range_clause ::=
   RANGE [ (<varname>pk_col</varname> [, ...]) ]
   (
     {
       PARTITION <varname>constant_expression</varname> <varname>range_comparison_operator</varname> VALUES <varname>range_comparison_operator</varname> <varname>constant_expression</varname>
       | PARTITION VALUE = <varname>constant_expression_or_tuple</varname>
     }
    [, ...]
   )

 range_comparison_operator ::= { &lt; | &lt;= }
 </codeblock>
     </p>

     <p>
       <b>External Kudu tables:</b>
     </p>

     <p>In Impala 3.4 and earlier, you can create an external Kudu table based on a pre-existing Kudu
       schema using the table property <codeph>'kudu.table_name'='internal_kudu_name'</codeph>. </p>

 <codeblock>CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   [COMMENT '<varname>col_comment</varname>']
   STORED AS KUDU
   [TBLPROPERTIES ('<varname>kudu.table.name</varname>'='<varname>internal_kudu_name</varname>', '<varname>key1</varname>'='<varname>value1</varname>',...)]
 </codeblock>

     <p>Alternatively, in Impala 3.4 and higher, you can also create an external Kudu table as follows:</p>

 <codeblock rev="kudu">CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   (<varname>col_name</varname> <varname>data_type</varname>
   [<varname>kudu_column_attribute</varname> ...]
   [COMMENT '<varname>col_comment</varname>']
   [, ...]
   [PRIMARY KEY (<varname>col_name</varname>[, ...])]
   )
   [PARTITION BY <varname>kudu_partition_clause</varname>]
   [COMMENT '<varname>table_comment</varname>']
   STORED AS KUDU
   [TBLPROPERTIES ('external.table.purge'='true', '<varname>key1</varname>'='<varname>value1</varname>',...)]
 </codeblock>

     <ul>
       <li>Use a Hive metastore (HMS) 3 or later.</li>
       <li>Provide column specifications to define the schema when you create the table, similar to
         creating an internal table. </li>
       <li>Omit the <codeph>kudu.table_name</codeph> table property as there is no pre-existing
         schema. </li>
       <li>Include the required <codeph>external.table.purge</codeph> property.</li>
     </ul>

     <p> Only the schema metadata is stored in HMS when you create an external table; however, using
       this create table syntax, drop table on the Kudu external table deletes the data stored
       outside HMS in Kudu as well as the metadata (schema) inside HMS. Likewise, renaming the table
       changes the name of the table in HMS and in Kudu. Kudu synchronizes changes to the actual data
       and metadata; consequently, operations such as dropping a table or altering a table name
       simulate internal table operations. </p>

     <p>
       <b>CREATE TABLE AS SELECT for Kudu tables:</b>
     </p>

 <codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname>
   [PRIMARY KEY (<varname>col_name</varname>[, ...])]
   [PARTITION BY <varname>kudu_partition_clause</varname>]
   [COMMENT '<varname>table_comment</varname>']
   STORED AS KUDU
   [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)]
 AS
   <varname>select_statement</varname></codeblock>

     <p conref="../shared/impala_common.xml#common/ddl_blurb"/>

 <!-- Should really have some info up front about all the data types and file formats.
          Consider adding here, or at least making inline links to the relevant keywords
          in the syntax spec above. -->

     <p>
       <b>Column definitions:</b>
     </p>

     <p>
       Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column
       definitions are required or not allowed.
     </p>

     <p>
       With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph>
       syntax, you do not specify the columns at all; the column names and types are derived from
       the source table, query, or data file.
     </p>

     <p> With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns, its
       name, type, optionally constraints, and optionally a comment, in addition to any columns used
       as partitioning keys. There is one exception where the column list is not required: when
       creating an Avro table with the <codeph>STORED AS AVRO</codeph> clause, you can omit the list
       of columns and specify the same metadata as part of the <codeph>TBLPROPERTIES</codeph> clause. </p>

     <p rev="3.4.0">
       <b>Constraints:</b>
     </p>
     <p>Constraints are advisory and intended for estimating cardinality during query planning in a
       future release; there is no attempt to enforce constraints. Add primary and foreign key
       information after column definitions. Do not include a constraint name; the constraint name is
       generated internally as a UUID. The following constraint states are supported: <ul
         id="ul_gbz_3kl_4kb">
         <li>DISABLE</li>
         <li>NOVALIDATE</li>
         <li>RELY</li>
       </ul>The ENABLE, VALIDATE, and NORELY options are not supported. The foreign key must be
       defined as the primary key in the referenced table. </p>
     <p> Constraint examples: <codeblock>CREATE TABLE pk(col1 INT, col2 STRING, PRIMARY KEY(col1, col2));</codeblock>
       <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id),
   FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock>
       <codeblock>CREATE TABLE pk(id INT, PRIMARY KEY(id) DISABLE, NOVALIDATE, RELY);</codeblock>
       <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id),
   FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock>
     </p>

     <p conref="../shared/impala_common.xml#common/complex_types_blurb"/>

     <p rev="2.3.0">
       The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or
       <codeph>MAP</codeph>) are available in <keyword keyref="impala23_full"/> and higher.
       Because you can nest these types (for example, to make an array of maps or a struct with
       an array field), these types are also sometimes referred to as nested types. See
       <xref href="impala_complex_types.xml#complex_types"/> for usage details.
     </p>

 <!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down
          after some of the more common clauses. -->

     <p rev="2.3.0">
       Impala can create tables containing complex type columns, with any supported file format.
       Because currently Impala can only query complex type columns in Parquet tables, creating
       tables with complex type columns and other file formats such as text is of limited use.
       For example, you might create a text table including some columns with complex types with
       Impala, and use Hive as part of your to ingest the nested type data and copy it to an
       identical Parquet table. Or you might create a partitioned table containing complex type
       columns using one file format, and use <codeph>ALTER TABLE</codeph> to change the file
       format of individual partitions to Parquet; Impala can then query only the Parquet-format
       partitions in that table.
     </p>

     <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/>

     <p>
       <b>Internal and external tables (EXTERNAL and LOCATION clauses):</b>
     </p>

     <p> By default, Impala creates an internal table, where Impala manages the underlying data files
       for the table, and physically deletes the data files when you drop the table. If you specify
       the <codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table,
       where the data files are typically produced outside Impala and queried from their original
       locations in HDFS, and Impala leaves the data files in place when you drop the table. For
       details about internal and external tables, see <xref href="impala_tables.xml#tables"/>. </p>

     <p>
       Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify
       the path to the HDFS directory where Impala reads and writes files for the table. For
       example, if your data pipeline produces Parquet files in the HDFS directory
       <filepath>/user/etl/destination</filepath>, you might create an external table as follows:
     </p>

 <codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP)
   STORED AS PARQUET LOCATION '/user/etl/destination';
 </codeblock>

     <p>
       Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often
       specified together, <codeph>LOCATION</codeph> is optional for external tables, and you can
       also specify <codeph>LOCATION</codeph> for internal tables. The difference is all about
       whether Impala <q>takes control</q> of the underlying data files and moves them when you
       rename the table, or deletes them when you drop the table. For more about internal and
       external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see
       <xref
         href="impala_tables.xml#tables"/>.
     </p>

     <p>
       <b>Partitioned tables (PARTITIONED BY clause):</b>
     </p>

     <p>
       The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from
       one or more specified columns. Impala queries can use the partition metadata to minimize
       the amount of data that is read from disk or transmitted across the network, particularly
       during join queries. For details about partitioning, see
       <xref
         href="impala_partitioning.xml#partitioning"/>.
     </p>

     <note rev="IMPALA-3719">
       <p>
         All Kudu tables require partitioning, which involves different syntax than non-Kudu
         tables. See the <codeph>PARTITION BY</codeph> clause, rather than <codeph>PARTITIONED
         BY</codeph>, for Kudu tables.
       </p>

       <p rev="IMPALA-5546">
         In <keyword keyref="impala210_full"/> and higher, the <codeph>PARTITION BY</codeph>
         clause is optional for Kudu tables. If the clause is omitted, Impala automatically
         constructs a single partition that is not connected to any column. Because such a table
         cannot take advantage of Kudu features for parallelized queries and query optimizations,
         omitting the <codeph>PARTITION BY</codeph> clause is only appropriate for small lookup
         tables.
       </p>
     </note>

     <p rev="2.5.0">
       Prior to <keyword keyref="impala25_full"/>, you could use a partitioned table as the
       source and copy data from it, but could not specify any partitioning clauses for the new
       table. In <keyword
         keyref="impala25_full"/> and higher, you can now use the
       <codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
       statement. See the examples under the following discussion of the <codeph>CREATE TABLE AS
       SELECT</codeph> syntax variation.
     </p>

     <p rev="2.9.0 IMPALA-4166">
       <b>Sorted tables (SORT BY clause):</b>
     </p>

     <p rev="2.9.0 IMPALA-4166">
       The optional <codeph>SORT BY</codeph> clause lets you specify zero or more columns that
       are sorted in the data files created by each Impala <codeph>INSERT</codeph> or
       <codeph>CREATE TABLE AS SELECT</codeph> operation. Creating data files that are sorted is
       most useful for Parquet tables, where the metadata stored inside each file includes the
       minimum and maximum values for each column in the file. (The statistics apply to each row
       group within the file; for simplicity, Impala writes a single row group in each file.)
       Grouping data values together in relatively narrow ranges within each data file makes it
       possible for Impala to quickly skip over data files that do not contain value ranges
       indicated in the <codeph>WHERE</codeph> clause of a query, and can improve the
       effectiveness of Parquet encoding and compression.
     </p>

     <p rev="2.9.0 IMPALA-4166">
       This clause is not applicable for Kudu tables or HBase tables. Although it works for other
       HDFS file formats besides Parquet, the more efficient layout is most evident with Parquet
       tables, because each Parquet data file includes statistics about the data values in that
       file.
     </p>

     <p rev="2.9.0 IMPALA-4166">
       The <codeph>SORT BY</codeph> columns cannot include any partition key columns for a
       partitioned table, because those column values are not represented in the underlying data
       files.
     </p>

     <p rev="2.9.0 IMPALA-4166">
       Because data files can arrive in Impala tables by mechanisms that do not respect the
       <codeph>SORT BY</codeph> clause, such as <codeph>LOAD DATA</codeph> or ETL tools that
       create HDFS files, Impala does not guarantee or rely on the data being sorted. The sorting
       aspect is only used to create a more efficient layout for Parquet files generated by
       Impala, which helps to optimize the processing of those Parquet files during Impala
       queries. During an <codeph>INSERT</codeph> or <codeph>CREATE TABLE AS SELECT</codeph>
       operation, the sorting occurs when the <codeph>SORT BY</codeph> clause applies to the
       destination table for the data, regardless of whether the source table has a <codeph>SORT
       BY</codeph> clause.
     </p>

     <p rev="2.9.0 IMPALA-4166">
       For example, when creating a table intended to contain census data, you might define sort
       columns such as last name and state. If a data file in this table contains a narrow range
       of last names, for example from <codeph>Smith</codeph> to <codeph>Smythe</codeph>, Impala
       can quickly detect that this data file contains no matches for a <codeph>WHERE</codeph>
       clause such as <codeph>WHERE last_name = 'Jones'</codeph> and avoid reading the entire
       file.
     </p>

 <codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE census_data (last_name STRING, first_name STRING, state STRING, address STRING)
   SORT BY (last_name, state)
   STORED AS PARQUET;
 </codeblock>

     <p rev="2.9.0 IMPALA-4166">
       Likewise, if an existing table contains data without any sort order, you can reorganize
       the data in a more efficient way by using <codeph>INSERT</codeph> or <codeph>CREATE TABLE
       AS SELECT</codeph> to copy that data into a new table with a <codeph>SORT BY</codeph>
       clause:
     </p>

 <codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE sorted_census_data
   SORT BY (last_name, state)
   STORED AS PARQUET
   AS SELECT last_name, first_name, state, address
     FROM unsorted_census_data;
 </codeblock>

     <p rev="2.9.0 IMPALA-4166">
       The metadata for the <codeph>SORT BY</codeph> clause is stored in the
       <codeph>TBLPROPERTIES</codeph> fields for the table. Other SQL engines that can
       interoperate with Impala tables, such as Hive and Spark SQL, do not recognize this
       property when inserting into a table that has a <codeph>SORT BY</codeph> clause.
     </p>

     <p>
       <b>Transactional tables:</b>
     </p>

     <p>
       In the version 3.3 and higher, when integrated with Hive 3, Impala can create, read, and
       insert into transactional tables.
     </p>

     <p>
       To create a table that supports transactions, use the <codeph>TBLPROPERTIES</codeph>
       clause and set the <codeph>'transactional'</codeph> and
       <codeph>'transactional_properties'</codeph> as below. Currently, Impala only supports
       insert-only transactional tables.
 <codeblock>TBLPROPERTIES('transactional'='true', 'transactional_properties'='insert_only')</codeblock>
     </p>

     <p>
       When integrated with Hive3 and the <codeph>DEFAULT_TRANSACTIONAL_TYPE</codeph> query
       option is set to <codeph>INSERT_ONLY</codeph>, tables are created as insert-only
       transactional table by default.
     </p>

     <p>
       Transactional tables are not supported for Kudu and HBase.
     </p>

     <p rev="kudu" conref="../shared/impala_common.xml#common/kudu_blurb"/>

     <p rev="kudu">
       Because Kudu tables do not support clauses related to HDFS and S3 data files and
       partitioning mechanisms, the syntax associated with the <codeph>STORED AS KUDU</codeph>
       clause is shown separately in the above syntax descriptions. Kudu tables have their own
       syntax for <codeph>CREATE TABLE</codeph>, <codeph>CREATE EXTERNAL TABLE</codeph>, and
       <codeph>CREATE TABLE AS SELECT</codeph>. <ph rev="IMPALA-2256">Prior to
       <keyword keyref="impala210_full"/>, all internal Kudu tables require a <codeph>PARTITION
       BY</codeph> clause, different than the <codeph>PARTITIONED BY</codeph> clause for
       HDFS-backed tables.</ph>
     </p>

     <p>
       Here are some examples of creating empty Kudu tables:
     </p>

 <codeblock>
 <ph rev="IMPALA-2256">-- Single partition. Only for <keyword keyref="impala210_full"/> and higher.
 -- Only suitable for small lookup tables.
 CREATE TABLE kudu_no_partition_by_clause
   (
     id bigint PRIMARY KEY, s STRING, b BOOLEAN
   )
   STORED AS KUDU;</ph>

 -- Single-column primary key.
 CREATE TABLE kudu_t1 (id BIGINT PRIMARY key, s STRING, b BOOLEAN)
   PARTITION BY HASH (id) PARTITIONS 20 STORED AS KUDU;

 -- Multi-column primary key.
 CREATE TABLE kudu_t2 (id BIGINT, s STRING, b BOOLEAN, PRIMARY KEY (id,s))
   PARTITION BY HASH (s) PARTITIONS 30 STORED AS KUDU;

 <![CDATA[-- Meaningful primary key column is good for range partitioning.
 CREATE TABLE kudu_t3 (id BIGINT, year INT, s STRING,
     b BOOLEAN, PRIMARY KEY (id,year))
   PARTITION BY HASH (id) PARTITIONS 20,
   RANGE (year) (PARTITION 1980 <= VALUES < 1990,
     PARTITION 1990 <= VALUES < 2000,
     PARTITION VALUE = 2001,
     PARTITION 2001 < VALUES)
   STORED AS KUDU;
 ]]>
 </codeblock>

     <p>
       Here is an example of creating an external Kudu table based on an pre-existing table
       identified by the table property: </p>

 <codeblock><![CDATA[
 -- Inherits column definitions from original table.
 -- For tables created through Impala, the kudu.table_name property
 -- comes from DESCRIBE FORMATTED output from the original table.
 CREATE EXTERNAL TABLE external_t1 STORED AS KUDU
   TBLPROPERTIES ('kudu.table_name'='kudu_tbl_created_via_api');
 ]]>
 </codeblock>

     <p>
       In Impala 3.4 and higher, by default HMS implicitly translates internal Kudu tables to
       external Kudu tables with the 'external.table.purge' property set to true. You can explicitly
       create such external Kudu tables similar to the way you create internal Kudu tables. You must
       set the table property <codeph>'external.table.purge'</codeph> to true. Here is an example of
       creating an external Kudu table:
     </p>

 <codeblock>CREATE EXTERNAL TABLE myextkudutbl (
   id int PRIMARY KEY,
   name string)
   PARTITION BY HASH PARTITIONS 8
   STORED AS KUDU
   TBLPROPERTIES ('external.table.purge'='true');
 </codeblock>
     <p>
       Operations on the resulting external table in Impala, HMS, and Kudu table metadata is
       synchronized. HMS-Kudu integration does not need to be enabled for external table
       synchronization. Such synchronized tables behave similar to internal tables. For example,
       dropping a table removes the underlying Kudu table data as well as the table metadata in HMS.
     </p>
     <p>
       If you want to drop only the HMS metadata and not drop the Kudu table, you
       can set <codeph>external.table.purge</codeph> to false, as shown in the following example:
     </p>

 <codeblock>
   ALTER TABLE myextkudutbl set tblproperties('external.table.purge'='false');
 </codeblock>

     <p>
       Here is an example of <codeph>CREATE TABLE AS SELECT</codeph> syntax for a Kudu table:
     </p>

 <codeblock><![CDATA[
 -- The CTAS statement defines the primary key and partitioning scheme.
 -- The rest of the column definitions are derived from the select list.
 CREATE TABLE ctas_t1
   PRIMARY KEY (id) PARTITION BY HASH (id) PARTITIONS 10
   STORED AS KUDU
   AS SELECT id, s FROM kudu_t1;
 ]]>
 </codeblock>

     <p rev="kudu">
       The following <codeph>CREATE TABLE</codeph> clauses are not supported for Kudu tables:
     </p>

     <ul rev="kudu">
       <li>
         <codeph>PARTITIONED BY</codeph> (Kudu tables use the clause <codeph>PARTITION
         BY</codeph> instead)
       </li>

       <li>
         <codeph>LOCATION</codeph>
       </li>

       <li>
         <codeph>ROWFORMAT</codeph>
       </li>

       <li>
         <codeph>CACHED IN | UNCACHED</codeph>
       </li>

       <li>
         <codeph>WITH SERDEPROPERTIES</codeph>
       </li>
     </ul>

     <p rev="IMPALA-3719">
       For more on the <codeph>PRIMARY KEY</codeph> clause, see
       <xref href="impala_kudu.xml#kudu_primary_key"/> and
       <xref
         href="impala_kudu.xml#kudu_primary_key_attribute"/>.
     </p>

     <p>
       For more on creating a Kudu table with a specific replication factor, see
       <xref href="impala_kudu.xml#kudu_replication_factor"/>.
     </p>

     <p rev="IMPALA-3719">
       For more on the <codeph>NULL</codeph> and <codeph>NOT NULL</codeph> attributes, see
       <xref
         href="impala_kudu.xml#kudu_not_null_attribute"/>.
     </p>

     <p rev="IMPALA-3719">
       For more on the <codeph>ENCODING</codeph> attribute, see
       <xref href="impala_kudu.xml#kudu_encoding_attribute"/>.
     </p>

     <p rev="IMPALA-3719">
       For more on the <codeph>COMPRESSION</codeph> attribute, see
       <xref href="impala_kudu.xml#kudu_compression_attribute"/>.
     </p>

     <p rev="IMPALA-3719">
       For more on the <codeph>DEFAULT</codeph> attribute, see
       <xref href="impala_kudu.xml#kudu_default_attribute"/>.
     </p>

     <p rev="IMPALA-3719">
       For more on the <codeph>BLOCK_SIZE</codeph> attribute, see
       <xref href="impala_kudu.xml#kudu_block_size_attribute"/>.
     </p>

     <p rev="kudu IMPALA-3719">
       <b>Partitioning for Kudu tables (PARTITION BY clause)</b>
     </p>

     <p rev="kudu IMPALA-3719">
       For Kudu tables, you specify logical partitioning across one or more columns using the
       <codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables,
       multiple values for a partition key column can be located in the same partition. The
       optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key
       columns into a specified number of buckets. You can use more than one
       <codeph>HASH</codeph> clause, specifying a distinct set of partition key columns for each.
       The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on a
       set of comparison operations for the partition key columns.
     </p>

     <p rev="kudu IMPALA-3719">
       Here are some examples of the <codeph>PARTITION BY HASH</codeph> syntax:
     </p>

 <codeblock rev="kudu IMPALA-3719"><![CDATA[
 -- Apply hash function to 1 primary key column.
 create table hash_t1 (x bigint, y bigint, s string, primary key (x,y))
   partition by hash (x) partitions 10
   stored as kudu;

 -- Apply hash function to a different primary key column.
 create table hash_t2 (x bigint, y bigint, s string, primary key (x,y))
   partition by hash (y) partitions 10
   stored as kudu;

 -- Apply hash function to both primary key columns.
 -- In this case, the total number of partitions is 10.
 create table hash_t3 (x bigint, y bigint, s string, primary key (x,y))
   partition by hash (x,y) partitions 10
   stored as kudu;

 -- When the column list is omitted, apply hash function to all primary key columns.
 create table hash_t4 (x bigint, y bigint, s string, primary key (x,y))
   partition by hash partitions 10
   stored as kudu;

 -- Hash the X values independently from the Y values.
 -- In this case, the total number of partitions is 10 x 20.
 create table hash_t5 (x bigint, y bigint, s string, primary key (x,y))
   partition by hash (x) partitions 10, hash (y) partitions 20
   stored as kudu;
 ]]>
 </codeblock>

     <p rev="kudu IMPALA-3719">
       Here are some examples of the <codeph>PARTITION BY RANGE</codeph> syntax:
     </p>

 <codeblock rev="kudu IMPALA-3719"><![CDATA[
 -- Create partitions that cover every possible value of X.
 -- Ranges that span multiple values use the keyword VALUES between
 -- a pair of < and <= comparisons.
 create table range_t1 (x bigint, s string, s2 string, primary key (x, s))
   partition by range (x)
     (
       partition 0 <= values <= 49, partition 50 <= values <= 100,
       partition values < 0, partition 100 < values
     )
   stored as kudu;

 -- Create partitions that cover some possible values of X.
 -- Values outside the covered range(s) are rejected.
 -- New range partitions can be added through ALTER TABLE.
 create table range_t2 (x bigint, s string, s2 string, primary key (x, s))
   partition by range (x)
     (
       partition 0 <= values <= 49, partition 50 <= values <= 100
     )
   stored as kudu;

 -- A range can also specify a single specific value, using the keyword VALUE
 -- with an = comparison.
 create table range_t3 (x bigint, s string, s2 string, primary key (x, s))
   partition by range (s)
     (
       partition value = 'Yes', partition value = 'No', partition value = 'Maybe'
     )
   stored as kudu;

 -- Using multiple columns in the RANGE clause and tuples inside the partition spec
 -- only works for partitions specified with the VALUE= syntax.
 create table range_t4 (x bigint, s string, s2 string, primary key (x, s))
   partition by range (x,s)
     (
       partition value = (0,'zero'), partition value = (1,'one'), partition value = (2,'two')
     )
   stored as kudu;
 ]]>
 </codeblock>

     <p rev="kudu IMPALA-3719">
       Here are some examples combining both <codeph>HASH</codeph> and <codeph>RANGE</codeph>
       syntax for the <codeph>PARTITION BY</codeph> clause:
     </p>

 <codeblock rev="kudu IMPALA-3719"><![CDATA[
 -- Values from each range partition are hashed into 10 associated buckets.
 -- Total number of partitions in this case is 10 x 2.
 create table combined_t1 (x bigint, s string, s2 string, primary key (x, s))
   partition by hash (x) partitions 10, range (x)
     (
       partition 0 <= values <= 49, partition 50 <= values <= 100
     )
   stored as kudu;

 -- The hash partitioning and range partitioning can apply to different columns.
 -- But all the columns used in either partitioning scheme must be from the primary key.
 create table combined_t2 (x bigint, s string, s2 string, primary key (x, s))
   partition by hash (s) partitions 10, range (x)
     (
       partition 0 <= values <= 49, partition 50 <= values <= 100
     )
   stored as kudu;
 ]]>
 </codeblock>

     <p rev="kudu IMPALA-3719">
       For more usage details and examples of the Kudu partitioning syntax, see
       <xref keyref="impala_kudu"/>.
     </p>

     <p>
       <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b>
     </p>

     <p rev="DOCS-1523">
       The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files.
       Currently, Impala can query more types of file formats than it can create or insert into.
       Use Hive to perform any create or data load operations that are not currently available in
       Impala. For example, Impala can create an Avro, SequenceFile, or RCFile table but cannot
       insert data into it. There are also Impala-specific procedures for using compression with
       each kind of file format. For details about working with data files of various formats,
       see <xref href="impala_file_formats.xml#file_formats"/>.
     </p>

     <note>
       In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing
       the <codeph>CREATE TABLE</codeph> statement in Hive. See
       <xref href="impala_avro.xml#avro"/> for details and examples.
     </note>

     <p>
       By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala
       tables are created as text files with Ctrl-A (hex 01) characters as the delimiter.
 <!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? -->
       Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files
       that use a different delimiter character such as tab or <codeph>|</codeph>, or a different
       line end character such as carriage return or newline. When specifying delimiter and line
       end characters with the <codeph>FIELDS TERMINATED BY</codeph> and <codeph>LINES TERMINATED
       BY</codeph> clauses, use <codeph>'\t'</codeph> for tab, <codeph>'\n'</codeph> for newline
       or linefeed, <codeph>'\r'</codeph> for carriage return, and
       <codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more
       examples of text tables, see <xref href="impala_txtfile.xml#txtfile"/>.
     </p>

     <p>
       The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through
       an <codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to
       existing data files that you put into an Impala table directory. (You can ingest existing
       data files either by creating the table with <codeph>CREATE EXTERNAL TABLE ...
       LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or through an HDFS operation
       such as <codeph>hdfs dfs -put <varname>file</varname>
       <varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used
       anywhere else in the file, and put it in front of each instance of the delimiter character
       that occurs within a field value. Surrounding field values with quotation marks does not
       help Impala to parse fields with embedded delimiter characters; the quotation marks are
       considered to be part of the column value. If you want to use <codeph>\</codeph> as the
       escape character, specify the clause in <cmdname>impala-shell</cmdname> as <codeph>ESCAPED
       BY '\\'</codeph>.
     </p>

     <note conref="../shared/impala_common.xml#common/thorn"/>

     <p>
       <b>Cloning tables (LIKE clause):</b>
     </p>

     <p>
       To create an empty table with the same columns, comments, and other attributes as another
       table, use the following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows
       a restricted set of clauses, currently only the <codeph>LOCATION</codeph>,
       <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph> clauses.
     </p>

 <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname>
   <ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph>
   [COMMENT '<varname>table_comment</varname>']
   [STORED AS <varname>file_format</varname>]
   [LOCATION '<varname>hdfs_path</varname>']</codeblock>

     <note rev="1.2.0">
       <p rev="1.2.0">
         To clone the structure of a table and transfer data into it in a single operation, use
         the <codeph>CREATE TABLE AS SELECT</codeph> syntax described in the next subsection.
       </p>
     </note>

     <p>
       When you clone the structure of an existing table using the <codeph>CREATE TABLE ...
       LIKE</codeph> syntax, the new table keeps the same file format as the original one, so you
       only need to specify the <codeph>STORED AS</codeph> clause if you want to use a different
       file format, or when specifying a view as the original table. (Creating a table
       <q>like</q> a view produces a text table by default.)
     </p>

     <p>
       Although normally Impala cannot create an HBase table directly, Impala can clone the
       structure of an existing HBase table with the <codeph>CREATE TABLE ... LIKE</codeph>
       syntax, preserving the file format and metadata from the original table.
     </p>

     <p>
       There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph>
       with an Avro table. For example, you cannot use this technique for an Avro table that is
       specified with an Avro schema but no columns. When in doubt, check if a <codeph>CREATE
       TABLE ... LIKE</codeph> operation works in Hive; if not, it typically will not work in
       Impala either.
     </p>

     <p>
       If the original table is partitioned, the new table inherits the same partition key
       columns. Because the new table is initially empty, it does not inherit the actual
       partitions that exist in the original one. To create partitions in the new table, insert
       data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements.
     </p>

     <p conref="../shared/impala_common.xml#common/create_table_like_view"/>

     <p>
       Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the
       physical data of the table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward
       to copy any data from the original table into the new one, optionally converting the data
       to a new file format. (For some file formats, Impala can do a <codeph>CREATE TABLE ...
       LIKE</codeph> to create the table, but Impala cannot insert data in that file format; in
       these cases, you must load the data in Hive. See
       <xref
         href="impala_file_formats.xml#file_formats"/> for details.)
     </p>

     <p rev="1.2" id="ctas">
       <b>CREATE TABLE AS SELECT:</b>
     </p>

     <p>
       The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a
       table based on column definitions from another table, and copy data from the source table
       to the destination table without issuing any separate <codeph>INSERT</codeph> statement.
       This idiom is so popular that it has its own acronym, <q>CTAS</q>.
     </p>

     <p>
       The following examples show how to copy data from a source table <codeph>T1</codeph> to a
       variety of destinations tables, applying various transformations to the table properties,
       table layout, or the data itself as part of the operation:
     </p>

 <codeblock>
 -- Sample table to be the source of CTAS operations.
 CREATE TABLE t1 (x INT, y STRING);
 INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three');

 -- Clone all the columns and data from one table to another.
 CREATE TABLE clone_of_t1 AS SELECT * FROM t1;
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 3 row(s) |
 +-------------------+

 -- Clone the columns and data, and convert the data to a different file format.
 CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1;
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 3 row(s) |
 +-------------------+

 -- Copy only some rows to the new table.
 CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2;
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 2 row(s) |
 +-------------------+

 -- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data.
 CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0;
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 0 row(s) |
 +-------------------+

 -- Reorder and rename columns and transform the data.
 CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1;
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 3 row(s) |
 +-------------------+
 SELECT * FROM t5;
 +-------+---+---------------------+
 | s     | a | n                   |
 +-------+---+---------------------+
 | ONE   | 2 | Entirely new column |
 | TWO   | 3 | Entirely new column |
 | THREE | 4 | Entirely new column |
 +-------+---+---------------------+
 </codeblock>

 <!-- These are a little heavyweight to get into here. Therefore commenting out.
      Some overlap with the new column-changing examples in the code listing above.
 Create tables with different column order, names, or types than the original.
 CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1;
 CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1;
 CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock>
 -->

 <!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. -->

     <p rev="1.2">
       See <xref href="impala_select.xml#select"/> for details about query syntax for the
       <codeph>SELECT</codeph> portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement.
     </p>

     <p rev="1.2">
       The newly created table inherits the column names that you select from the original table,
       which you can override by specifying column aliases in the query. Any column or table
       comments from the original table are not carried over to the new table.
     </p>

     <note rev="DOCS-1523">
       When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS
       SELECT</codeph> statement, the destination table must be a file format that Impala can
       write to: currently, text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile
       table as the destination table for a CTAS operation.
     </note>

     <p rev="2.5.0">
       Prior to <keyword keyref="impala25_full"/> you could use a partitioned table as the source
       and copy data from it, but could not specify any partitioning clauses for the new table.
       In <keyword
         keyref="impala25_full"/> and higher, you can now use the
       <codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph>
       statement. The following example demonstrates how you can copy data from an unpartitioned
       table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new partitioned
       table in the process. The main syntax consideration is the column order in the
       <codeph>PARTITIONED BY</codeph> clause and the select list: the partition key columns must
       be listed last in the select list, in the same order as in the <codeph>PARTITIONED
       BY</codeph> clause. Therefore, in this case, the column order in the destination table is
       different from the source table. You also only specify the column names in the
       <codeph>PARTITIONED BY</codeph> clause, not the data types or column comments.
     </p>

 <codeblock rev="2.5.0">
 create table partitions_no (year smallint, month tinyint, s string);
 insert into partitions_no values (2016, 1, 'January 2016'),
   (2016, 2, 'February 2016'), (2016, 3, 'March 2016');

 -- Prove that the source table is not partitioned.
 show partitions partitions_no;
 ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no

 -- Create new table with partitions based on column values from source table.
 <b>create table partitions_yes partitioned by (year, month)
   as select s, year, month from partitions_no;</b>
 +-------------------+
 | summary           |
 +-------------------+
 | Inserted 3 row(s) |
 +-------------------+

 -- Prove that the destination table is partitioned.
 show partitions partitions_yes;
 +-------+-------+-------+--------+------+...
 | year  | month | #Rows | #Files | Size |...
 +-------+-------+-------+--------+------+...
 | 2016  | 1     | -1    | 1      | 13B  |...
 | 2016  | 2     | -1    | 1      | 14B  |...
 | 2016  | 3     | -1    | 1      | 11B  |...
 | Total |       | -1    | 3      | 38B  |...
 +-------+-------+-------+--------+------+...
 </codeblock>

     <p rev="2.5.0">
       The most convenient layout for partitioned tables is with all the partition key columns at
       the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax requires that column order in the
       select list, resulting in that same column order in the destination table.
     </p>

 <codeblock rev="2.5.0">
 describe partitions_no;
 +-------+----------+---------+
 | name  | type     | comment |
 +-------+----------+---------+
 | year  | smallint |         |
 | month | tinyint  |         |
 | s     | string   |         |
 +-------+----------+---------+

 -- The CTAS operation forced us to put the partition key columns last.
 -- Having those columns last works better with idioms such as SELECT *
 -- for partitioned tables.
 describe partitions_yes;
 +-------+----------+---------+
 | name  | type     | comment |
 +-------+----------+---------+
 | s     | string   |         |
 | year  | smallint |         |
 | month | tinyint  |         |
 +-------+----------+---------+
 </codeblock>

     <p rev="2.5.0">
       Attempting to use a select list with the partition key columns not at the end results in
       an error due to a column name mismatch:
     </p>

 <codeblock rev="2.5.0">
 -- We expect this CTAS to fail because non-key column S
 -- comes after key columns YEAR and MONTH in the select list.
 create table partitions_maybe partitioned by (year, month)
   as select year, month, s from partitions_no;
 ERROR: AnalysisException: Partition column name mismatch: year != month
 </codeblock>

     <p rev="1.2">
       For example, the following statements show how you can clone all the data in a table, or a
       subset of the columns and/or rows, or reorder columns, rename them, or construct them out
       of expressions:
     </p>

     <p rev="1.2">
       As part of a CTAS operation, you can convert the data to any file format that Impala can
       write (currently, <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot
       specify the lower-level properties of a text table, such as the delimiter.
     </p>

     <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/>

     <p rev="1.4.0">
       <b>CREATE TABLE LIKE PARQUET:</b>
     </p>

     <p rev="1.4.0">
       The variation <codeph>CREATE TABLE ... LIKE PARQUET
       '<varname>hdfs_path_of_parquet_file</varname>'</codeph> lets you skip the column
       definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and data
       types are automatically configured based on the organization of the specified Parquet data
       file, which must already reside in HDFS. You can use a data file located outside the
       Impala database directories, or a file from an existing Impala Parquet table; either way,
       Impala only uses the column definitions from the file and does not use the HDFS location
       for the <codeph>LOCATION</codeph> attribute of the new table. (Although you can also
       specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use
       the same schema as the data file and point the Impala table at the associated directory
       for querying.)
     </p>

     <p rev="1.4.0">
       The following considerations apply when you use the <codeph>CREATE TABLE LIKE
       PARQUET</codeph> technique:
     </p>

     <ul rev="1.4.0">
       <li>
         Any column comments from the original table are not preserved in the new table. Each
         column in the new table has a comment stating the low-level Parquet field type used to
         deduce the appropriate SQL column type.
       </li>

       <li>
         If you use a data file from a partitioned Impala table, any partition key columns from
         the original table are left out of the new table, because they are represented in HDFS
         directory names rather than stored in the data file. To preserve the partition
         information, repeat the same <codeph>PARTITION</codeph> clause as in the original
         <codeph>CREATE TABLE</codeph> statement.
       </li>

       <li>
         The file format of the new table defaults to text, as with other kinds of <codeph>CREATE
         TABLE</codeph> statements. To make the new table also use Parquet format, include the
         clause <codeph>STORED AS PARQUET</codeph> in the <codeph>CREATE TABLE LIKE
         PARQUET</codeph> statement.
       </li>

       <li>
         If the Parquet data file comes from an existing Impala table, currently, any
         <codeph>TINYINT</codeph> or <codeph>SMALLINT</codeph> columns are turned into
         <codeph>INT</codeph> columns in the new table. Internally, Parquet stores such values as
         32-bit integers.
       </li>

       <li>
         When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS
         SELECT</codeph> and <codeph>INSERT ... SELECT</codeph> statements always create at least
         one data file, even if the <codeph>SELECT</codeph> part of the statement does not match
         any rows. You can use such an empty Parquet data file as a template for subsequent
         <codeph>CREATE TABLE LIKE PARQUET</codeph> statements.
       </li>
     </ul>

     <p>
       For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE
       LIKE PARQUET</codeph> syntax, see <xref
         href="impala_parquet.xml#parquet"/>.
     </p>

     <p>
       <b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b>
     </p>

     <p rev="1.2">
       You can associate arbitrary items of metadata with a table by specifying the
       <codeph>TBLPROPERTIES</codeph> clause. This clause takes a comma-separated list of
       key-value pairs and stores those items in the metastore database. You can also change the
       table properties later with an <codeph>ALTER TABLE</codeph> statement. You can observe the
       table properties for different delimiter and escape characters using the <codeph>DESCRIBE
       FORMATTED</codeph> command, and change those settings for an existing table with
       <codeph>ALTER TABLE ... SET TBLPROPERTIES</codeph>.
     </p>

     <p rev="1.2">
       You can also associate SerDes properties with the table by specifying key-value pairs
       through the <codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by
       Impala, which has its own built-in serializer and deserializer for the file formats it
       supports. Particular property values might be needed for Hive compatibility with certain
       variations of file formats, particularly Avro.
     </p>

     <p>
       Some DDL operations that interact with other Hadoop components require specifying
       particular values in the <codeph>SERDEPROPERTIES</codeph> or
       <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or an HBase table.
       (You typically create HBase tables in Hive, because they require additional clauses not
       currently available in Impala.)
 <!-- Haven't got a working example from Lenni, so suppressing this recommendation for now.
         The Avro schema properties can be specified through either
         <codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>;
         for best compatibility with future versions of Hive,
         use <codeph>SERDEPROPERTIES</codeph> in this case.
 -->
     </p>

     <p>
       To see the column definitions and column comments for an existing table, for example
       before issuing a <codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS
       SELECT</codeph> statement, issue the statement <codeph>DESCRIBE
       <varname>table_name</varname></codeph>. To see even more detail, such as the location of
       data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and
       <codeph>STORED AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED
       <varname>table_name</varname></codeph>. <codeph>DESCRIBE FORMATTED</codeph> is also needed
       to see any overall table comment (as opposed to individual column comments).
     </p>

     <p>
       After creating a table, your <cmdname>impala-shell</cmdname> session or another
       <cmdname>impala-shell</cmdname> connected to the same node can immediately query that
       table. There might be a brief interval (one statestore heartbeat) before the table can be
       queried through a different Impala node. To make the <codeph>CREATE TABLE</codeph>
       statement return only when the table is recognized by all Impala nodes in the cluster,
       enable the <codeph>SYNC_DDL</codeph> query option.
     </p>

     <p rev="1.4.0">
       <b>HDFS caching (CACHED IN clause):</b>
     </p>

     <p rev="1.4.0">
       If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in
       the table directory or the partition subdirectories are designated to be loaded into
       memory with the HDFS caching mechanism. See
       <xref
         href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using
       the HDFS caching feature.
     </p>

     <p
       conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/>

 <!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. -->

     <p>
       <b>Column order</b>:
     </p>

     <p>
       If you intend to use the table to hold data files produced by some external source,
       specify the columns in the same order as they appear in the data files.
     </p>

     <p>
       If you intend to insert or copy data into the table through Impala, or if you have control
       over the way externally produced data files are arranged, use your judgment to specify
       columns in the most convenient order:
     </p>

     <ul>
       <li>
         <p>
           If certain columns are often <codeph>NULL</codeph>, specify those columns last. You
           might produce data files that omit these trailing columns entirely. Impala
           automatically fills in the <codeph>NULL</codeph> values if so.
         </p>
       </li>

       <li>
         <p>
           If an unpartitioned table will be used as the source for an <codeph>INSERT ...
           SELECT</codeph> operation into a partitioned table, specify last in the unpartitioned
           table any columns that correspond to partition key columns in the partitioned table,
           and in the same order as the partition key columns are declared in the partitioned
           table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when copying
           data to the partitioned table, rather than specifying each column name individually.
         </p>
       </li>

       <li>
         <p>
           If you specify columns in an order that you later discover is suboptimal, you can
           sometimes work around the problem without recreating the table. You can create a view
           that selects columns from the original table in a permuted order, then do a
           <codeph>SELECT *</codeph> from the view. When inserting data into a table, you can
           specify a permuted order for the inserted columns to match the order in the
           destination table.
         </p>
       </li>
     </ul>

     <p conref="../shared/impala_common.xml#common/hive_blurb"/>

     <p>
       Impala queries can make use of metadata about the table and columns, such as the number of
       rows in a table or the number of different values in a column. Prior to Impala 1.2.2, to
       create this metadata, you issued the <codeph>ANALYZE TABLE</codeph> statement in Hive to
       gather this information, after creating the table and loading representative data into it.
       In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement produces these
       statistics within Impala, without needing to use Hive at all.
     </p>

     <p conref="../shared/impala_common.xml#common/hbase_blurb"/>

     <note>
       <p>
         The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because
         it currently does not support the <codeph>STORED BY</codeph> clause needed for HBase
         tables. Create such tables in Hive, then query them through Impala. For information on
         using Impala with HBase tables, see <xref href="impala_hbase.xml#impala_hbase"/>.
       </p>
     </note>

     <p conref="../shared/impala_common.xml#common/s3_blurb"/>

     <p rev="2.2.0">
       To create a table where the data resides in the Amazon Simple Storage Service (S3),
       specify a <codeph>s3a://</codeph> prefix <codeph>LOCATION</codeph> attribute pointing to
       the data files in S3.
     </p>

     <p rev="2.6.0 IMPALA-1878">
       In <keyword keyref="impala26_full"/> and higher, you can use this special
       <codeph>LOCATION</codeph> syntax as part of a <codeph>CREATE TABLE AS SELECT</codeph>
       statement.
     </p>

     <p conref="../shared/impala_common.xml#common/s3_ddl"/>

     <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/>

     <p conref="../shared/impala_common.xml#common/hdfs_blurb"/>

     <p>
       The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in
       HDFS. The <codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an
       existing HDFS directory, and does not create any new directory in HDFS. To locate the HDFS
       data directory for a table, issue a <codeph>DESCRIBE FORMATTED
       <varname>table</varname></codeph> statement. To examine the contents of that HDFS
       directory, use an OS command such as <codeph>hdfs dfs -ls
       hdfs://<varname>path</varname></codeph>, either from the OS command line or through the
       <codeph>shell</codeph> or <codeph>!</codeph> commands in <cmdname>impala-shell</cmdname>.
     </p>

     <p>
       The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data
       directory to hold any data copied by the <codeph>INSERT</codeph> portion of the statement.
       (Even if no data is copied, Impala might create one or more empty data files.)
     </p>

     <p conref="../shared/impala_common.xml#common/permissions_blurb"/>

     <p rev="">
       The user ID that the <cmdname>impalad</cmdname> daemon runs under, typically the
       <codeph>impala</codeph> user, must have both execute and write permission for the database
       directory where the table is being created.
     </p>

     <p conref="../shared/impala_common.xml#common/security_blurb"/>

     <p conref="../shared/impala_common.xml#common/redaction_yes"/>

     <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/>

     <p conref="../shared/impala_common.xml#common/related_info"/>

     <p>
       <xref href="impala_tables.xml#tables"/>,
       <xref
         href="impala_alter_table.xml#alter_table"/>,
       <xref
         href="impala_drop_table.xml#drop_table"/>,
       <xref
         href="impala_partitioning.xml#partitioning"/>,
       <xref
         href="impala_tables.xml#internal_tables"/>,
       <xref
         href="impala_tables.xml#external_tables"/>,
       <xref
         href="impala_compute_stats.xml#compute_stats"/>,
       <xref
         href="impala_sync_ddl.xml#sync_ddl"/>,
       <xref
         href="impala_show.xml#show_tables"/>,
       <xref
         href="impala_show.xml#show_create_table"/>,
       <xref
         href="impala_describe.xml#describe"/>
     </p>

   </conbody>

 </concept>