| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> |
| <concept id="create_table" outputclass="impala sql_statement"> |
| |
| <title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title> |
| |
| <titlealts audience="PDF"> |
| |
| <navtitle>CREATE TABLE</navtitle> |
| |
| </titlealts> |
| |
| <prolog> |
| <metadata> |
| <data name="Category" value="Impala"/> |
| <data name="Category" value="SQL"/> |
| <data name="Category" value="DDL"/> |
| <data name="Category" value="Impala Data Types"/> |
| <data name="Category" value="Developers"/> |
| <data name="Category" value="Data Analysts"/> |
| <data name="Category" value="HDFS Caching"/> |
| <data name="Category" value="Tables"/> |
| <data name="Category" value="Schemas"/> |
| <data name="Category" value="S3"/> |
| <data name="Category" value="Kudu"/> |
| </metadata> |
| </prolog> |
| |
| <conbody> |
| |
| <p> |
| Creates a new table and specifies its characteristics. While creating a table, you |
| optionally specify aspects such as: |
| </p> |
| |
| <ul> |
| <li> |
| Whether the table is internal or external. |
| </li> |
| |
| <li> |
| The columns and associated data types. |
| </li> |
| |
| <li> |
| The columns used for physically partitioning the data. |
| </li> |
| |
| <li> |
| The file format for data files. |
| </li> |
| |
| <li> |
| The HDFS directory where the data files are located. |
| </li> |
| </ul> |
| |
| <p conref="../shared/impala_common.xml#common/syntax_blurb"/> |
| |
| <p> |
| The general syntax for creating a table and specifying its columns is as follows: |
| </p> |
| |
| <p> |
| <b>Explicit column definitions:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| (<varname>col_name</varname> <varname>data_type</varname> |
| [<varname>constraint_specification</varname>] |
| [COMMENT '<varname>col_comment</varname>'] |
| [, ...] |
| ) |
| [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] |
| <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [ROW FORMAT <varname>row_format</varname>] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| [STORED AS <varname>file_format</varname>] |
| [LOCATION '<varname>hdfs_path</varname>'] |
| <ph rev="1.4.0">[CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| </codeblock> |
| |
| <p> |
| <b>CREATE TABLE AS SELECT:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname> |
| <ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph> |
| <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [ROW FORMAT <varname>row_format</varname>] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| <ph rev="">[STORED AS <varname>ctas_file_format</varname>]</ph> |
| [LOCATION '<varname>hdfs_path</varname>'] |
| <ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| AS |
| <varname>select_statement</varname></codeblock> |
| |
| <codeblock>primitive_type: |
| TINYINT |
| | SMALLINT |
| | INT |
| | BIGINT |
| | BOOLEAN |
| | FLOAT |
| | DOUBLE |
| <ph rev="1.4.0">| DECIMAL</ph> |
| | STRING |
| <ph rev="2.0.0">| CHAR</ph> |
| <ph rev="2.0.0">| VARCHAR</ph> |
| | TIMESTAMP |
| |
| <ph rev="2.3.0">complex_type: |
| struct_type |
| | array_type |
| | map_type |
| |
| struct_type: STRUCT < <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... > |
| |
| array_type: ARRAY < <varname>primitive_or_complex_type</varname> > |
| |
| map_type: MAP < <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> > |
| </ph> |
| |
| constraint_specification: |
| PRIMARY KEY (<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY], [<varname>foreign_key_specification</varname>, ...] |
| |
| foreign_key_specification: |
| FOREIGN KEY (<varname>col_name</varname>, ...) REFERENCES table_name(<varname>col_name</varname>, ...) [DISABLE] [NOVALIDATE] [RELY] |
| |
| row_format: |
| DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']] |
| [LINES TERMINATED BY '<varname>char</varname>'] |
| |
| file_format: |
| PARQUET |
| | TEXTFILE |
| | AVRO |
| | SEQUENCEFILE |
| | RCFILE |
| |
| <ph rev="">ctas_file_format: |
| PARQUET |
| | TEXTFILE</ph> |
| </codeblock> |
| |
| <p> |
| <b>Column definitions inferred from data file:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>' |
| [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] |
| <ph rev="2.9.0 IMPALA-4166">[SORT BY ([<varname>column</varname> [, <varname>column</varname> ...]])]</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [ROW FORMAT <varname>row_format</varname>] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| [STORED AS <varname>file_format</varname>] |
| [LOCATION '<varname>hdfs_path</varname>'] |
| <ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| data_type: |
| <varname>primitive_type</varname> |
| | array_type |
| | map_type |
| | struct_type |
| </codeblock> |
| |
| <p> |
| <b>Internal Kudu tables:</b> |
| </p> |
| |
| <codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| (<varname>col_name</varname> <varname>data_type</varname> |
| <ph rev="kudu IMPALA-3719">[<varname>kudu_column_attribute</varname> ...]</ph> |
| [COMMENT '<varname>col_comment</varname>'] |
| [, ...] |
| [PRIMARY KEY (<varname>col_name</varname>[, ...])] |
| ) |
| <ph rev="kudu">[PARTITION BY <varname>kudu_partition_clause</varname>]</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| STORED AS KUDU |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| </codeblock> |
| |
| <p rev="kudu IMPALA-3719"> |
| <b>Kudu column attributes:</b> |
| <codeblock rev="kudu"> |
| PRIMARY KEY |
| | [NOT] NULL |
| | ENCODING <varname>codec</varname> |
| | COMPRESSION <varname>algorithm</varname> |
| | DEFAULT <varname>constant</varname> |
| | BLOCK_SIZE <varname>number</varname> |
| </codeblock> |
| </p> |
| |
| <p rev="kudu IMPALA-3719"> |
| <b>kudu_partition_clause:</b> |
| <codeblock rev="kudu"> |
| kudu_partition_clause ::= [ <varname>hash_clause</varname> [, ...]] [, <varname>range_clause</varname> ] |
| |
| hash_clause ::= |
| HASH [ (<varname>pk_col</varname> [, ...]) ] |
| PARTITIONS <varname>n</varname> |
| |
| range_clause ::= |
| RANGE [ (<varname>pk_col</varname> [, ...]) ] |
| ( |
| { |
| PARTITION <varname>constant_expression</varname> <varname>range_comparison_operator</varname> VALUES <varname>range_comparison_operator</varname> <varname>constant_expression</varname> |
| | PARTITION VALUE = <varname>constant_expression_or_tuple</varname> |
| } |
| [, ...] |
| ) |
| |
| range_comparison_operator ::= { < | <= } |
| </codeblock> |
| </p> |
| |
| <p> |
| <b>External Kudu tables:</b> |
| </p> |
| |
| <p>In Impala 3.4 and earlier, you can create an external Kudu table based on a pre-existing Kudu |
| schema using the table property <codeph>'kudu.table_name'='internal_kudu_name'</codeph>. </p> |
| |
| <codeblock>CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| [COMMENT '<varname>col_comment</varname>'] |
| STORED AS KUDU |
| [TBLPROPERTIES ('<varname>kudu.table.name</varname>'='<varname>internal_kudu_name</varname>', '<varname>key1</varname>'='<varname>value1</varname>',...)] |
| </codeblock> |
| |
| <p>Alternatively, in Impala 3.4 and higher, you can also create an external Kudu table as follows:</p> |
| |
| <codeblock rev="kudu">CREATE EXTERNAL TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| (<varname>col_name</varname> <varname>data_type</varname> |
| [<varname>kudu_column_attribute</varname> ...] |
| [COMMENT '<varname>col_comment</varname>'] |
| [, ...] |
| [PRIMARY KEY (<varname>col_name</varname>[, ...])] |
| ) |
| [PARTITION BY <varname>kudu_partition_clause</varname>] |
| [COMMENT '<varname>table_comment</varname>'] |
| STORED AS KUDU |
| [TBLPROPERTIES ('external.table.purge'='true', '<varname>key1</varname>'='<varname>value1</varname>',...)] |
| </codeblock> |
| |
| <ul> |
| <li>Use a Hive metastore (HMS) 3 or later.</li> |
| <li>Provide column specifications to define the schema when you create the table, similar to |
| creating an internal table. </li> |
| <li>Omit the <codeph>kudu.table_name</codeph> table property as there is no pre-existing |
| schema. </li> |
| <li>Include the required <codeph>external.table.purge</codeph> property.</li> |
| </ul> |
| |
| <p> Only the schema metadata is stored in HMS when you create an external table; however, using |
| this create table syntax, drop table on the Kudu external table deletes the data stored |
| outside HMS in Kudu as well as the metadata (schema) inside HMS. Likewise, renaming the table |
| changes the name of the table in HMS and in Kudu. Kudu synchronizes changes to the actual data |
| and metadata; consequently, operations such as dropping a table or altering a table name |
| simulate internal table operations. </p> |
| |
| <p> |
| <b>CREATE TABLE AS SELECT for Kudu tables:</b> |
| </p> |
| |
| <codeblock rev="kudu">CREATE TABLE [IF NOT EXISTS] <varname>db_name</varname>.]<varname>table_name</varname> |
| [PRIMARY KEY (<varname>col_name</varname>[, ...])] |
| [PARTITION BY <varname>kudu_partition_clause</varname>] |
| [COMMENT '<varname>table_comment</varname>'] |
| STORED AS KUDU |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| AS |
| <varname>select_statement</varname></codeblock> |
| |
| <p conref="../shared/impala_common.xml#common/ddl_blurb"/> |
| |
| <!-- Should really have some info up front about all the data types and file formats. |
| Consider adding here, or at least making inline links to the relevant keywords |
| in the syntax spec above. --> |
| |
| <p> |
| <b>Column definitions:</b> |
| </p> |
| |
| <p> |
| Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column |
| definitions are required or not allowed. |
| </p> |
| |
| <p> |
| With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph> |
| syntax, you do not specify the columns at all; the column names and types are derived from |
| the source table, query, or data file. |
| </p> |
| |
| <p> With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns, its |
| name, type, optionally constraints, and optionally a comment, in addition to any columns used |
| as partitioning keys. There is one exception where the column list is not required: when |
| creating an Avro table with the <codeph>STORED AS AVRO</codeph> clause, you can omit the list |
| of columns and specify the same metadata as part of the <codeph>TBLPROPERTIES</codeph> clause. </p> |
| |
| <p rev="3.4.0"> |
| <b>Constraints:</b> |
| </p> |
| <p>Constraints are advisory and intended for estimating cardinality during query planning in a |
| future release; there is no attempt to enforce constraints. Add primary and foreign key |
| information after column definitions. Do not include a constraint name; the constraint name is |
| generated internally as a UUID. The following constraint states are supported: <ul |
| id="ul_gbz_3kl_4kb"> |
| <li>DISABLE</li> |
| <li>NOVALIDATE</li> |
| <li>RELY</li> |
| </ul>The ENABLE, VALIDATE, and NORELY options are not supported. The foreign key must be |
| defined as the primary key in the referenced table. </p> |
| <p> Constraint examples: <codeblock>CREATE TABLE pk(col1 INT, col2 STRING, PRIMARY KEY(col1, col2));</codeblock> |
| <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id), |
| FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock> |
| <codeblock>CREATE TABLE pk(id INT, PRIMARY KEY(id) DISABLE, NOVALIDATE, RELY);</codeblock> |
| <codeblock>CREATE TABLE fk(id INT, col1 INT, col2 STRING, PRIMARY KEY(id), |
| FOREIGN KEY(col1, col2) REFERENCES pk(col1, col2));</codeblock> |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> |
| |
| <p rev="2.3.0"> |
| The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or |
| <codeph>MAP</codeph>) are available in <keyword keyref="impala23_full"/> and higher. |
| Because you can nest these types (for example, to make an array of maps or a struct with |
| an array field), these types are also sometimes referred to as nested types. See |
| <xref href="impala_complex_types.xml#complex_types"/> for usage details. |
| </p> |
| |
| <!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down |
| after some of the more common clauses. --> |
| |
| <p rev="2.3.0"> |
| Impala can create tables containing complex type columns, with any supported file format. |
| Because currently Impala can only query complex type columns in Parquet tables, creating |
| tables with complex type columns and other file formats such as text is of limited use. |
| For example, you might create a text table including some columns with complex types with |
| Impala, and use Hive as part of your to ingest the nested type data and copy it to an |
| identical Parquet table. Or you might create a partitioned table containing complex type |
| columns using one file format, and use <codeph>ALTER TABLE</codeph> to change the file |
| format of individual partitions to Parquet; Impala can then query only the Parquet-format |
| partitions in that table. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/> |
| |
| <p> |
| <b>Internal and external tables (EXTERNAL and LOCATION clauses):</b> |
| </p> |
| |
| <p> By default, Impala creates an internal table, where Impala manages the underlying data files |
| for the table, and physically deletes the data files when you drop the table. If you specify |
| the <codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table, |
| where the data files are typically produced outside Impala and queried from their original |
| locations in HDFS, and Impala leaves the data files in place when you drop the table. For |
| details about internal and external tables, see <xref href="impala_tables.xml#tables"/>. </p> |
| |
| <p> |
| Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify |
| the path to the HDFS directory where Impala reads and writes files for the table. For |
| example, if your data pipeline produces Parquet files in the HDFS directory |
| <filepath>/user/etl/destination</filepath>, you might create an external table as follows: |
| </p> |
| |
| <codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP) |
| STORED AS PARQUET LOCATION '/user/etl/destination'; |
| </codeblock> |
| |
| <p> |
| Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often |
| specified together, <codeph>LOCATION</codeph> is optional for external tables, and you can |
| also specify <codeph>LOCATION</codeph> for internal tables. The difference is all about |
| whether Impala <q>takes control</q> of the underlying data files and moves them when you |
| rename the table, or deletes them when you drop the table. For more about internal and |
| external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see |
| <xref |
| href="impala_tables.xml#tables"/>. |
| </p> |
| |
| <p> |
| <b>Partitioned tables (PARTITIONED BY clause):</b> |
| </p> |
| |
| <p> |
| The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from |
| one or more specified columns. Impala queries can use the partition metadata to minimize |
| the amount of data that is read from disk or transmitted across the network, particularly |
| during join queries. For details about partitioning, see |
| <xref |
| href="impala_partitioning.xml#partitioning"/>. |
| </p> |
| |
| <note rev="IMPALA-3719"> |
| <p> |
| All Kudu tables require partitioning, which involves different syntax than non-Kudu |
| tables. See the <codeph>PARTITION BY</codeph> clause, rather than <codeph>PARTITIONED |
| BY</codeph>, for Kudu tables. |
| </p> |
| |
| <p rev="IMPALA-5546"> |
| In <keyword keyref="impala210_full"/> and higher, the <codeph>PARTITION BY</codeph> |
| clause is optional for Kudu tables. If the clause is omitted, Impala automatically |
| constructs a single partition that is not connected to any column. Because such a table |
| cannot take advantage of Kudu features for parallelized queries and query optimizations, |
| omitting the <codeph>PARTITION BY</codeph> clause is only appropriate for small lookup |
| tables. |
| </p> |
| </note> |
| |
| <p rev="2.5.0"> |
| Prior to <keyword keyref="impala25_full"/>, you could use a partitioned table as the |
| source and copy data from it, but could not specify any partitioning clauses for the new |
| table. In <keyword |
| keyref="impala25_full"/> and higher, you can now use the |
| <codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph> |
| statement. See the examples under the following discussion of the <codeph>CREATE TABLE AS |
| SELECT</codeph> syntax variation. |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| <b>Sorted tables (SORT BY clause):</b> |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| The optional <codeph>SORT BY</codeph> clause lets you specify zero or more columns that |
| are sorted in the data files created by each Impala <codeph>INSERT</codeph> or |
| <codeph>CREATE TABLE AS SELECT</codeph> operation. Creating data files that are sorted is |
| most useful for Parquet tables, where the metadata stored inside each file includes the |
| minimum and maximum values for each column in the file. (The statistics apply to each row |
| group within the file; for simplicity, Impala writes a single row group in each file.) |
| Grouping data values together in relatively narrow ranges within each data file makes it |
| possible for Impala to quickly skip over data files that do not contain value ranges |
| indicated in the <codeph>WHERE</codeph> clause of a query, and can improve the |
| effectiveness of Parquet encoding and compression. |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| This clause is not applicable for Kudu tables or HBase tables. Although it works for other |
| HDFS file formats besides Parquet, the more efficient layout is most evident with Parquet |
| tables, because each Parquet data file includes statistics about the data values in that |
| file. |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| The <codeph>SORT BY</codeph> columns cannot include any partition key columns for a |
| partitioned table, because those column values are not represented in the underlying data |
| files. |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| Because data files can arrive in Impala tables by mechanisms that do not respect the |
| <codeph>SORT BY</codeph> clause, such as <codeph>LOAD DATA</codeph> or ETL tools that |
| create HDFS files, Impala does not guarantee or rely on the data being sorted. The sorting |
| aspect is only used to create a more efficient layout for Parquet files generated by |
| Impala, which helps to optimize the processing of those Parquet files during Impala |
| queries. During an <codeph>INSERT</codeph> or <codeph>CREATE TABLE AS SELECT</codeph> |
| operation, the sorting occurs when the <codeph>SORT BY</codeph> clause applies to the |
| destination table for the data, regardless of whether the source table has a <codeph>SORT |
| BY</codeph> clause. |
| </p> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| For example, when creating a table intended to contain census data, you might define sort |
| columns such as last name and state. If a data file in this table contains a narrow range |
| of last names, for example from <codeph>Smith</codeph> to <codeph>Smythe</codeph>, Impala |
| can quickly detect that this data file contains no matches for a <codeph>WHERE</codeph> |
| clause such as <codeph>WHERE last_name = 'Jones'</codeph> and avoid reading the entire |
| file. |
| </p> |
| |
| <codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE census_data (last_name STRING, first_name STRING, state STRING, address STRING) |
| SORT BY (last_name, state) |
| STORED AS PARQUET; |
| </codeblock> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| Likewise, if an existing table contains data without any sort order, you can reorganize |
| the data in a more efficient way by using <codeph>INSERT</codeph> or <codeph>CREATE TABLE |
| AS SELECT</codeph> to copy that data into a new table with a <codeph>SORT BY</codeph> |
| clause: |
| </p> |
| |
| <codeblock rev="2.9.0 IMPALA-4166">CREATE TABLE sorted_census_data |
| SORT BY (last_name, state) |
| STORED AS PARQUET |
| AS SELECT last_name, first_name, state, address |
| FROM unsorted_census_data; |
| </codeblock> |
| |
| <p rev="2.9.0 IMPALA-4166"> |
| The metadata for the <codeph>SORT BY</codeph> clause is stored in the |
| <codeph>TBLPROPERTIES</codeph> fields for the table. Other SQL engines that can |
| interoperate with Impala tables, such as Hive and Spark SQL, do not recognize this |
| property when inserting into a table that has a <codeph>SORT BY</codeph> clause. |
| </p> |
| |
| <p> |
| <b>Transactional tables:</b> |
| </p> |
| |
| <p> |
| In the version 3.3 and higher, when integrated with Hive 3, Impala can create, read, and |
| insert into transactional tables. |
| </p> |
| |
| <p> |
| To create a table that supports transactions, use the <codeph>TBLPROPERTIES</codeph> |
| clause and set the <codeph>'transactional'</codeph> and |
| <codeph>'transactional_properties'</codeph> as below. Currently, Impala only supports |
| insert-only transactional tables. |
| <codeblock>TBLPROPERTIES('transactional'='true', 'transactional_properties'='insert_only')</codeblock> |
| </p> |
| |
| <p> |
| When integrated with Hive3 and the <codeph>DEFAULT_TRANSACTIONAL_TYPE</codeph> query |
| option is set to <codeph>INSERT_ONLY</codeph>, tables are created as insert-only |
| transactional table by default. |
| </p> |
| |
| <p> |
| Transactional tables are not supported for Kudu and HBase. |
| </p> |
| |
| <p rev="kudu" conref="../shared/impala_common.xml#common/kudu_blurb"/> |
| |
| <p rev="kudu"> |
| Because Kudu tables do not support clauses related to HDFS and S3 data files and |
| partitioning mechanisms, the syntax associated with the <codeph>STORED AS KUDU</codeph> |
| clause is shown separately in the above syntax descriptions. Kudu tables have their own |
| syntax for <codeph>CREATE TABLE</codeph>, <codeph>CREATE EXTERNAL TABLE</codeph>, and |
| <codeph>CREATE TABLE AS SELECT</codeph>. <ph rev="IMPALA-2256">Prior to |
| <keyword keyref="impala210_full"/>, all internal Kudu tables require a <codeph>PARTITION |
| BY</codeph> clause, different than the <codeph>PARTITIONED BY</codeph> clause for |
| HDFS-backed tables.</ph> |
| </p> |
| |
| <p> |
| Here are some examples of creating empty Kudu tables: |
| </p> |
| |
| <codeblock> |
| <ph rev="IMPALA-2256">-- Single partition. Only for <keyword keyref="impala210_full"/> and higher. |
| -- Only suitable for small lookup tables. |
| CREATE TABLE kudu_no_partition_by_clause |
| ( |
| id bigint PRIMARY KEY, s STRING, b BOOLEAN |
| ) |
| STORED AS KUDU;</ph> |
| |
| -- Single-column primary key. |
| CREATE TABLE kudu_t1 (id BIGINT PRIMARY key, s STRING, b BOOLEAN) |
| PARTITION BY HASH (id) PARTITIONS 20 STORED AS KUDU; |
| |
| -- Multi-column primary key. |
| CREATE TABLE kudu_t2 (id BIGINT, s STRING, b BOOLEAN, PRIMARY KEY (id,s)) |
| PARTITION BY HASH (s) PARTITIONS 30 STORED AS KUDU; |
| |
| <![CDATA[-- Meaningful primary key column is good for range partitioning. |
| CREATE TABLE kudu_t3 (id BIGINT, year INT, s STRING, |
| b BOOLEAN, PRIMARY KEY (id,year)) |
| PARTITION BY HASH (id) PARTITIONS 20, |
| RANGE (year) (PARTITION 1980 <= VALUES < 1990, |
| PARTITION 1990 <= VALUES < 2000, |
| PARTITION VALUE = 2001, |
| PARTITION 2001 < VALUES) |
| STORED AS KUDU; |
| ]]> |
| </codeblock> |
| |
| <p> |
| Here is an example of creating an external Kudu table based on an pre-existing table |
| identified by the table property: </p> |
| |
| <codeblock><![CDATA[ |
| -- Inherits column definitions from original table. |
| -- For tables created through Impala, the kudu.table_name property |
| -- comes from DESCRIBE FORMATTED output from the original table. |
| CREATE EXTERNAL TABLE external_t1 STORED AS KUDU |
| TBLPROPERTIES ('kudu.table_name'='kudu_tbl_created_via_api'); |
| ]]> |
| </codeblock> |
| |
| <p> |
| In Impala 3.4 and higher, by default HMS implicitly translates internal Kudu tables to |
| external Kudu tables with the 'external.table.purge' property set to true. You can explicitly |
| create such external Kudu tables similar to the way you create internal Kudu tables. You must |
| set the table property <codeph>'external.table.purge'</codeph> to true. Here is an example of |
| creating an external Kudu table: |
| </p> |
| |
| <codeblock>CREATE EXTERNAL TABLE myextkudutbl ( |
| id int PRIMARY KEY, |
| name string) |
| PARTITION BY HASH PARTITIONS 8 |
| STORED AS KUDU |
| TBLPROPERTIES ('external.table.purge'='true'); |
| </codeblock> |
| <p> |
| Operations on the resulting external table in Impala, HMS, and Kudu table metadata is |
| synchronized. HMS-Kudu integration does not need to be enabled for external table |
| synchronization. Such synchronized tables behave similar to internal tables. For example, |
| dropping a table removes the underlying Kudu table data as well as the table metadata in HMS. |
| </p> |
| <p> |
| If you want to drop only the HMS metadata and not drop the Kudu table, you |
| can set <codeph>external.table.purge</codeph> to false, as shown in the following example: |
| </p> |
| |
| <codeblock> |
| ALTER TABLE myextkudutbl set tblproperties('external.table.purge'='false'); |
| </codeblock> |
| |
| <p> |
| Here is an example of <codeph>CREATE TABLE AS SELECT</codeph> syntax for a Kudu table: |
| </p> |
| |
| <codeblock><![CDATA[ |
| -- The CTAS statement defines the primary key and partitioning scheme. |
| -- The rest of the column definitions are derived from the select list. |
| CREATE TABLE ctas_t1 |
| PRIMARY KEY (id) PARTITION BY HASH (id) PARTITIONS 10 |
| STORED AS KUDU |
| AS SELECT id, s FROM kudu_t1; |
| ]]> |
| </codeblock> |
| |
| <p rev="kudu"> |
| The following <codeph>CREATE TABLE</codeph> clauses are not supported for Kudu tables: |
| </p> |
| |
| <ul rev="kudu"> |
| <li> |
| <codeph>PARTITIONED BY</codeph> (Kudu tables use the clause <codeph>PARTITION |
| BY</codeph> instead) |
| </li> |
| |
| <li> |
| <codeph>LOCATION</codeph> |
| </li> |
| |
| <li> |
| <codeph>ROWFORMAT</codeph> |
| </li> |
| |
| <li> |
| <codeph>CACHED IN | UNCACHED</codeph> |
| </li> |
| |
| <li> |
| <codeph>WITH SERDEPROPERTIES</codeph> |
| </li> |
| </ul> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>PRIMARY KEY</codeph> clause, see |
| <xref href="impala_kudu.xml#kudu_primary_key"/> and |
| <xref |
| href="impala_kudu.xml#kudu_primary_key_attribute"/>. |
| </p> |
| |
| <p> |
| For more on creating a Kudu table with a specific replication factor, see |
| <xref href="impala_kudu.xml#kudu_replication_factor"/>. |
| </p> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>NULL</codeph> and <codeph>NOT NULL</codeph> attributes, see |
| <xref |
| href="impala_kudu.xml#kudu_not_null_attribute"/>. |
| </p> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>ENCODING</codeph> attribute, see |
| <xref href="impala_kudu.xml#kudu_encoding_attribute"/>. |
| </p> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>COMPRESSION</codeph> attribute, see |
| <xref href="impala_kudu.xml#kudu_compression_attribute"/>. |
| </p> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>DEFAULT</codeph> attribute, see |
| <xref href="impala_kudu.xml#kudu_default_attribute"/>. |
| </p> |
| |
| <p rev="IMPALA-3719"> |
| For more on the <codeph>BLOCK_SIZE</codeph> attribute, see |
| <xref href="impala_kudu.xml#kudu_block_size_attribute"/>. |
| </p> |
| |
| <p rev="kudu IMPALA-3719"> |
| <b>Partitioning for Kudu tables (PARTITION BY clause)</b> |
| </p> |
| |
| <p rev="kudu IMPALA-3719"> |
| For Kudu tables, you specify logical partitioning across one or more columns using the |
| <codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables, |
| multiple values for a partition key column can be located in the same partition. The |
| optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key |
| columns into a specified number of buckets. You can use more than one |
| <codeph>HASH</codeph> clause, specifying a distinct set of partition key columns for each. |
| The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on a |
| set of comparison operations for the partition key columns. |
| </p> |
| |
| <p rev="kudu IMPALA-3719"> |
| Here are some examples of the <codeph>PARTITION BY HASH</codeph> syntax: |
| </p> |
| |
| <codeblock rev="kudu IMPALA-3719"><![CDATA[ |
| -- Apply hash function to 1 primary key column. |
| create table hash_t1 (x bigint, y bigint, s string, primary key (x,y)) |
| partition by hash (x) partitions 10 |
| stored as kudu; |
| |
| -- Apply hash function to a different primary key column. |
| create table hash_t2 (x bigint, y bigint, s string, primary key (x,y)) |
| partition by hash (y) partitions 10 |
| stored as kudu; |
| |
| -- Apply hash function to both primary key columns. |
| -- In this case, the total number of partitions is 10. |
| create table hash_t3 (x bigint, y bigint, s string, primary key (x,y)) |
| partition by hash (x,y) partitions 10 |
| stored as kudu; |
| |
| -- When the column list is omitted, apply hash function to all primary key columns. |
| create table hash_t4 (x bigint, y bigint, s string, primary key (x,y)) |
| partition by hash partitions 10 |
| stored as kudu; |
| |
| -- Hash the X values independently from the Y values. |
| -- In this case, the total number of partitions is 10 x 20. |
| create table hash_t5 (x bigint, y bigint, s string, primary key (x,y)) |
| partition by hash (x) partitions 10, hash (y) partitions 20 |
| stored as kudu; |
| ]]> |
| </codeblock> |
| |
| <p rev="kudu IMPALA-3719"> |
| Here are some examples of the <codeph>PARTITION BY RANGE</codeph> syntax: |
| </p> |
| |
| <codeblock rev="kudu IMPALA-3719"><![CDATA[ |
| -- Create partitions that cover every possible value of X. |
| -- Ranges that span multiple values use the keyword VALUES between |
| -- a pair of < and <= comparisons. |
| create table range_t1 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by range (x) |
| ( |
| partition 0 <= values <= 49, partition 50 <= values <= 100, |
| partition values < 0, partition 100 < values |
| ) |
| stored as kudu; |
| |
| -- Create partitions that cover some possible values of X. |
| -- Values outside the covered range(s) are rejected. |
| -- New range partitions can be added through ALTER TABLE. |
| create table range_t2 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by range (x) |
| ( |
| partition 0 <= values <= 49, partition 50 <= values <= 100 |
| ) |
| stored as kudu; |
| |
| -- A range can also specify a single specific value, using the keyword VALUE |
| -- with an = comparison. |
| create table range_t3 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by range (s) |
| ( |
| partition value = 'Yes', partition value = 'No', partition value = 'Maybe' |
| ) |
| stored as kudu; |
| |
| -- Using multiple columns in the RANGE clause and tuples inside the partition spec |
| -- only works for partitions specified with the VALUE= syntax. |
| create table range_t4 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by range (x,s) |
| ( |
| partition value = (0,'zero'), partition value = (1,'one'), partition value = (2,'two') |
| ) |
| stored as kudu; |
| ]]> |
| </codeblock> |
| |
| <p rev="kudu IMPALA-3719"> |
| Here are some examples combining both <codeph>HASH</codeph> and <codeph>RANGE</codeph> |
| syntax for the <codeph>PARTITION BY</codeph> clause: |
| </p> |
| |
| <codeblock rev="kudu IMPALA-3719"><![CDATA[ |
| -- Values from each range partition are hashed into 10 associated buckets. |
| -- Total number of partitions in this case is 10 x 2. |
| create table combined_t1 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by hash (x) partitions 10, range (x) |
| ( |
| partition 0 <= values <= 49, partition 50 <= values <= 100 |
| ) |
| stored as kudu; |
| |
| -- The hash partitioning and range partitioning can apply to different columns. |
| -- But all the columns used in either partitioning scheme must be from the primary key. |
| create table combined_t2 (x bigint, s string, s2 string, primary key (x, s)) |
| partition by hash (s) partitions 10, range (x) |
| ( |
| partition 0 <= values <= 49, partition 50 <= values <= 100 |
| ) |
| stored as kudu; |
| ]]> |
| </codeblock> |
| |
| <p rev="kudu IMPALA-3719"> |
| For more usage details and examples of the Kudu partitioning syntax, see |
| <xref keyref="impala_kudu"/>. |
| </p> |
| |
| <p> |
| <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b> |
| </p> |
| |
| <p rev="DOCS-1523"> |
| The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. |
| Currently, Impala can query more types of file formats than it can create or insert into. |
| Use Hive to perform any create or data load operations that are not currently available in |
| Impala. For example, Impala can create an Avro, SequenceFile, or RCFile table but cannot |
| insert data into it. There are also Impala-specific procedures for using compression with |
| each kind of file format. For details about working with data files of various formats, |
| see <xref href="impala_file_formats.xml#file_formats"/>. |
| </p> |
| |
| <note> |
| In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing |
| the <codeph>CREATE TABLE</codeph> statement in Hive. See |
| <xref href="impala_avro.xml#avro"/> for details and examples. |
| </note> |
| |
| <p> |
| By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala |
| tables are created as text files with Ctrl-A (hex 01) characters as the delimiter. |
| <!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? --> |
| Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files |
| that use a different delimiter character such as tab or <codeph>|</codeph>, or a different |
| line end character such as carriage return or newline. When specifying delimiter and line |
| end characters with the <codeph>FIELDS TERMINATED BY</codeph> and <codeph>LINES TERMINATED |
| BY</codeph> clauses, use <codeph>'\t'</codeph> for tab, <codeph>'\n'</codeph> for newline |
| or linefeed, <codeph>'\r'</codeph> for carriage return, and |
| <codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more |
| examples of text tables, see <xref href="impala_txtfile.xml#txtfile"/>. |
| </p> |
| |
| <p> |
| The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through |
| an <codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to |
| existing data files that you put into an Impala table directory. (You can ingest existing |
| data files either by creating the table with <codeph>CREATE EXTERNAL TABLE ... |
| LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or through an HDFS operation |
| such as <codeph>hdfs dfs -put <varname>file</varname> |
| <varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used |
| anywhere else in the file, and put it in front of each instance of the delimiter character |
| that occurs within a field value. Surrounding field values with quotation marks does not |
| help Impala to parse fields with embedded delimiter characters; the quotation marks are |
| considered to be part of the column value. If you want to use <codeph>\</codeph> as the |
| escape character, specify the clause in <cmdname>impala-shell</cmdname> as <codeph>ESCAPED |
| BY '\\'</codeph>. |
| </p> |
| |
| <note conref="../shared/impala_common.xml#common/thorn"/> |
| |
| <p> |
| <b>Cloning tables (LIKE clause):</b> |
| </p> |
| |
| <p> |
| To create an empty table with the same columns, comments, and other attributes as another |
| table, use the following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows |
| a restricted set of clauses, currently only the <codeph>LOCATION</codeph>, |
| <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph> clauses. |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| <ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [STORED AS <varname>file_format</varname>] |
| [LOCATION '<varname>hdfs_path</varname>']</codeblock> |
| |
| <note rev="1.2.0"> |
| <p rev="1.2.0"> |
| To clone the structure of a table and transfer data into it in a single operation, use |
| the <codeph>CREATE TABLE AS SELECT</codeph> syntax described in the next subsection. |
| </p> |
| </note> |
| |
| <p> |
| When you clone the structure of an existing table using the <codeph>CREATE TABLE ... |
| LIKE</codeph> syntax, the new table keeps the same file format as the original one, so you |
| only need to specify the <codeph>STORED AS</codeph> clause if you want to use a different |
| file format, or when specifying a view as the original table. (Creating a table |
| <q>like</q> a view produces a text table by default.) |
| </p> |
| |
| <p> |
| Although normally Impala cannot create an HBase table directly, Impala can clone the |
| structure of an existing HBase table with the <codeph>CREATE TABLE ... LIKE</codeph> |
| syntax, preserving the file format and metadata from the original table. |
| </p> |
| |
| <p> |
| There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph> |
| with an Avro table. For example, you cannot use this technique for an Avro table that is |
| specified with an Avro schema but no columns. When in doubt, check if a <codeph>CREATE |
| TABLE ... LIKE</codeph> operation works in Hive; if not, it typically will not work in |
| Impala either. |
| </p> |
| |
| <p> |
| If the original table is partitioned, the new table inherits the same partition key |
| columns. Because the new table is initially empty, it does not inherit the actual |
| partitions that exist in the original one. To create partitions in the new table, insert |
| data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/create_table_like_view"/> |
| |
| <p> |
| Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the |
| physical data of the table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward |
| to copy any data from the original table into the new one, optionally converting the data |
| to a new file format. (For some file formats, Impala can do a <codeph>CREATE TABLE ... |
| LIKE</codeph> to create the table, but Impala cannot insert data in that file format; in |
| these cases, you must load the data in Hive. See |
| <xref |
| href="impala_file_formats.xml#file_formats"/> for details.) |
| </p> |
| |
| <p rev="1.2" id="ctas"> |
| <b>CREATE TABLE AS SELECT:</b> |
| </p> |
| |
| <p> |
| The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a |
| table based on column definitions from another table, and copy data from the source table |
| to the destination table without issuing any separate <codeph>INSERT</codeph> statement. |
| This idiom is so popular that it has its own acronym, <q>CTAS</q>. |
| </p> |
| |
| <p> |
| The following examples show how to copy data from a source table <codeph>T1</codeph> to a |
| variety of destinations tables, applying various transformations to the table properties, |
| table layout, or the data itself as part of the operation: |
| </p> |
| |
| <codeblock> |
| -- Sample table to be the source of CTAS operations. |
| CREATE TABLE t1 (x INT, y STRING); |
| INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three'); |
| |
| -- Clone all the columns and data from one table to another. |
| CREATE TABLE clone_of_t1 AS SELECT * FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Clone the columns and data, and convert the data to a different file format. |
| CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Copy only some rows to the new table. |
| CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 2 row(s) | |
| +-------------------+ |
| |
| -- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data. |
| CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 0 row(s) | |
| +-------------------+ |
| |
| -- Reorder and rename columns and transform the data. |
| CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| SELECT * FROM t5; |
| +-------+---+---------------------+ |
| | s | a | n | |
| +-------+---+---------------------+ |
| | ONE | 2 | Entirely new column | |
| | TWO | 3 | Entirely new column | |
| | THREE | 4 | Entirely new column | |
| +-------+---+---------------------+ |
| </codeblock> |
| |
| <!-- These are a little heavyweight to get into here. Therefore commenting out. |
| Some overlap with the new column-changing examples in the code listing above. |
| Create tables with different column order, names, or types than the original. |
| CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1; |
| CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1; |
| CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock> |
| --> |
| |
| <!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. --> |
| |
| <p rev="1.2"> |
| See <xref href="impala_select.xml#select"/> for details about query syntax for the |
| <codeph>SELECT</codeph> portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement. |
| </p> |
| |
| <p rev="1.2"> |
| The newly created table inherits the column names that you select from the original table, |
| which you can override by specifying column aliases in the query. Any column or table |
| comments from the original table are not carried over to the new table. |
| </p> |
| |
| <note rev="DOCS-1523"> |
| When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS |
| SELECT</codeph> statement, the destination table must be a file format that Impala can |
| write to: currently, text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile |
| table as the destination table for a CTAS operation. |
| </note> |
| |
| <p rev="2.5.0"> |
| Prior to <keyword keyref="impala25_full"/> you could use a partitioned table as the source |
| and copy data from it, but could not specify any partitioning clauses for the new table. |
| In <keyword |
| keyref="impala25_full"/> and higher, you can now use the |
| <codeph>PARTITIONED BY</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph> |
| statement. The following example demonstrates how you can copy data from an unpartitioned |
| table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new partitioned |
| table in the process. The main syntax consideration is the column order in the |
| <codeph>PARTITIONED BY</codeph> clause and the select list: the partition key columns must |
| be listed last in the select list, in the same order as in the <codeph>PARTITIONED |
| BY</codeph> clause. Therefore, in this case, the column order in the destination table is |
| different from the source table. You also only specify the column names in the |
| <codeph>PARTITIONED BY</codeph> clause, not the data types or column comments. |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| create table partitions_no (year smallint, month tinyint, s string); |
| insert into partitions_no values (2016, 1, 'January 2016'), |
| (2016, 2, 'February 2016'), (2016, 3, 'March 2016'); |
| |
| -- Prove that the source table is not partitioned. |
| show partitions partitions_no; |
| ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no |
| |
| -- Create new table with partitions based on column values from source table. |
| <b>create table partitions_yes partitioned by (year, month) |
| as select s, year, month from partitions_no;</b> |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Prove that the destination table is partitioned. |
| show partitions partitions_yes; |
| +-------+-------+-------+--------+------+... |
| | year | month | #Rows | #Files | Size |... |
| +-------+-------+-------+--------+------+... |
| | 2016 | 1 | -1 | 1 | 13B |... |
| | 2016 | 2 | -1 | 1 | 14B |... |
| | 2016 | 3 | -1 | 1 | 11B |... |
| | Total | | -1 | 3 | 38B |... |
| +-------+-------+-------+--------+------+... |
| </codeblock> |
| |
| <p rev="2.5.0"> |
| The most convenient layout for partitioned tables is with all the partition key columns at |
| the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax requires that column order in the |
| select list, resulting in that same column order in the destination table. |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| describe partitions_no; |
| +-------+----------+---------+ |
| | name | type | comment | |
| +-------+----------+---------+ |
| | year | smallint | | |
| | month | tinyint | | |
| | s | string | | |
| +-------+----------+---------+ |
| |
| -- The CTAS operation forced us to put the partition key columns last. |
| -- Having those columns last works better with idioms such as SELECT * |
| -- for partitioned tables. |
| describe partitions_yes; |
| +-------+----------+---------+ |
| | name | type | comment | |
| +-------+----------+---------+ |
| | s | string | | |
| | year | smallint | | |
| | month | tinyint | | |
| +-------+----------+---------+ |
| </codeblock> |
| |
| <p rev="2.5.0"> |
| Attempting to use a select list with the partition key columns not at the end results in |
| an error due to a column name mismatch: |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| -- We expect this CTAS to fail because non-key column S |
| -- comes after key columns YEAR and MONTH in the select list. |
| create table partitions_maybe partitioned by (year, month) |
| as select year, month, s from partitions_no; |
| ERROR: AnalysisException: Partition column name mismatch: year != month |
| </codeblock> |
| |
| <p rev="1.2"> |
| For example, the following statements show how you can clone all the data in a table, or a |
| subset of the columns and/or rows, or reorder columns, rename them, or construct them out |
| of expressions: |
| </p> |
| |
| <p rev="1.2"> |
| As part of a CTAS operation, you can convert the data to any file format that Impala can |
| write (currently, <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot |
| specify the lower-level properties of a text table, such as the delimiter. |
| </p> |
| |
| <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/> |
| |
| <p rev="1.4.0"> |
| <b>CREATE TABLE LIKE PARQUET:</b> |
| </p> |
| |
| <p rev="1.4.0"> |
| The variation <codeph>CREATE TABLE ... LIKE PARQUET |
| '<varname>hdfs_path_of_parquet_file</varname>'</codeph> lets you skip the column |
| definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and data |
| types are automatically configured based on the organization of the specified Parquet data |
| file, which must already reside in HDFS. You can use a data file located outside the |
| Impala database directories, or a file from an existing Impala Parquet table; either way, |
| Impala only uses the column definitions from the file and does not use the HDFS location |
| for the <codeph>LOCATION</codeph> attribute of the new table. (Although you can also |
| specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use |
| the same schema as the data file and point the Impala table at the associated directory |
| for querying.) |
| </p> |
| |
| <p rev="1.4.0"> |
| The following considerations apply when you use the <codeph>CREATE TABLE LIKE |
| PARQUET</codeph> technique: |
| </p> |
| |
| <ul rev="1.4.0"> |
| <li> |
| Any column comments from the original table are not preserved in the new table. Each |
| column in the new table has a comment stating the low-level Parquet field type used to |
| deduce the appropriate SQL column type. |
| </li> |
| |
| <li> |
| If you use a data file from a partitioned Impala table, any partition key columns from |
| the original table are left out of the new table, because they are represented in HDFS |
| directory names rather than stored in the data file. To preserve the partition |
| information, repeat the same <codeph>PARTITION</codeph> clause as in the original |
| <codeph>CREATE TABLE</codeph> statement. |
| </li> |
| |
| <li> |
| The file format of the new table defaults to text, as with other kinds of <codeph>CREATE |
| TABLE</codeph> statements. To make the new table also use Parquet format, include the |
| clause <codeph>STORED AS PARQUET</codeph> in the <codeph>CREATE TABLE LIKE |
| PARQUET</codeph> statement. |
| </li> |
| |
| <li> |
| If the Parquet data file comes from an existing Impala table, currently, any |
| <codeph>TINYINT</codeph> or <codeph>SMALLINT</codeph> columns are turned into |
| <codeph>INT</codeph> columns in the new table. Internally, Parquet stores such values as |
| 32-bit integers. |
| </li> |
| |
| <li> |
| When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS |
| SELECT</codeph> and <codeph>INSERT ... SELECT</codeph> statements always create at least |
| one data file, even if the <codeph>SELECT</codeph> part of the statement does not match |
| any rows. You can use such an empty Parquet data file as a template for subsequent |
| <codeph>CREATE TABLE LIKE PARQUET</codeph> statements. |
| </li> |
| </ul> |
| |
| <p> |
| For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE |
| LIKE PARQUET</codeph> syntax, see <xref |
| href="impala_parquet.xml#parquet"/>. |
| </p> |
| |
| <p> |
| <b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b> |
| </p> |
| |
| <p rev="1.2"> |
| You can associate arbitrary items of metadata with a table by specifying the |
| <codeph>TBLPROPERTIES</codeph> clause. This clause takes a comma-separated list of |
| key-value pairs and stores those items in the metastore database. You can also change the |
| table properties later with an <codeph>ALTER TABLE</codeph> statement. You can observe the |
| table properties for different delimiter and escape characters using the <codeph>DESCRIBE |
| FORMATTED</codeph> command, and change those settings for an existing table with |
| <codeph>ALTER TABLE ... SET TBLPROPERTIES</codeph>. |
| </p> |
| |
| <p rev="1.2"> |
| You can also associate SerDes properties with the table by specifying key-value pairs |
| through the <codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by |
| Impala, which has its own built-in serializer and deserializer for the file formats it |
| supports. Particular property values might be needed for Hive compatibility with certain |
| variations of file formats, particularly Avro. |
| </p> |
| |
| <p> |
| Some DDL operations that interact with other Hadoop components require specifying |
| particular values in the <codeph>SERDEPROPERTIES</codeph> or |
| <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or an HBase table. |
| (You typically create HBase tables in Hive, because they require additional clauses not |
| currently available in Impala.) |
| <!-- Haven't got a working example from Lenni, so suppressing this recommendation for now. |
| The Avro schema properties can be specified through either |
| <codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>; |
| for best compatibility with future versions of Hive, |
| use <codeph>SERDEPROPERTIES</codeph> in this case. |
| --> |
| </p> |
| |
| <p> |
| To see the column definitions and column comments for an existing table, for example |
| before issuing a <codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS |
| SELECT</codeph> statement, issue the statement <codeph>DESCRIBE |
| <varname>table_name</varname></codeph>. To see even more detail, such as the location of |
| data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and |
| <codeph>STORED AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED |
| <varname>table_name</varname></codeph>. <codeph>DESCRIBE FORMATTED</codeph> is also needed |
| to see any overall table comment (as opposed to individual column comments). |
| </p> |
| |
| <p> |
| After creating a table, your <cmdname>impala-shell</cmdname> session or another |
| <cmdname>impala-shell</cmdname> connected to the same node can immediately query that |
| table. There might be a brief interval (one statestore heartbeat) before the table can be |
| queried through a different Impala node. To make the <codeph>CREATE TABLE</codeph> |
| statement return only when the table is recognized by all Impala nodes in the cluster, |
| enable the <codeph>SYNC_DDL</codeph> query option. |
| </p> |
| |
| <p rev="1.4.0"> |
| <b>HDFS caching (CACHED IN clause):</b> |
| </p> |
| |
| <p rev="1.4.0"> |
| If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in |
| the table directory or the partition subdirectories are designated to be loaded into |
| memory with the HDFS caching mechanism. See |
| <xref |
| href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using |
| the HDFS caching feature. |
| </p> |
| |
| <p |
| conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/> |
| |
| <!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. --> |
| |
| <p> |
| <b>Column order</b>: |
| </p> |
| |
| <p> |
| If you intend to use the table to hold data files produced by some external source, |
| specify the columns in the same order as they appear in the data files. |
| </p> |
| |
| <p> |
| If you intend to insert or copy data into the table through Impala, or if you have control |
| over the way externally produced data files are arranged, use your judgment to specify |
| columns in the most convenient order: |
| </p> |
| |
| <ul> |
| <li> |
| <p> |
| If certain columns are often <codeph>NULL</codeph>, specify those columns last. You |
| might produce data files that omit these trailing columns entirely. Impala |
| automatically fills in the <codeph>NULL</codeph> values if so. |
| </p> |
| </li> |
| |
| <li> |
| <p> |
| If an unpartitioned table will be used as the source for an <codeph>INSERT ... |
| SELECT</codeph> operation into a partitioned table, specify last in the unpartitioned |
| table any columns that correspond to partition key columns in the partitioned table, |
| and in the same order as the partition key columns are declared in the partitioned |
| table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when copying |
| data to the partitioned table, rather than specifying each column name individually. |
| </p> |
| </li> |
| |
| <li> |
| <p> |
| If you specify columns in an order that you later discover is suboptimal, you can |
| sometimes work around the problem without recreating the table. You can create a view |
| that selects columns from the original table in a permuted order, then do a |
| <codeph>SELECT *</codeph> from the view. When inserting data into a table, you can |
| specify a permuted order for the inserted columns to match the order in the |
| destination table. |
| </p> |
| </li> |
| </ul> |
| |
| <p conref="../shared/impala_common.xml#common/hive_blurb"/> |
| |
| <p> |
| Impala queries can make use of metadata about the table and columns, such as the number of |
| rows in a table or the number of different values in a column. Prior to Impala 1.2.2, to |
| create this metadata, you issued the <codeph>ANALYZE TABLE</codeph> statement in Hive to |
| gather this information, after creating the table and loading representative data into it. |
| In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement produces these |
| statistics within Impala, without needing to use Hive at all. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/hbase_blurb"/> |
| |
| <note> |
| <p> |
| The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because |
| it currently does not support the <codeph>STORED BY</codeph> clause needed for HBase |
| tables. Create such tables in Hive, then query them through Impala. For information on |
| using Impala with HBase tables, see <xref href="impala_hbase.xml#impala_hbase"/>. |
| </p> |
| </note> |
| |
| <p conref="../shared/impala_common.xml#common/s3_blurb"/> |
| |
| <p rev="2.2.0"> |
| To create a table where the data resides in the Amazon Simple Storage Service (S3), |
| specify a <codeph>s3a://</codeph> prefix <codeph>LOCATION</codeph> attribute pointing to |
| the data files in S3. |
| </p> |
| |
| <p rev="2.6.0 IMPALA-1878"> |
| In <keyword keyref="impala26_full"/> and higher, you can use this special |
| <codeph>LOCATION</codeph> syntax as part of a <codeph>CREATE TABLE AS SELECT</codeph> |
| statement. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/s3_ddl"/> |
| |
| <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/> |
| |
| <p conref="../shared/impala_common.xml#common/hdfs_blurb"/> |
| |
| <p> |
| The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in |
| HDFS. The <codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an |
| existing HDFS directory, and does not create any new directory in HDFS. To locate the HDFS |
| data directory for a table, issue a <codeph>DESCRIBE FORMATTED |
| <varname>table</varname></codeph> statement. To examine the contents of that HDFS |
| directory, use an OS command such as <codeph>hdfs dfs -ls |
| hdfs://<varname>path</varname></codeph>, either from the OS command line or through the |
| <codeph>shell</codeph> or <codeph>!</codeph> commands in <cmdname>impala-shell</cmdname>. |
| </p> |
| |
| <p> |
| The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data |
| directory to hold any data copied by the <codeph>INSERT</codeph> portion of the statement. |
| (Even if no data is copied, Impala might create one or more empty data files.) |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/permissions_blurb"/> |
| |
| <p rev=""> |
| The user ID that the <cmdname>impalad</cmdname> daemon runs under, typically the |
| <codeph>impala</codeph> user, must have both execute and write permission for the database |
| directory where the table is being created. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/security_blurb"/> |
| |
| <p conref="../shared/impala_common.xml#common/redaction_yes"/> |
| |
| <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/> |
| |
| <p conref="../shared/impala_common.xml#common/related_info"/> |
| |
| <p> |
| <xref href="impala_tables.xml#tables"/>, |
| <xref |
| href="impala_alter_table.xml#alter_table"/>, |
| <xref |
| href="impala_drop_table.xml#drop_table"/>, |
| <xref |
| href="impala_partitioning.xml#partitioning"/>, |
| <xref |
| href="impala_tables.xml#internal_tables"/>, |
| <xref |
| href="impala_tables.xml#external_tables"/>, |
| <xref |
| href="impala_compute_stats.xml#compute_stats"/>, |
| <xref |
| href="impala_sync_ddl.xml#sync_ddl"/>, |
| <xref |
| href="impala_show.xml#show_tables"/>, |
| <xref |
| href="impala_show.xml#show_create_table"/>, |
| <xref |
| href="impala_describe.xml#describe"/> |
| </p> |
| |
| </conbody> |
| |
| </concept> |