| <?xml version="1.0" encoding="UTF-8"?> |
| <!-- |
| Licensed to the Apache Software Foundation (ASF) under one |
| or more contributor license agreements. See the NOTICE file |
| distributed with this work for additional information |
| regarding copyright ownership. The ASF licenses this file |
| to you under the Apache License, Version 2.0 (the |
| "License"); you may not use this file except in compliance |
| with the License. You may obtain a copy of the License at |
| |
| http://www.apache.org/licenses/LICENSE-2.0 |
| |
| Unless required by applicable law or agreed to in writing, |
| software distributed under the License is distributed on an |
| "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| KIND, either express or implied. See the License for the |
| specific language governing permissions and limitations |
| under the License. |
| --> |
| <!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd"> |
| <concept id="create_table" outputclass="impala sql_statement"> |
| |
| <title outputclass="impala_title sql_statement_title">CREATE TABLE Statement</title> |
| <titlealts audience="PDF"><navtitle>CREATE TABLE</navtitle></titlealts> |
| <prolog> |
| <metadata> |
| <data name="Category" value="Impala"/> |
| <data name="Category" value="SQL"/> |
| <data name="Category" value="DDL"/> |
| <data name="Category" value="Impala Data Types"/> |
| <data name="Category" value="Developers"/> |
| <data name="Category" value="Data Analysts"/> |
| <data name="Category" value="HDFS Caching"/> |
| <data name="Category" value="Tables"/> |
| <data name="Category" value="Schemas"/> |
| <data name="Category" value="S3"/> |
| <!-- <data name="Category" value="Kudu"/> --> |
| </metadata> |
| </prolog> |
| |
| <conbody> |
| |
| <p> |
| <indexterm audience="Cloudera">CREATE TABLE statement</indexterm> |
| Creates a new table and specifies its characteristics. While creating a table, you optionally specify aspects |
| such as: |
| </p> |
| |
| <ul> |
| <li> |
| Whether the table is internal or external. |
| </li> |
| |
| <li> |
| The columns and associated data types. |
| </li> |
| |
| <li> |
| The columns used for physically partitioning the data. |
| </li> |
| |
| <li> |
| The file format for data files. |
| </li> |
| |
| <li> |
| The HDFS directory where the data files are located. |
| </li> |
| </ul> |
| |
| <p conref="../shared/impala_common.xml#common/syntax_blurb"/> |
| |
| <p> |
| The general syntax for creating a table and specifying its columns is as follows: |
| </p> |
| |
| <p> |
| <b>Explicit column definitions:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...) |
| [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] |
| [COMMENT '<varname>table_comment</varname>'] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| [ |
| [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>] |
| ] |
| [LOCATION '<varname>hdfs_path</varname>'] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| <ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| </codeblock> |
| |
| <p> |
| <b>Column definitions inferred from data file:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>' |
| [COMMENT '<varname>table_comment</varname>'] |
| [PARTITIONED BY (<varname>col_name</varname> <varname>data_type</varname> [COMMENT '<varname>col_comment</varname>'], ...)] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| [ |
| [ROW FORMAT <varname>row_format</varname>] [STORED AS <varname>file_format</varname>] |
| ] |
| [LOCATION '<varname>hdfs_path</varname>'] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| <ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| data_type: |
| <varname>primitive_type</varname> |
| | array_type |
| | map_type |
| | struct_type |
| </codeblock> |
| |
| <p> |
| <b>CREATE TABLE AS SELECT:</b> |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| <ph rev="2.5.0">[PARTITIONED BY (<varname>col_name</varname>[, ...])]</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [WITH SERDEPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| [ |
| [ROW FORMAT <varname>row_format</varname>] <ph rev="CDH-41501">[STORED AS <varname>ctas_file_format</varname>]</ph> |
| ] |
| [LOCATION '<varname>hdfs_path</varname>'] |
| [TBLPROPERTIES ('<varname>key1</varname>'='<varname>value1</varname>', '<varname>key2</varname>'='<varname>value2</varname>', ...)] |
| <ph rev="1.4.0"> [CACHED IN '<varname>pool_name</varname>'</ph> <ph rev="2.2.0">[WITH REPLICATION = <varname>integer</varname>]</ph> | UNCACHED] |
| AS |
| <varname>select_statement</varname></codeblock> |
| |
| <codeblock>primitive_type: |
| TINYINT |
| | SMALLINT |
| | INT |
| | BIGINT |
| | BOOLEAN |
| | FLOAT |
| | DOUBLE |
| <ph rev="1.4.0">| DECIMAL</ph> |
| | STRING |
| <ph rev="2.0.0">| CHAR</ph> |
| <ph rev="2.0.0">| VARCHAR</ph> |
| | TIMESTAMP |
| |
| <ph rev="2.3.0">complex_type: |
| struct_type |
| | array_type |
| | map_type |
| |
| struct_type: STRUCT < <varname>name</varname> : <varname>primitive_or_complex_type</varname> [COMMENT '<varname>comment_string</varname>'], ... > |
| |
| array_type: ARRAY < <varname>primitive_or_complex_type</varname> > |
| |
| map_type: MAP < <varname>primitive_type</varname>, <varname>primitive_or_complex_type</varname> > |
| </ph> |
| row_format: |
| DELIMITED [FIELDS TERMINATED BY '<varname>char</varname>' [ESCAPED BY '<varname>char</varname>']] |
| [LINES TERMINATED BY '<varname>char</varname>'] |
| |
| file_format: |
| PARQUET |
| | TEXTFILE |
| | AVRO |
| | SEQUENCEFILE |
| | RCFILE |
| |
| <ph rev="CDH-41501">ctas_file_format: |
| PARQUET |
| | TEXTFILE</ph> |
| </codeblock> |
| |
| <p conref="../shared/impala_common.xml#common/ddl_blurb"/> |
| |
| <!-- Should really have some info up front about all the data types and file formats. |
| Consider adding here, or at least making inline links to the relevant keywords |
| in the syntax spec above. --> |
| |
| <p> |
| <b>Column definitions:</b> |
| </p> |
| |
| <p> |
| Depending on the form of the <codeph>CREATE TABLE</codeph> statement, the column definitions are |
| required or not allowed. |
| </p> |
| |
| <p> |
| With the <codeph>CREATE TABLE AS SELECT</codeph> and <codeph>CREATE TABLE LIKE</codeph> |
| syntax, you do not specify the columns at all; the column names and types are derived from the source table, query, |
| or data file. |
| </p> |
| |
| <p> |
| With the basic <codeph>CREATE TABLE</codeph> syntax, you must list one or more columns, |
| specifying for each column its name, type, and optionally a comment, in addition to any columns used as partitioning keys. |
| There is one exception where the column list is not required: when creating an Avro table with the |
| <codeph>STORED AS AVRO</codeph> clause, you can omit the list of columns and specify the same metadata |
| as part of the <codeph>TBLPROPERTIES</codeph> clause. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/complex_types_blurb"/> |
| |
| <p rev="2.3.0"> |
| The Impala complex types (<codeph>STRUCT</codeph>, <codeph>ARRAY</codeph>, or <codeph>MAP</codeph>) |
| are available in <keyword keyref="impala23_full"/> and higher. |
| Because you can nest these types (for example, to make an array of maps or a struct |
| with an array field), these types are also sometimes referred to as nested types. |
| See <xref href="impala_complex_types.xml#complex_types"/> for usage details. |
| </p> |
| |
| <!-- This is kind of an obscure and rare usage scenario. Consider moving all the complex type stuff further down |
| after some of the more common clauses. --> |
| <p rev="2.3.0"> |
| Impala can create tables containing complex type columns, with any supported file format. |
| Because currently Impala can only query complex type columns in Parquet tables, creating |
| tables with complex type columns and other file formats such as text is of limited use. |
| For example, you might create a text table including some columns with complex types with Impala, and use Hive |
| as part of your ETL pipeline to ingest the nested type data and copy it to an identical Parquet table. |
| Or you might create a partitioned table containing complex type columns using one file format, and |
| use <codeph>ALTER TABLE</codeph> to change the file format of individual partitions to Parquet; Impala |
| can then query only the Parquet-format partitions in that table. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/complex_types_partitioning"/> |
| |
| <p> |
| <b>Internal and external tables (EXTERNAL and LOCATION clauses):</b> |
| </p> |
| |
| <p> |
| By default, Impala creates an <q>internal</q> table, where Impala manages the underlying data files for the |
| table, and physically deletes the data files when you drop the table. If you specify the |
| <codeph>EXTERNAL</codeph> clause, Impala treats the table as an <q>external</q> table, where the data files |
| are typically produced outside Impala and queried from their original locations in HDFS, and Impala leaves |
| the data files in place when you drop the table. For details about internal and external tables, see |
| <xref href="impala_tables.xml#tables"/>. |
| </p> |
| |
| <p> |
| Typically, for an external table you include a <codeph>LOCATION</codeph> clause to specify the path to the |
| HDFS directory where Impala reads and writes files for the table. For example, if your data pipeline produces |
| Parquet files in the HDFS directory <filepath>/user/etl/destination</filepath>, you might create an external |
| table as follows: |
| </p> |
| |
| <codeblock>CREATE EXTERNAL TABLE external_parquet (c1 INT, c2 STRING, c3 TIMESTAMP) |
| STORED AS PARQUET LOCATION '/user/etl/destination'; |
| </codeblock> |
| |
| <p> |
| Although the <codeph>EXTERNAL</codeph> and <codeph>LOCATION</codeph> clauses are often specified together, |
| <codeph>LOCATION</codeph> is optional for external tables, and you can also specify <codeph>LOCATION</codeph> |
| for internal tables. The difference is all about whether Impala <q>takes control</q> of the underlying data |
| files and moves them when you rename the table, or deletes them when you drop the table. For more about |
| internal and external tables and how they interact with the <codeph>LOCATION</codeph> attribute, see |
| <xref href="impala_tables.xml#tables"/>. |
| </p> |
| |
| <p> |
| <b>Partitioned tables (PARTITIONED BY clause):</b> |
| </p> |
| |
| <p> |
| The <codeph>PARTITIONED BY</codeph> clause divides the data files based on the values from one or more |
| specified columns. Impala queries can use the partition metadata to minimize the amount of data that is read |
| from disk or transmitted across the network, particularly during join queries. For details about |
| partitioning, see <xref href="impala_partitioning.xml#partitioning"/>. |
| </p> |
| |
| <p rev="2.5.0"> |
| Prior to <keyword keyref="impala25_full"/> you could use a partitioned table |
| as the source and copy data from it, but could not specify any partitioning clauses for the new table. |
| In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a |
| <codeph>CREATE TABLE AS SELECT</codeph> statement. See the examples under the following discussion of |
| the <codeph>CREATE TABLE AS SELECT</codeph> syntax variation. |
| </p> |
| |
| <!-- |
| <p rev="kudu"> |
| <b>Partitioning for Kudu tables (PARTITION BY clause)</b> |
| </p> |
| |
| <p rev="kudu"> |
| For Kudu tables, you specify logical partitioning across one or more columns using the |
| <codeph>PARTITION BY</codeph> clause. In contrast to partitioning for HDFS-based tables, |
| multiple values for a partition key column can be located in the same partition. |
| The optional <codeph>HASH</codeph> clause lets you divide one or a set of partition key columns |
| into a specified number of partitions; you can use more than one <codeph>HASH</codeph> |
| clause, specifying a distinct set of partition key columns for each. |
| The optional <codeph>RANGE</codeph> clause further subdivides the partitions, based on |
| a range of values for the partition key columns. |
| </p> |
| --> |
| |
| <p> |
| <b>Specifying file format (STORED AS and ROW FORMAT clauses):</b> |
| </p> |
| |
| <p rev="DOCS-1523"> |
| The <codeph>STORED AS</codeph> clause identifies the format of the underlying data files. Currently, Impala |
| can query more types of file formats than it can create or insert into. Use Hive to perform any create or |
| data load operations that are not currently available in Impala. For example, Impala can create an Avro, |
| SequenceFile, or RCFile table but cannot insert data into it. There are also Impala-specific procedures for using |
| compression with each kind of file format. For details about working with data files of various formats, see |
| <xref href="impala_file_formats.xml#file_formats"/>. |
| </p> |
| |
| <note> |
| In Impala 1.4.0 and higher, Impala can create Avro tables, which formerly required doing the <codeph>CREATE |
| TABLE</codeph> statement in Hive. See <xref href="impala_avro.xml#avro"/> for details and examples. |
| </note> |
| |
| <p> |
| By default (when no <codeph>STORED AS</codeph> clause is specified), data files in Impala tables are created |
| as text files with Ctrl-A (hex 01) characters as the delimiter. |
| <!-- Verify if ROW FORMAT is entirely ignored outside of text tables, or does it apply somehow to SequenceFile and/or RCFile too? --> |
| Specify the <codeph>ROW FORMAT DELIMITED</codeph> clause to produce or ingest data files that use a different |
| delimiter character such as tab or <codeph>|</codeph>, or a different line end character such as carriage |
| return or newline. When specifying delimiter and line end characters with the <codeph>FIELDS TERMINATED |
| BY</codeph> and <codeph>LINES TERMINATED BY</codeph> clauses, use <codeph>'\t'</codeph> for tab, |
| <codeph>'\n'</codeph> for newline or linefeed, <codeph>'\r'</codeph> for carriage return, and |
| <codeph>\</codeph><codeph>0</codeph> for ASCII <codeph>nul</codeph> (hex 00). For more examples of text |
| tables, see <xref href="impala_txtfile.xml#txtfile"/>. |
| </p> |
| |
| <p> |
| The <codeph>ESCAPED BY</codeph> clause applies both to text files that you create through an |
| <codeph>INSERT</codeph> statement to an Impala <codeph>TEXTFILE</codeph> table, and to existing data files |
| that you put into an Impala table directory. (You can ingest existing data files either by creating the table |
| with <codeph>CREATE EXTERNAL TABLE ... LOCATION</codeph>, the <codeph>LOAD DATA</codeph> statement, or |
| through an HDFS operation such as <codeph>hdfs dfs -put <varname>file</varname> |
| <varname>hdfs_path</varname></codeph>.) Choose an escape character that is not used anywhere else in the |
| file, and put it in front of each instance of the delimiter character that occurs within a field value. |
| Surrounding field values with quotation marks does not help Impala to parse fields with embedded delimiter |
| characters; the quotation marks are considered to be part of the column value. If you want to use |
| <codeph>\</codeph> as the escape character, specify the clause in <cmdname>impala-shell</cmdname> as |
| <codeph>ESCAPED BY '\\'</codeph>. |
| </p> |
| |
| <note conref="../shared/impala_common.xml#common/thorn"/> |
| |
| <p> |
| <b>Cloning tables (LIKE clause):</b> |
| </p> |
| |
| <p> |
| To create an empty table with the same columns, comments, and other attributes as another table, use the |
| following variation. The <codeph>CREATE TABLE ... LIKE</codeph> form allows a restricted set of clauses, |
| currently only the <codeph>LOCATION</codeph>, <codeph>COMMENT</codeph>, and <codeph>STORED AS</codeph> |
| clauses. |
| </p> |
| |
| <codeblock>CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [<varname>db_name</varname>.]<varname>table_name</varname> |
| <ph rev="1.4.0">LIKE { [<varname>db_name</varname>.]<varname>table_name</varname> | PARQUET '<varname>hdfs_path_of_parquet_file</varname>' }</ph> |
| [COMMENT '<varname>table_comment</varname>'] |
| [STORED AS <varname>file_format</varname>] |
| [LOCATION '<varname>hdfs_path</varname>']</codeblock> |
| |
| <note rev="1.2.0"> |
| <p rev="1.2.0"> |
| To clone the structure of a table and transfer data into it in a single operation, use the <codeph>CREATE |
| TABLE AS SELECT</codeph> syntax described in the next subsection. |
| </p> |
| </note> |
| |
| <p> |
| When you clone the structure of an existing table using the <codeph>CREATE TABLE ... LIKE</codeph> syntax, |
| the new table keeps the same file format as the original one, so you only need to specify the <codeph>STORED |
| AS</codeph> clause if you want to use a different file format, or when specifying a view as the original |
| table. (Creating a table <q>like</q> a view produces a text table by default.) |
| </p> |
| |
| <p> |
| Although normally Impala cannot create an HBase table directly, Impala can clone the structure of an existing |
| HBase table with the <codeph>CREATE TABLE ... LIKE</codeph> syntax, preserving the file format and metadata |
| from the original table. |
| </p> |
| |
| <p> |
| There are some exceptions to the ability to use <codeph>CREATE TABLE ... LIKE</codeph> with an Avro table. |
| For example, you cannot use this technique for an Avro table that is specified with an Avro schema but no |
| columns. When in doubt, check if a <codeph>CREATE TABLE ... LIKE</codeph> operation works in Hive; if not, it |
| typically will not work in Impala either. |
| </p> |
| |
| <p> |
| If the original table is partitioned, the new table inherits the same partition key columns. Because the new |
| table is initially empty, it does not inherit the actual partitions that exist in the original one. To create |
| partitions in the new table, insert data or issue <codeph>ALTER TABLE ... ADD PARTITION</codeph> statements. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/create_table_like_view"/> |
| |
| <p> |
| Because <codeph>CREATE TABLE ... LIKE</codeph> only manipulates table metadata, not the physical data of the |
| table, issue <codeph>INSERT INTO TABLE</codeph> statements afterward to copy any data from the original table |
| into the new one, optionally converting the data to a new file format. (For some file formats, Impala can do |
| a <codeph>CREATE TABLE ... LIKE</codeph> to create the table, but Impala cannot insert data in that file |
| format; in these cases, you must load the data in Hive. See |
| <xref href="impala_file_formats.xml#file_formats"/> for details.) |
| </p> |
| |
| <p rev="1.2" id="ctas"> |
| <b>CREATE TABLE AS SELECT:</b> |
| </p> |
| |
| <p> |
| The <codeph>CREATE TABLE AS SELECT</codeph> syntax is a shorthand notation to create a table based on column |
| definitions from another table, and copy data from the source table to the destination table without issuing |
| any separate <codeph>INSERT</codeph> statement. This idiom is so popular that it has its own acronym, |
| <q>CTAS</q>. |
| </p> |
| |
| <p> |
| The following examples show how to copy data from a source table <codeph>T1</codeph> |
| to a variety of destination tables, applying various transformations to the table |
| properties, table layout, or the data itself as part of the operation: |
| </p> |
| |
| <codeblock> |
| -- Sample table to be the source of CTAS operations. |
| CREATE TABLE t1 (x INT, y STRING); |
| INSERT INTO t1 VALUES (1, 'one'), (2, 'two'), (3, 'three'); |
| |
| -- Clone all the columns and data from one table to another. |
| CREATE TABLE clone_of_t1 AS SELECT * FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Clone the columns and data, and convert the data to a different file format. |
| CREATE TABLE parquet_version_of_t1 STORED AS PARQUET AS SELECT * FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Copy only some rows to the new table. |
| CREATE TABLE subset_of_t1 AS SELECT * FROM t1 WHERE x >= 2; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 2 row(s) | |
| +-------------------+ |
| |
| -- Same idea as CREATE TABLE LIKE: clone table layout but do not copy any data. |
| CREATE TABLE empty_clone_of_t1 AS SELECT * FROM t1 WHERE 1=0; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 0 row(s) | |
| +-------------------+ |
| |
| -- Reorder and rename columns and transform the data. |
| CREATE TABLE t5 AS SELECT upper(y) AS s, x+1 AS a, 'Entirely new column' AS n FROM t1; |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| SELECT * FROM t5; |
| +-------+---+---------------------+ |
| | s | a | n | |
| +-------+---+---------------------+ |
| | ONE | 2 | Entirely new column | |
| | TWO | 3 | Entirely new column | |
| | THREE | 4 | Entirely new column | |
| +-------+---+---------------------+ |
| </codeblock> |
| |
| <!-- These are a little heavyweight to get into here. Therefore commenting out. |
| Some overlap with the new column-changing examples in the code listing above. |
| Create tables with different column order, names, or types than the original. |
| CREATE TABLE some_columns_from_t1 AS SELECT c1, c3, c5 FROM t1; |
| CREATE TABLE reordered_columns_from_t1 AS SELECT c4, c3, c1, c2 FROM t1; |
| CREATE TABLE synthesized_columns AS SELECT upper(c1) AS all_caps, c2+c3 AS total, "California" AS state FROM t1;</codeblock> |
| --> |
| |
| <!-- CREATE TABLE AS <select> now incorporated up higher in the original syntax diagram. --> |
| |
| <p rev="1.2"> |
| See <xref href="impala_select.xml#select"/> for details about query syntax for the <codeph>SELECT</codeph> |
| portion of a <codeph>CREATE TABLE AS SELECT</codeph> statement. |
| </p> |
| |
| <p rev="1.2"> |
| The newly created table inherits the column names that you select from the original table, which you can |
| override by specifying column aliases in the query. Any column or table comments from the original table are |
| not carried over to the new table. |
| </p> |
| |
| <note rev="DOCS-1523"> |
| When using the <codeph>STORED AS</codeph> clause with a <codeph>CREATE TABLE AS SELECT</codeph> |
| statement, the destination table must be a file format that Impala can write to: currently, |
| text or Parquet. You cannot specify an Avro, SequenceFile, or RCFile table as the destination |
| table for a CTAS operation. |
| </note> |
| |
| <p rev="2.5.0"> |
| Prior to <keyword keyref="impala25_full"/> you could use a partitioned table |
| as the source and copy data from it, but could not specify any partitioning clauses for the new table. |
| In <keyword keyref="impala25_full"/> and higher, you can now use the <codeph>PARTITIONED BY</codeph> clause with a |
| <codeph>CREATE TABLE AS SELECT</codeph> statement. The following example demonstrates how you can copy |
| data from an unpartitioned table in a <codeph>CREATE TABLE AS SELECT</codeph> operation, creating a new |
| partitioned table in the process. The main syntax consideration is the column order in the <codeph>PARTITIONED BY</codeph> |
| clause and the select list: the partition key columns must be listed last in the select list, in the same |
| order as in the <codeph>PARTITIONED BY</codeph> clause. Therefore, in this case, the column order in the |
| destination table is different from the source table. You also only specify the column names in the |
| <codeph>PARTITIONED BY</codeph> clause, not the data types or column comments. |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| create table partitions_no (year smallint, month tinyint, s string); |
| insert into partitions_no values (2016, 1, 'January 2016'), |
| (2016, 2, 'February 2016'), (2016, 3, 'March 2016'); |
| |
| -- Prove that the source table is not partitioned. |
| show partitions partitions_no; |
| ERROR: AnalysisException: Table is not partitioned: ctas_partition_by.partitions_no |
| |
| -- Create new table with partitions based on column values from source table. |
| <b>create table partitions_yes partitioned by (year, month) |
| as select s, year, month from partitions_no;</b> |
| +-------------------+ |
| | summary | |
| +-------------------+ |
| | Inserted 3 row(s) | |
| +-------------------+ |
| |
| -- Prove that the destination table is partitioned. |
| show partitions partitions_yes; |
| +-------+-------+-------+--------+------+... |
| | year | month | #Rows | #Files | Size |... |
| +-------+-------+-------+--------+------+... |
| | 2016 | 1 | -1 | 1 | 13B |... |
| | 2016 | 2 | -1 | 1 | 14B |... |
| | 2016 | 3 | -1 | 1 | 11B |... |
| | Total | | -1 | 3 | 38B |... |
| +-------+-------+-------+--------+------+... |
| </codeblock> |
| |
| <p rev="2.5.0"> |
| The most convenient layout for partitioned tables is with all the |
| partition key columns at the end. The CTAS <codeph>PARTITIONED BY</codeph> syntax |
| requires that the partition key columns come last in the select list, resulting in that same |
| column order in the destination table. |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| describe partitions_no; |
| +-------+----------+---------+ |
| | name | type | comment | |
| +-------+----------+---------+ |
| | year | smallint | | |
| | month | tinyint | | |
| | s | string | | |
| +-------+----------+---------+ |
| |
| -- The CTAS operation forced us to put the partition key columns last. |
| -- Having those columns last works better with idioms such as SELECT * |
| -- for partitioned tables. |
| describe partitions_yes; |
| +-------+----------+---------+ |
| | name | type | comment | |
| +-------+----------+---------+ |
| | s | string | | |
| | year | smallint | | |
| | month | tinyint | | |
| +-------+----------+---------+ |
| </codeblock> |
| |
| <p rev="2.5.0"> |
| Attempting to use a select list with the partition key columns |
| not at the end results in an error due to a column name mismatch: |
| </p> |
| |
| <codeblock rev="2.5.0"> |
| -- We expect this CTAS to fail because non-key column S |
| -- comes after key columns YEAR and MONTH in the select list. |
| create table partitions_maybe partitioned by (year, month) |
| as select year, month, s from partitions_no; |
| ERROR: AnalysisException: Partition column name mismatch: year != month |
| </codeblock> |
| |
| <p rev="1.2"> |
| For example, the preceding statements show how you can clone all the data in a table, or a subset of the |
| columns and/or rows, or reorder columns, rename them, or construct them out of expressions. |
| </p> |
| |
| <p rev="1.2"> |
| As part of a CTAS operation, you can convert the data to any file format that Impala can write (currently, |
| <codeph>TEXTFILE</codeph> and <codeph>PARQUET</codeph>). You cannot specify the lower-level properties of a |
| text table, such as the delimiter. |
| </p> |
| |
| <p rev="obwl" conref="../shared/impala_common.xml#common/insert_sort_blurb"/> |
| |
| <p rev="1.4.0"> |
| <b>CREATE TABLE LIKE PARQUET:</b> |
| </p> |
| |
| <p rev="1.4.0"> |
| The variation <codeph>CREATE TABLE ... LIKE PARQUET '<varname>hdfs_path_of_parquet_file</varname>'</codeph> |
| lets you skip the column definitions of the <codeph>CREATE TABLE</codeph> statement. The column names and |
| data types are automatically configured based on the organization of the specified Parquet data file, which |
| must already reside in HDFS. You can use a data file located outside the Impala database directories, or a |
| file from an existing Impala Parquet table; either way, Impala only uses the column definitions from the file |
| and does not use the HDFS location for the <codeph>LOCATION</codeph> attribute of the new table. (Although |
| you can also specify the enclosing directory with the <codeph>LOCATION</codeph> attribute, to both use the |
| same schema as the data file and point the Impala table at the associated directory for querying.) |
| </p> |
| |
| <p rev="1.4.0"> |
| The following considerations apply when you use the <codeph>CREATE TABLE LIKE PARQUET</codeph> technique: |
| </p> |
| |
| <ul rev="1.4.0"> |
| <li> |
| Any column comments from the original table are not preserved in the new table. Each column in the new |
| table has a comment stating the low-level Parquet field type used to deduce the appropriate SQL column |
| type. |
| </li> |
| |
| <li> |
| If you use a data file from a partitioned Impala table, any partition key columns from the original table |
| are left out of the new table, because they are represented in HDFS directory names rather than stored in |
| the data file. To preserve the partition information, repeat the same <codeph>PARTITION</codeph> clause as |
| in the original <codeph>CREATE TABLE</codeph> statement. |
| </li> |
| |
| <li> |
| The file format of the new table defaults to text, as with other kinds of <codeph>CREATE TABLE</codeph> |
| statements. To make the new table also use Parquet format, include the clause <codeph>STORED AS |
| PARQUET</codeph> in the <codeph>CREATE TABLE LIKE PARQUET</codeph> statement. |
| </li> |
| |
| <li> |
| If the Parquet data file comes from an existing Impala table, currently, any <codeph>TINYINT</codeph> or |
| <codeph>SMALLINT</codeph> columns are turned into <codeph>INT</codeph> columns in the new table. |
| Internally, Parquet stores such values as 32-bit integers. |
| </li> |
| |
| <li> |
| When the destination table uses the Parquet file format, the <codeph>CREATE TABLE AS SELECT</codeph> and |
| <codeph>INSERT ... SELECT</codeph> statements always create at least one data file, even if the |
| <codeph>SELECT</codeph> part of the statement does not match any rows. You can use such an empty Parquet |
| data file as a template for subsequent <codeph>CREATE TABLE LIKE PARQUET</codeph> statements. |
| </li> |
| </ul> |
| |
| <p> |
| For more details about creating Parquet tables, and examples of the <codeph>CREATE TABLE LIKE |
| PARQUET</codeph> syntax, see <xref href="impala_parquet.xml#parquet"/>. |
| </p> |
| |
| <p> |
| <b>Visibility and Metadata (TBLPROPERTIES and WITH SERDEPROPERTIES clauses):</b> |
| </p> |
| |
| <p rev="1.2"> |
| You can associate arbitrary items of metadata with a table by specifying the <codeph>TBLPROPERTIES</codeph> |
| clause. This clause takes a comma-separated list of key-value pairs and stores those items in the metastore |
| database. You can also change the table properties later with an <codeph>ALTER TABLE</codeph> statement. You |
| can observe the table properties for different delimiter and escape characters using the <codeph>DESCRIBE |
| FORMATTED</codeph> command, and change those settings for an existing table with <codeph>ALTER TABLE ... SET |
| TBLPROPERTIES</codeph>. |
| </p> |
| |
| <p rev="1.2"> |
| You can also associate SerDes properties with the table by specifying key-value pairs through the |
| <codeph>WITH SERDEPROPERTIES</codeph> clause. This metadata is not used by Impala, which has its own built-in |
| serializer and deserializer for the file formats it supports. Particular property values might be needed for |
| Hive compatibility with certain variations of file formats, particularly Avro. |
| </p> |
| |
| <p> |
| Some DDL operations that interact with other Hadoop components require specifying particular values in the |
| <codeph>SERDEPROPERTIES</codeph> or <codeph>TBLPROPERTIES</codeph> fields, such as creating an Avro table or |
| an HBase table. (You typically create HBase tables in Hive, because they require additional clauses not |
| currently available in Impala.) |
| <!-- Haven't got a working example from Lenni, so suppressing this recommendation for now. |
| The Avro schema properties can be specified through either |
| <codeph>TBLPROPERTIES</codeph> or <codeph>SERDEPROPERTIES</codeph>; |
| for best compatibility with future versions of Hive, |
| use <codeph>SERDEPROPERTIES</codeph> in this case. |
| --> |
| </p> |
| |
| <p> |
| To see the column definitions and column comments for an existing table, for example before issuing a |
| <codeph>CREATE TABLE ... LIKE</codeph> or a <codeph>CREATE TABLE ... AS SELECT</codeph> statement, issue the |
| statement <codeph>DESCRIBE <varname>table_name</varname></codeph>. To see even more detail, such as the |
| location of data files and the values for clauses such as <codeph>ROW FORMAT</codeph> and <codeph>STORED |
| AS</codeph>, issue the statement <codeph>DESCRIBE FORMATTED <varname>table_name</varname></codeph>. |
| <codeph>DESCRIBE FORMATTED</codeph> is also needed to see any overall table comment (as opposed to individual |
| column comments). |
| </p> |
| |
| <p> |
| After creating a table, your <cmdname>impala-shell</cmdname> session or another |
<cmdname>impala-shell</cmdname> session connected to the same node can immediately query that table. There might be a
| brief interval (one statestore heartbeat) before the table can be queried through a different Impala node. To |
| make the <codeph>CREATE TABLE</codeph> statement return only when the table is recognized by all Impala nodes |
| in the cluster, enable the <codeph>SYNC_DDL</codeph> query option. |
| </p> |
| |
| <p rev="1.4.0"> |
| <b>HDFS caching (CACHED IN clause):</b> |
| </p> |
| |
| <p rev="1.4.0"> |
| If you specify the <codeph>CACHED IN</codeph> clause, any existing or future data files in the table |
| directory or the partition subdirectories are designated to be loaded into memory with the HDFS caching |
| mechanism. See <xref href="impala_perf_hdfs_caching.xml#hdfs_caching"/> for details about using the HDFS |
| caching feature. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/impala_cache_replication_factor"/> |
| |
| <!-- Say something in here about the SHOW statement, e.g. SHOW TABLES, SHOW TABLE/COLUMN STATS, SHOW PARTITIONS. --> |
| |
| <p> |
| <b>Column order</b>: |
| </p> |
| |
| <p> |
| If you intend to use the table to hold data files produced by some external source, specify the columns in |
| the same order as they appear in the data files. |
| </p> |
| |
| <p> |
| If you intend to insert or copy data into the table through Impala, or if you have control over the way |
| externally produced data files are arranged, use your judgment to specify columns in the most convenient |
| order: |
| </p> |
| |
| <ul> |
| <li> |
| <p> |
| If certain columns are often <codeph>NULL</codeph>, specify those columns last. You might produce data |
| files that omit these trailing columns entirely. Impala automatically fills in the <codeph>NULL</codeph> |
| values if so. |
| </p> |
| </li> |
| |
| <li> |
| <p> |
| If an unpartitioned table will be used as the source for an <codeph>INSERT ... SELECT</codeph> operation |
| into a partitioned table, specify last in the unpartitioned table any columns that correspond to |
| partition key columns in the partitioned table, and in the same order as the partition key columns are |
| declared in the partitioned table. This technique lets you use <codeph>INSERT ... SELECT *</codeph> when |
| copying data to the partitioned table, rather than specifying each column name individually. |
| </p> |
| </li> |
| |
| <li> |
| <p> |
| If you specify columns in an order that you later discover is suboptimal, you can sometimes work around |
| the problem without recreating the table. You can create a view that selects columns from the original |
| table in a permuted order, then do a <codeph>SELECT *</codeph> from the view. When inserting data into a |
| table, you can specify a permuted order for the inserted columns to match the order in the destination |
| table. |
| </p> |
| </li> |
| </ul> |
| |
| <p conref="../shared/impala_common.xml#common/hive_blurb"/> |
| |
| <p> |
| Impala queries can make use of metadata about the table and columns, such as the number of rows in a table or |
the number of different values in a column. Prior to Impala 1.2.2, you gathered this information by issuing the
<codeph>ANALYZE TABLE</codeph> statement in Hive, after creating the table and
| loading representative data into it. In Impala 1.2.2 and higher, the <codeph>COMPUTE STATS</codeph> statement |
| produces these statistics within Impala, without needing to use Hive at all. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/hbase_blurb"/> |
| |
| <note> |
| <p> |
| The Impala <codeph>CREATE TABLE</codeph> statement cannot create an HBase table, because it currently does |
| not support the <codeph>STORED BY</codeph> clause needed for HBase tables. Create such tables in Hive, then |
| query them through Impala. For information on using Impala with HBase tables, see |
| <xref href="impala_hbase.xml#impala_hbase"/>. |
| </p> |
| </note> |
| |
| <p conref="../shared/impala_common.xml#common/s3_blurb"/> |
| <p rev="2.2.0"> |
| To create a table where the data resides in the Amazon Simple Storage Service (S3), |
specify an <codeph>s3a://</codeph> prefix in the <codeph>LOCATION</codeph> attribute pointing to the data files in S3.
| </p> |
| |
| <p rev="2.6.0 CDH-39913 IMPALA-1878"> |
| In <keyword keyref="impala26_full"/> and higher, you can |
| use this special <codeph>LOCATION</codeph> syntax |
| as part of a <codeph>CREATE TABLE AS SELECT</codeph> statement. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/s3_ddl"/> |
| |
| <p conref="../shared/impala_common.xml#common/insert_sort_blurb"/> |
| |
| <p conref="../shared/impala_common.xml#common/hdfs_blurb"/> |
| |
| <p> |
| The <codeph>CREATE TABLE</codeph> statement for an internal table creates a directory in HDFS. The |
| <codeph>CREATE EXTERNAL TABLE</codeph> statement associates the table with an existing HDFS directory, and |
| does not create any new directory in HDFS. To locate the HDFS data directory for a table, issue a |
| <codeph>DESCRIBE FORMATTED <varname>table</varname></codeph> statement. To examine the contents of that HDFS |
| directory, use an OS command such as <codeph>hdfs dfs -ls hdfs://<varname>path</varname></codeph>, either |
| from the OS command line or through the <codeph>shell</codeph> or <codeph>!</codeph> commands in |
| <cmdname>impala-shell</cmdname>. |
| </p> |
| |
| <p> |
| The <codeph>CREATE TABLE AS SELECT</codeph> syntax creates data files under the table data directory to hold |
| any data copied by the <codeph>INSERT</codeph> portion of the statement. (Even if no data is copied, Impala |
| might create one or more empty data files.) |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/permissions_blurb"/> |
| <p rev="CDH-19187"> |
| The user ID that the <cmdname>impalad</cmdname> daemon runs under, |
| typically the <codeph>impala</codeph> user, must have both execute and write |
| permission for the database directory where the table is being created. |
| </p> |
| |
| <p conref="../shared/impala_common.xml#common/security_blurb"/> |
| <p conref="../shared/impala_common.xml#common/redaction_yes"/> |
| |
| <p conref="../shared/impala_common.xml#common/cancel_blurb_maybe"/> |
| |
| <p conref="../shared/impala_common.xml#common/related_info"/> |
| |
| <p> |
| <xref href="impala_tables.xml#tables"/>, |
| <xref href="impala_alter_table.xml#alter_table"/>, <xref href="impala_drop_table.xml#drop_table"/>, |
| <xref href="impala_partitioning.xml#partitioning"/>, <xref href="impala_tables.xml#internal_tables"/>, |
| <xref href="impala_tables.xml#external_tables"/>, <xref href="impala_compute_stats.xml#compute_stats"/>, |
| <xref href="impala_sync_ddl.xml#sync_ddl"/>, <xref href="impala_show.xml#show_tables"/>, |
| <xref href="impala_show.xml#show_create_table"/>, <xref href="impala_describe.xml#describe"/> |
| </p> |
| </conbody> |
| </concept> |