blob: 8933537e03fe20a7e374728d876da36a26996441 [file] [log] [blame]
<!--
$PostgreSQL: pgsql/doc/src/sgml/ref/create_external_table.sgml,v 1.105 2006/09/16 00:30:17 momjian Exp $
PostgreSQL documentation
-->
<refentry id="SQL-CREATEEXTERNALTABLE">
<refmeta>
<refentrytitle id="sql-createexternaltable-title">CREATE EXTERNAL TABLE</refentrytitle>
<refmiscinfo>SQL - Language Statements</refmiscinfo>
</refmeta>
<refnamediv>
<refname>CREATE EXTERNAL TABLE</refname>
<refpurpose>define a new external table</refpurpose>
</refnamediv>
<indexterm zone="sql-createexternaltable">
<primary>CREATE EXTERNAL TABLE</primary>
</indexterm>
<refsynopsisdiv>
<synopsis>
CREATE [READABLE] EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('file://seghost[:port]/path/file' [, ...])
| ('gpfdist://filehost[:port]/file_pattern[#transform]'
| ('gpfdists://filehost[:port]/file_pattern[#transform]'
[, ...])
| ('gphdfs://hdfs_host[:port]/path/file')
FORMAT 'TEXT'
[( [HEADER]
[DELIMITER [AS] 'delimiter' | 'OFF']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CSV'
[( [HEADER]
[QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE NOT NULL column [, ...]]
[ESCAPE [AS] 'escape']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'encoding' ]
[ [LOG ERRORS INTO error_table] SEGMENT REJECT LIMIT count
[ROWS | PERCENT] ]
CREATE [READABLE] EXTERNAL WEB TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION ('http://webhost[:port]/path/file' [, ...])
| EXECUTE 'command' [ON ALL
| MASTER
| number_of_segments
| HOST ['segment_hostname']
| SEGMENT segment_id ]
FORMAT 'TEXT'
[( [HEADER]
[DELIMITER [AS] 'delimiter' | 'OFF']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CSV'
[( [HEADER]
[QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE NOT NULL column [, ...]]
[ESCAPE [AS] 'escape']
[NEWLINE [ AS ] 'LF' | 'CR' | 'CRLF']
[FILL MISSING FIELDS] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'encoding' ]
[ [LOG ERRORS INTO error_table] SEGMENT REJECT LIMIT count
[ROWS | PERCENT] ]
CREATE WRITABLE EXTERNAL TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
LOCATION('gpfdist://outputhost[:port]/filename[#transform]'
| ('gpfdists://outputhost[:port]/file_pattern[#transform]'
[, ...])
| ('gphdfs://hdfs_host[:port]/path')
FORMAT 'TEXT'
[( [DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF'] )]
| 'CSV'
[([QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE QUOTE column [, ...]] ]
[ESCAPE [AS] 'escape'] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'write_encoding' ]
[ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]
CREATE WRITABLE EXTERNAL WEB TABLE table_name
( column_name data_type [, ...] | LIKE other_table )
EXECUTE 'command' [ON ALL]
FORMAT 'TEXT'
[( [DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[ESCAPE [AS] 'escape' | 'OFF'] )]
| 'CSV'
[([QUOTE [AS] 'quote']
[DELIMITER [AS] 'delimiter']
[NULL [AS] 'null string']
[FORCE QUOTE column [, ...]] ]
[ESCAPE [AS] 'escape'] )]
| 'CUSTOM' (Formatter=<formatter specifications>)
[ ENCODING 'write_encoding' ]
[ DISTRIBUTED BY (column, [ ... ] ) | DISTRIBUTED RANDOMLY ]
</synopsis>
</refsynopsisdiv>
<refsect1 id="SQL-CREATEEXTERNALTABLE-description">
<title>Description</title>
<para>
<command>CREATE EXTERNAL TABLE</command> creates an external table definition.
External tables are comprised of flat files that reside outside of the database.
Creating an external table allows you to access these flat files as though they
were a regular database table. External table data can be queried directly (and in parallel)
using SQL commands. You can, for example, select, join, or sort external table data.
You can also create views and synonyms for external tables. However external tables
are read-only. DML operations (<command>UPDATE</command>, <command>INSERT</command>, <command>DELETE</command>, or <command>TRUNCATE</command>) are not allowed,
and you cannot create indexes on external tables.
</para>
<para>
An external table definition can be thought of as a view that allows running any
SQL query against external data without requiring that data to first be loaded
into the database. External tables provide an easy way to perform basic extraction,
transformation, and loading (ETL) tasks that are common in data warehousing.
External table files are read in parallel by the Greenplum Database segment instances,
so they also provide a means for fast data loading. External tables should not be used
for frequently queried tables.
</para>
<para>
You may specify multiple external data sources or URIs (uniform resource identifiers)
with the <literal>LOCATION</literal> clause — up to the number of primary segment instances
in your Greenplum Database array. Each URI points to an external data file or other data
source. These URIs do not need to exist prior to defining an external table (<command>CREATE EXTERNAL TABLE</command>
does not validate the URIs specified). However you will get an error if they cannot be found
when querying the external table.
</para>
<para>
There are three protocols that you can use to access the external table data sources.
You may use one of the following protocols per <command>CREATE EXTERNAL TABLE</command> statement
(cannot mix protocols):
</para>
<para>
gpfdist — If using the <literal>gpfdist://</literal> protocol, you must have the
Greenplum file distribution program (<literal>gpfdist</literal>) running on the host where the external
data files reside. This program points to a given directory on the file host and
serves external data files to all Greenplum Database segments in parallel.
All primary segments access the external file(s) in parallel regardless of how
many URIs you specify when defining the external table. <literal>gpfdist</literal>
is located in <literal>$GPHOME/bin<literal> on your Greenplum Database master host.
</para>
<para>
file — If using the <literal>file://</literal> protocol the external data file(s)
must reside on a segment host in a location accessible by the Greenplum super user
(gpadmin). The number of URIs specified corresponds to the number of segment
instances that will work in parallel to access the external table. So for
example, if you have a Greenplum Database system with 8 primary segments and
you specify 2 external files, only 2 of the 8 segments will access the external
table in parallel at query time. The number of external files per segment host
cannot exceed the number of primary segment instances on that host. For example,
if your array has 4 primary segment instances per segment host, you may place
4 external files on each segment host. Also, the host name used in the URI
must match the segment host name as registered in the
<literal>gp_configuration</literal> system catalog table.
</para>
<para>
http — If using the <literal>http://</literal> protocol the external data file(s)
must reside on a web server that is accessible by the Greenplum segment hosts.
The number of URIs specified corresponds to the number of segment instances
that will work in parallel to access the external table. So for example,
if you have a Greenplum Database system with 8 primary segments and you specify 2
external files, only 2 of the 8 segments will access the external table in
parallel at query time.
</para>
<para>
The <literal>FORMAT</literal> clause is used to describe how the external
table files are formatted. The files can be in either plain text (TEXT)
or comma separated values (CSV) format. If the data in the file does not
use the default column delimiter, escape character, null string and so on,
you must specify the additional formatting options so that the data in
the external file is read correctly by Greenplum Database.
</para>
</refsect1>
<refsect1>
<title>Parameters</title>
<variablelist>
<varlistentry>
<term><replaceable class="PARAMETER">table_name</replaceable></term>
<listitem>
<para>
The name of the new external table.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable class="PARAMETER">column_name</replaceable></term>
<listitem>
<para>
The name of a column to create in the external table.
Unlike regular tables, external tables do not have column
constraints or default values, so do not specify those.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><replaceable class="PARAMETER">data_type</replaceable></term>
<listitem>
<para>
The data type of the column.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>LOCATION</literal></term>
<listitem>
<para>
Specifies the URI of the external data source(s) to be used to populate the external table.
If the host name is omitted, localhost is assumed. If port is omitted for http and gpfdist protocols,
port 8080 is assumed.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>FORMAT</literal></term>
<listitem>
<para>
Specifies the format of the external file(s) - either plain text
(<literal>TEXT</literal>) or comma separated values (<literal>CSV</literal>) format.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>DELIMITER</literal></term>
<listitem>
<para>
Specifies a single character that separates columns within
each row (line) of the external file. The default
is a tab character in TEXT mode, a comma in CSV mode.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>NULL</literal></term>
<listitem>
<para>
Specifies the string that represents a null value.
The default is \N (backslash-N) in TEXT mode, and an
empty value with no quotations in CSV mode. You might
prefer an empty string even in TEXT mode for cases where
you do not want to distinguish nulls from empty strings.
When using external tables, any data item that matches this
string will be considered a null value.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>NULL</literal></term>
<listitem>
<para>
Specifies the single character that is used for C escape sequences
(such as \n,\t,\100, and so on) and for escaping data characters
that might otherwise be taken as row or column delimiters. Make
sure to choose an escape character that is not used anywhere in
your actual column data. The default escape character is a
\ (backslash), however it is possible to specify any other
character to represent an escape. It is also possible to
disable escaping by specifying the value <literal>OFF</literal> as the escape value.
This is very useful for data such as web log data that has many
embedded backslashes that are not intended to be escapes.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>HEADER</literal></term>
<listitem>
<para>
For CSV formatted files, specifies that the first line in the
external file(s) is a header row (contains the names of
the table columns) and should not be included as data for the
external table. If using multiple external files, all files must
have a header row. If using the <literal>gpfdist://</literal> protocol,
do not use the <literal>HEADER</literal> clause. Instead,
specify the header (<literal>-h</literal>) option when
starting the <literal>gpfdist</literal> utility.
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>QUOTE</literal></term>
<listitem>
<para>
Specifies the quotation character for CSV mode. The default is double-quote (").
</para>
</listitem>
</varlistentry>
<varlistentry>
<term><literal>FORCE NOT NULL</literal></term>
<listitem>
<para>
In CSV mode, processes each specified column as though
it were quoted and hence not a NULL value. For the default
null string in CSV mode (nothing between two delimiters),
this causes missing values to be evaluated as zero-length strings.
</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1 id="SQL-CREATEEXTERNALTABLE-notes">
<title>Notes</title>
<para>
There is a system view named <literal>pg_max_external_files</literal> that you can use
to determine how many external table files are permitted per external table.
This view lists the available file slots per segment host (if using the <literal>file://</literal> protocol).
For example: <literal>SELECT * FROM pg_max_external_files;</literal>
</para>
<para>
During dump/restore operations, only external table definitions will be backed up
and restored. The data files will not be included. You can use <command>CREATE TABLE AS</command>,
<command>SELECT INTO</command>, or <command>INSERT...SELECT</command> to load external table
data files into another (non-external) database table, and the data will be
loaded in parallel according to the external table definition.
</para>
<para>
If an external table file has a data error, any operation that reads from the
external table will fail. Similar to <command>COPY</command>, loading from
external tables is an all or nothing operation.
</para>
</refsect1>
<refsect1 id="SQL-CREATEEXTERNALTABLE-examples">
<title>Examples</title>
<para>
Start the gpfdist file server program in the background on the
Greenplum master host on port 8081 serving files from
directory /home/staging:
<programlisting>
gpfdist -p 8081 -d /var/data/staging &
</programlisting>
</para>
<para>
Create an external table named <structname>ext_customer</> using
the <literal>gpfdist</> protocol and a text formatted file
named <structname>customers.txt</> with a pipe (|) as the
column delimiter and an empty space as null:
<programlisting>
CREATE EXTERNAL TABLE ext_customer (id int, name text, sponsor text)
LOCATION ( 'gpfdist://*.txt' )
FORMAT 'TEXT' ( DELIMITER '|' NULL ' ');
<programlisting>
</para>
<para>
Create an external table named <structname>ext_expenses</> using
the <literal>file</> protocol and several CSV formatted files
that have a header row:
<programlisting>
CREATE EXTERNAL TABLE ext_expenses (name text, date date, amount float4, category text, description text)
LOCATION (
'file://seghost1/dbfast/external/expenses1.csv',
'file://seghost1/dbfast/external/expenses2.csv',
'file://seghost2/dbfast/external/expenses3.csv',
'file://seghost2/dbfast/external/expenses4.csv',
'file://seghost3/dbfast/external/expenses5.csv',
'file://seghost3/dbfast/external/expenses6.csv',
)
FORMAT 'CSV' ( HEADER );
</programlisting>
</para>
</refsect1>
<refsect1 id="SQL-CREATEEXTERNALTABLE-compatibility">
<title id="SQL-CREATEEXTERNALTABLE-compatibility-title">Compatibility</title>
<para>
The <command>CREATE EXTERNAL TABLE</command> is a Greenplum Database extension.
The <acronym>SQL</acronym> standard makes no provisions for external tables.
</para>
</refsect1>
<refsect1>
<title>See Also</title>
<simplelist type="inline">
<member><xref linkend="sql-createtableas" endterm="sql-createtableas-title"></member>
<member><xref linkend="sql-copy" endterm="sql-copy-title"></member>
<member><xref linkend="sql-selectinto" endterm="sql-selectinto-title"></member>
</simplelist>
</refsect1>
</refentry>