blob: 99c870d952a8094ecc97a93a989070815cb2d153 [file]
-- test single row error handling, both in COPY and external tables.
--
-- # different errors depending on dispatch timings (missing data vs
-- # invalid input syntax)
--
-- start_matchsubs
--
-- m/CONTEXT\:\s+COPY sreh_copy\,\s+line/
-- s/line \d+(.*)/line SOME_LINE/
--
-- m/DETAIL: Failing row contains \(.*\)/
-- s/DETAIL: Failing row contains \(.*\)/DETAIL: Failing row contains (#####)/
--
-- end_matchsubs
-- ###########################################################
-- COPY
-- ###########################################################
CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a);
--
-- ROW reject limit only
--
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' SEGMENT REJECT LIMIT 1000;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT * FROM sreh_copy ORDER BY a,b,c;
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
--
-- ROW reject limit only - low value that gets reached
--
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' SEGMENT REJECT LIMIT 2;
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "eleven", column a
CONTEXT: COPY sreh_copy, line 11, column a: "eleven"
SELECT * FROM sreh_copy ORDER BY a,b,c;
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
--
-- error logs
--
DROP TABLE IF EXISTS sreh_copy; CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a);
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATEVER SEGMENT REJECT LIMIT 1000;
ERROR: error table is not supported
LINE 1: ...ess/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATE...
^
HINT: Set gp_ignore_error_table to ignore the [INTO error-table] clause for backward compatibility.
SET gp_ignore_error_table=true;
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATEVER SEGMENT REJECT LIMIT 1000;
WARNING: error table is not supported
HINT: Use gp_read_error_log() and gp_truncate_error_log() to view and manage the internal error log associated with your table.
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT * FROM sreh_copy ORDER BY a,b,c;
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
WITH error_log AS (SELECT gp_read_error_log('sreh_copy')) select count(*) from error_log;
count
-------
10
(1 row)
--
-- error logs - do the same thing again. this time error logs exist and should get data appended
--
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT * FROM sreh_copy ORDER BY a,b,c;
a | b | c
----+----+----
1 | 1 | 1
1 | 1 | 1
5 | 5 | 5
5 | 5 | 5
6 | 6 | 6
6 | 6 | 6
9 | 9 | 9
9 | 9 | 9
10 | 10 | 10
10 | 10 | 10
14 | 14 | 14
14 | 14 | 14
(12 rows)
SELECT linenum, rawdata FROM gp_read_error_log('sreh_copy') ORDER BY linenum;
linenum | rawdata
---------+---------
2 | 2
2 | 2
3 | 3|
3 | 3|
4 | 4|4|4|4
4 | 4|4|4|4
7 | 7|7|bad
7 | 7|7|bad
8 | bad|8|8
8 | bad|8|8
11 | eleven
11 | eleven
12 | 12||||
12 | 12||||
13 | 13|,13,
13 | 13|,13,
15 | 15
15 | 15
16 | sixteen
16 | sixteen
(20 rows)
--
-- error logs - do the same thing again. this time use data from STDIN (should show in error logs)
--
DROP TABLE IF EXISTS sreh_copy; CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a);
COPY sreh_copy FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000;
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
SELECT * FROM sreh_copy ORDER BY a,b,c;
a | b | c
-----+-----+-----
100 | 100 | 100
300 | 300 | 300
(2 rows)
WITH error_log AS (SELECT gp_read_error_log('sreh_copy')) select count(*) from error_log;
count
-------
2
(1 row)
--
-- constraint errors - data is rolled back (CHECK)
--
CREATE TABLE sreh_constr(a int check (a > 10));
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
COPY sreh_constr FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000;
ERROR: new row for relation "sreh_constr" violates check constraint "sreh_constr_a_check"
DETAIL: Failing row contains (10).
CONTEXT: COPY sreh_constr, line 4
WITH error_log AS (SELECT gp_read_error_log('sreh_constr')) select count(*) from error_log;
count
-------
1
(1 row)
SELECT * FROM sreh_constr; -- should exist and be empty
a
---
(0 rows)
DROP TABLE sreh_constr;
--
-- constraint errors - data is rolled back from target table (UNIQUE)
--
CREATE TABLE sreh_constr(a int unique);
COPY sreh_constr FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000;
ERROR: duplicate key value violates unique constraint "sreh_constr_a_key" (seg0 @hostname@:11001 pid=68456)
DETAIL: Key (a)=(9) already exists.
CONTEXT: COPY sreh_constr, line 5: "9"
WITH error_log AS (SELECT gp_read_error_log('sreh_constr')) select count(*) from error_log;
count
-------
1
(1 row)
SELECT * FROM sreh_constr; -- should exist and be empty
a
---
(0 rows)
--
-- test PERCENT reject limit logic with threshold 100.
-- (we don't test the default threshold (300) yet. we could if needed).
-- bad records in bad_data3.data (300 rows): 5,15,55,56,107,136,160,213,214,286
-- this data file is made with errors that will occur only on the QD (with COPY)
-- so the percent calculation should always be the same regardless of number of
-- QE's in the system.
--
set gp_reject_percent_threshold = 100;
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 10 PERCENT; --pass
NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 2 PERCENT; --fail
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a
CONTEXT: COPY sreh_copy, line 107, column a
--
-- test PERCENT reject limit logic with custom threshold 10 (only practical for test purposes)
--
set gp_reject_percent_threshold = 10;
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 10 PERCENT; --fail
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a
CONTEXT: COPY sreh_copy, line 15, column a
COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 20 PERCENT; --pass
NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data
-- MPP-2933 (multiple dist-key attr conversion errors)
create table t2933 (col1 varchar(3) NULL , col2 char(1) NULL, col3 varchar(4) NULL, col4 char(1) NULL, col5 varchar(20) NULL)
Distributed By (col1, col2, col3, col4);
COPY t2933 FROM STDIN DELIMITER '|' SEGMENT REJECT LIMIT 10;
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
-- cleanup
DROP TABLE sreh_copy;
DROP TABLE sreh_constr;
-- ###########################################################
-- External Tables
-- ###########################################################
CREATE EXTERNAL WEB TABLE gpfdist_sreh_start (x text)
execute E'((@bindir@/gpfdist -p 8080 -d @abs_srcdir@/data </dev/null >/dev/null 2>&1 &); for i in `seq 1 30`; do curl 127.0.0.1:8080 >/dev/null 2>&1 && break; sleep 1; done; echo "starting...") '
on MASTER
FORMAT 'text' (delimiter '|');
CREATE EXTERNAL WEB TABLE gpfdist_sreh_stop (x text)
execute E'(pkill gpfdist || killall gpfdist) > /dev/null 2>&1; echo "stopping..."'
on MASTER
FORMAT 'text' (delimiter '|');
select * from gpfdist_sreh_stop;
x
-------------
stopping...
(1 row)
select * from gpfdist_sreh_start;
x
-------------
starting...
(1 row)
-- create a target table to load into
CREATE TABLE sreh_target(a int, b int, c int) distributed by(a);
--
-- reject limit only
--
CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' )
FORMAT 'text' (delimiter '|')
SEGMENT REJECT LIMIT 10000;
SELECT * FROM sreh_ext;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
INSERT INTO sreh_target SELECT * FROM sreh_ext;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT count(*) FROM sreh_target;
count
-------
6
(1 row)
TRUNCATE sreh_target;
DROP EXTERNAL TABLE sreh_ext;
--
-- reject limit only - low value that gets reached
--
CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' )
FORMAT 'text' (delimiter '|')
SEGMENT REJECT LIMIT 2;
SELECT * FROM sreh_ext ORDER BY a;
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "", column b (seg1 slice1 @hostname@:11002 pid=68457)
CONTEXT: External table sreh_ext, line 3 of gpfdist://@hostname@:8080/bad_data1.data: "3|"
INSERT INTO sreh_target SELECT * FROM sreh_ext;
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "", column b (seg1 slice1 @hostname@:11002 pid=68459)
CONTEXT: External table sreh_ext, line 3 of gpfdist://@hostname@:8080/bad_data1.data: "3|"
SELECT count(*) FROM sreh_target;
count
-------
0
(1 row)
TRUNCATE sreh_target;
DROP EXTERNAL TABLE sreh_ext;
--
-- error logs
--
CREATE EXTERNAL TABLE sreh_ext_err_tbl(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' )
FORMAT 'text' (delimiter '|')
LOG ERRORS INTO WHATEVER
SEGMENT REJECT LIMIT 1000;
WARNING: error table is not supported
HINT: Use gp_read_error_log() and gp_truncate_error_log() to view and manage the internal error log associated with your table.
SELECT * FROM sreh_ext_err_tbl ORDER BY a;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
-- Verify the fields that we easily can.
WITH error_log AS (SELECT * FROM gp_read_error_log('sreh_ext_err_tbl'))
SELECT relname, filename LIKE 'gpfdist://@hostname@:8080/bad_data1.data [%]' as filename_ok, linenum, errmsg from error_log;
relname | filename_ok | linenum | errmsg
------------------+-------------+---------+------------------------------------------------------------
sreh_ext_err_tbl | t | 2 | missing data for column "b"
sreh_ext_err_tbl | t | 3 | invalid input syntax for type integer: "", column b
sreh_ext_err_tbl | t | 4 | extra data after last expected column, column b
sreh_ext_err_tbl | t | 7 | invalid input syntax for type integer: "bad", column c
sreh_ext_err_tbl | t | 8 | invalid input syntax for type integer: "bad", column a
sreh_ext_err_tbl | t | 11 | invalid input syntax for type integer: "eleven", column a
sreh_ext_err_tbl | t | 12 | extra data after last expected column, column a
sreh_ext_err_tbl | t | 13 | invalid input syntax for type integer: ",13,", column b
sreh_ext_err_tbl | t | 15 | missing data for column "b"
sreh_ext_err_tbl | t | 16 | invalid input syntax for type integer: "sixteen", column a
(10 rows)
CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' )
FORMAT 'text' (delimiter '|')
LOG ERRORS
SEGMENT REJECT LIMIT 1000;
SELECT * FROM sreh_ext ORDER BY a;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
WITH error_log AS (SELECT gp_read_error_log('sreh_ext')) select count(*) from error_log;
count
-------
10
(1 row)
INSERT INTO sreh_target SELECT * FROM sreh_ext;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT count(*) FROM sreh_target;
count
-------
6
(1 row)
TRUNCATE sreh_target;
--
-- error logs - do the same thing again. this time error logs exist and should get data appended
--
SELECT * FROM sreh_ext ORDER BY a;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
a | b | c
----+----+----
1 | 1 | 1
5 | 5 | 5
6 | 6 | 6
9 | 9 | 9
10 | 10 | 10
14 | 14 | 14
(6 rows)
SELECT linenum, rawdata FROM gp_read_error_log('sreh_ext') ORDER BY linenum;
linenum | rawdata
---------+---------
2 | 2
2 | 2
2 | 2
3 | 3|
3 | 3|
3 | 3|
4 | 4|4|4|4
4 | 4|4|4|4
4 | 4|4|4|4
7 | 7|7|bad
7 | 7|7|bad
7 | 7|7|bad
8 | bad|8|8
8 | bad|8|8
8 | bad|8|8
11 | eleven
11 | eleven
11 | eleven
12 | 12||||
12 | 12||||
12 | 12||||
13 | 13|,13,
13 | 13|,13,
13 | 13|,13,
15 | 15
15 | 15
15 | 15
16 | sixteen
16 | sixteen
16 | sixteen
(30 rows)
INSERT INTO sreh_target SELECT * FROM sreh_ext;
NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data
SELECT count(*) FROM sreh_target;
count
-------
6
(1 row)
TRUNCATE sreh_target;
DROP EXTERNAL TABLE sreh_ext;
DROP EXTERNAL TABLE sreh_ext_err_tbl;
--
-- constraint errors - data is rolled back from both target and error tables (CHECK)
--
CREATE TABLE sreh_constr(a int, b int, c int check (c < 10));
NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table.
HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew.
CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' )
FORMAT 'text' (delimiter '|')
LOG ERRORS
SEGMENT REJECT LIMIT 1000;
INSERT INTO sreh_constr SELECT * FROM sreh_ext;
ERROR: new row for relation "sreh_constr" violates check constraint "sreh_constr_c_check" (seg0 172.17.0.2:25432 pid=161224)
DETAIL: Failing row contains (14, 14, 14).
SELECT linenum, rawdata FROM gp_read_error_log('sreh_constr') ORDER BY linenum;
linenum | rawdata
---------+---------
(0 rows)
SELECT COUNT(*) FROM sreh_constr; -- should be empty
count
-------
0
(1 row)
--
-- test PERCENT reject limit logic with threshold 100.
-- (we don't test the default threshold (300) yet. we could if needed).
-- bad records in bad_data3.data (300 rows): 5,15,55,56,107,136,160,213,214,286
-- in here we assume that gpfdist will distribute all the data to one QE (since
-- it is much smaller than the data buffer it uses) so the percent calculation
-- should always be the same regardless of number of QE's in the system.
--
set gp_reject_percent_threshold = 100;
CREATE EXTERNAL TABLE sreh_ext_10percent(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' )
FORMAT 'text' (delimiter '|')
SEGMENT REJECT LIMIT 10 PERCENT;
SELECT count(*) FROM sreh_ext_10percent; -- pass
NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data
count
-------
291
(1 row)
CREATE EXTERNAL TABLE sreh_ext_2percent(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' )
FORMAT 'text' (delimiter '|')
SEGMENT REJECT LIMIT 2 PERCENT;
SELECT count(*) FROM sreh_ext_2percent; -- fail
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a (seg0 slice1 @hostname@:11001 pid=68456)
CONTEXT: External table sreh_ext_2percent, line 107 of gpfdist://@hostname@:8080/bad_data3.data, column a
--
-- test PERCENT reject limit logic with custom threshold 10 (only practical for test purposes)
--
set gp_reject_percent_threshold = 10;
SELECT count(*) FROM sreh_ext_10percent; -- fail
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a (seg0 slice1 @hostname@:11001 pid=68456)
CONTEXT: External table sreh_ext_10percent, line 15 of gpfdist://@hostname@:8080/bad_data3.data, column a
CREATE EXTERNAL TABLE sreh_ext_20percent(a int, b int, c int)
LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' )
FORMAT 'text' (delimiter '|')
SEGMENT REJECT LIMIT 20 PERCENT;
SELECT count(*) FROM sreh_ext_20percent; -- pass
NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data
count
-------
291
(1 row)
set gp_reject_percent_threshold = 300; -- reset
-- cleanup
select * from gpfdist_sreh_stop;
x
-------------
stopping...
(1 row)
DROP EXTERNAL WEB TABLE gpfdist_sreh_stop;
DROP EXTERNAL WEB TABLE gpfdist_sreh_start;
DROP EXTERNAL TABLE sreh_ext;
DROP EXTERNAL TABLE sreh_ext_2percent;
DROP EXTERNAL TABLE sreh_ext_10percent;
DROP EXTERNAL TABLE sreh_ext_20percent;
DROP TABLE sreh_target;
DROP TABLE sreh_constr;