| -- test single row error handling, both in COPY and external tables. |
| -- |
| -- # different errors depending on dispatch timings (missing data vs |
| -- # invalid input syntax) |
| -- |
| -- start_matchsubs |
| -- |
| -- m/CONTEXT\:\s+COPY sreh_copy\,\s+line/ |
| -- s/line \d+(.*)/line SOME_LINE/ |
| -- |
| -- m/DETAIL: Failing row contains \(.*\)/ |
| -- s/DETAIL: Failing row contains \(.*\)/DETAIL: Failing row contains (#####)/ |
| -- |
| -- end_matchsubs |
| -- ########################################################### |
| -- COPY |
| -- ########################################################### |
| CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a); |
| -- |
| -- ROW reject limit only |
| -- |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' SEGMENT REJECT LIMIT 1000; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT * FROM sreh_copy ORDER BY a,b,c; |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| -- |
| -- ROW reject limit only - low value that gets reached |
| -- |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' SEGMENT REJECT LIMIT 2; |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "eleven", column a |
| CONTEXT: COPY sreh_copy, line 11, column a: "eleven" |
| SELECT * FROM sreh_copy ORDER BY a,b,c; |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| -- |
| -- error logs |
| -- |
| DROP TABLE IF EXISTS sreh_copy; CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a); |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATEVER SEGMENT REJECT LIMIT 1000; |
| ERROR: error table is not supported |
| LINE 1: ...ess/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATE... |
| ^ |
| HINT: Set gp_ignore_error_table to ignore the [INTO error-table] clause for backward compatibility. |
| SET gp_ignore_error_table=true; |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS INTO WHATEVER SEGMENT REJECT LIMIT 1000; |
| WARNING: error table is not supported |
| HINT: Use gp_read_error_log() and gp_truncate_error_log() to view and manage the internal error log associated with your table. |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT * FROM sreh_copy ORDER BY a,b,c; |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| WITH error_log AS (SELECT gp_read_error_log('sreh_copy')) select count(*) from error_log; |
| count |
| ------- |
| 10 |
| (1 row) |
| |
| -- |
| -- error logs - do the same thing again. this time error logs exist and should get data appended |
| -- |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data1.data' DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT * FROM sreh_copy ORDER BY a,b,c; |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| 14 | 14 | 14 |
| (12 rows) |
| |
| SELECT linenum, rawdata FROM gp_read_error_log('sreh_copy') ORDER BY linenum; |
| linenum | rawdata |
| ---------+--------- |
| 2 | 2 |
| 2 | 2 |
| 3 | 3| |
| 3 | 3| |
| 4 | 4|4|4|4 |
| 4 | 4|4|4|4 |
| 7 | 7|7|bad |
| 7 | 7|7|bad |
| 8 | bad|8|8 |
| 8 | bad|8|8 |
| 11 | eleven |
| 11 | eleven |
| 12 | 12|||| |
| 12 | 12|||| |
| 13 | 13|,13, |
| 13 | 13|,13, |
| 15 | 15 |
| 15 | 15 |
| 16 | sixteen |
| 16 | sixteen |
| (20 rows) |
| |
| -- |
| -- error logs - do the same thing again. this time use data from STDIN (should show in error logs) |
| -- |
| DROP TABLE IF EXISTS sreh_copy; CREATE TABLE sreh_copy(a int, b int, c int) distributed by(a); |
| COPY sreh_copy FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000; |
| NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data |
| SELECT * FROM sreh_copy ORDER BY a,b,c; |
| a | b | c |
| -----+-----+----- |
| 100 | 100 | 100 |
| 300 | 300 | 300 |
| (2 rows) |
| |
| WITH error_log AS (SELECT gp_read_error_log('sreh_copy')) select count(*) from error_log; |
| count |
| ------- |
| 2 |
| (1 row) |
| |
| -- |
| -- constraint errors - data is rolled back (CHECK) |
| -- |
| CREATE TABLE sreh_constr(a int check (a > 10)); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| COPY sreh_constr FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000; |
| ERROR: new row for relation "sreh_constr" violates check constraint "sreh_constr_a_check" |
| DETAIL: Failing row contains (10). |
| CONTEXT: COPY sreh_constr, line 4 |
| WITH error_log AS (SELECT gp_read_error_log('sreh_constr')) select count(*) from error_log; |
| count |
| ------- |
| 1 |
| (1 row) |
| |
| SELECT * FROM sreh_constr; -- should exist and be empty |
| a |
| --- |
| (0 rows) |
| |
| DROP TABLE sreh_constr; |
| -- |
| -- constraint errors - data is rolled back from target table (UNIQUE) |
| -- |
| CREATE TABLE sreh_constr(a int unique); |
| COPY sreh_constr FROM STDIN DELIMITER '|' LOG ERRORS SEGMENT REJECT LIMIT 1000; |
| ERROR: duplicate key value violates unique constraint "sreh_constr_a_key" (seg0 @hostname@:11001 pid=68456) |
| DETAIL: Key (a)=(9) already exists. |
| CONTEXT: COPY sreh_constr, line 5: "9" |
| WITH error_log AS (SELECT gp_read_error_log('sreh_constr')) select count(*) from error_log; |
| count |
| ------- |
| 1 |
| (1 row) |
| |
| SELECT * FROM sreh_constr; -- should exist and be empty |
| a |
| --- |
| (0 rows) |
| |
| -- |
| -- test PERCENT reject limit logic with threshold 100. |
| -- (we don't test the default threshold (300) yet. we could if needed). |
| -- bad records in bad_data3.data (300 rows): 5,15,55,56,107,136,160,213,214,286 |
| -- this data file is made with errors that will occur only on the QD (with COPY) |
| -- so the percent calculation should always be the same regardless of number of |
| -- QE's in the system. |
| -- |
| set gp_reject_percent_threshold = 100; |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 10 PERCENT; --pass |
| NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 2 PERCENT; --fail |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a |
| CONTEXT: COPY sreh_copy, line 107, column a |
| -- |
| -- test PERCENT reject limit logic with custom threshold 10 (only practical for test purposes) |
| -- |
| set gp_reject_percent_threshold = 10; |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 10 PERCENT; --fail |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a |
| CONTEXT: COPY sreh_copy, line 15, column a |
| COPY sreh_copy FROM '@abs_srcdir@/data/bad_data3.data' DELIMITER '|' SEGMENT REJECT LIMIT 20 PERCENT; --pass |
| NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data |
| -- MPP-2933 (multiple dist-key attr conversion errors) |
| create table t2933 (col1 varchar(3) NULL , col2 char(1) NULL, col3 varchar(4) NULL, col4 char(1) NULL, col5 varchar(20) NULL) |
| Distributed By (col1, col2, col3, col4); |
| COPY t2933 FROM STDIN DELIMITER '|' SEGMENT REJECT LIMIT 10; |
| NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data |
| -- cleanup |
| DROP TABLE sreh_copy; |
| DROP TABLE sreh_constr; |
| -- ########################################################### |
| -- External Tables |
| -- ########################################################### |
| CREATE EXTERNAL WEB TABLE gpfdist_sreh_start (x text) |
| execute E'((@bindir@/gpfdist -p 8080 -d @abs_srcdir@/data </dev/null >/dev/null 2>&1 &); for i in `seq 1 30`; do curl 127.0.0.1:8080 >/dev/null 2>&1 && break; sleep 1; done; echo "starting...") ' |
| on MASTER |
| FORMAT 'text' (delimiter '|'); |
| CREATE EXTERNAL WEB TABLE gpfdist_sreh_stop (x text) |
| execute E'(pkill gpfdist || killall gpfdist) > /dev/null 2>&1; echo "stopping..."' |
| on MASTER |
| FORMAT 'text' (delimiter '|'); |
| select * from gpfdist_sreh_stop; |
| x |
| ------------- |
| stopping... |
| (1 row) |
| |
| select * from gpfdist_sreh_start; |
| x |
| ------------- |
| starting... |
| (1 row) |
| |
| -- create a target table to load into |
| CREATE TABLE sreh_target(a int, b int, c int) distributed by(a); |
| -- |
| -- reject limit only |
| -- |
| CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' ) |
| FORMAT 'text' (delimiter '|') |
| SEGMENT REJECT LIMIT 10000; |
| SELECT * FROM sreh_ext; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| INSERT INTO sreh_target SELECT * FROM sreh_ext; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT count(*) FROM sreh_target; |
| count |
| ------- |
| 6 |
| (1 row) |
| |
| TRUNCATE sreh_target; |
| DROP EXTERNAL TABLE sreh_ext; |
| -- |
| -- reject limit only - low value that gets reached |
| -- |
| CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' ) |
| FORMAT 'text' (delimiter '|') |
| SEGMENT REJECT LIMIT 2; |
| SELECT * FROM sreh_ext ORDER BY a; |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "", column b (seg1 slice1 @hostname@:11002 pid=68457) |
| CONTEXT: External table sreh_ext, line 3 of gpfdist://@hostname@:8080/bad_data1.data: "3|" |
| INSERT INTO sreh_target SELECT * FROM sreh_ext; |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "", column b (seg1 slice1 @hostname@:11002 pid=68459) |
| CONTEXT: External table sreh_ext, line 3 of gpfdist://@hostname@:8080/bad_data1.data: "3|" |
| SELECT count(*) FROM sreh_target; |
| count |
| ------- |
| 0 |
| (1 row) |
| |
| TRUNCATE sreh_target; |
| DROP EXTERNAL TABLE sreh_ext; |
| -- |
| -- error logs |
| -- |
| CREATE EXTERNAL TABLE sreh_ext_err_tbl(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' ) |
| FORMAT 'text' (delimiter '|') |
| LOG ERRORS INTO WHATEVER |
| SEGMENT REJECT LIMIT 1000; |
| WARNING: error table is not supported |
| HINT: Use gp_read_error_log() and gp_truncate_error_log() to view and manage the internal error log associated with your table. |
| SELECT * FROM sreh_ext_err_tbl ORDER BY a; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| -- Verify the fields that we easily can. |
| WITH error_log AS (SELECT * FROM gp_read_error_log('sreh_ext_err_tbl')) |
| SELECT relname, filename LIKE 'gpfdist://@hostname@:8080/bad_data1.data [%]' as filename_ok, linenum, errmsg from error_log; |
| relname | filename_ok | linenum | errmsg |
| ------------------+-------------+---------+------------------------------------------------------------ |
| sreh_ext_err_tbl | t | 2 | missing data for column "b" |
| sreh_ext_err_tbl | t | 3 | invalid input syntax for type integer: "", column b |
| sreh_ext_err_tbl | t | 4 | extra data after last expected column, column b |
| sreh_ext_err_tbl | t | 7 | invalid input syntax for type integer: "bad", column c |
| sreh_ext_err_tbl | t | 8 | invalid input syntax for type integer: "bad", column a |
| sreh_ext_err_tbl | t | 11 | invalid input syntax for type integer: "eleven", column a |
| sreh_ext_err_tbl | t | 12 | extra data after last expected column, column a |
| sreh_ext_err_tbl | t | 13 | invalid input syntax for type integer: ",13,", column b |
| sreh_ext_err_tbl | t | 15 | missing data for column "b" |
| sreh_ext_err_tbl | t | 16 | invalid input syntax for type integer: "sixteen", column a |
| (10 rows) |
| |
| CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' ) |
| FORMAT 'text' (delimiter '|') |
| LOG ERRORS |
| SEGMENT REJECT LIMIT 1000; |
| SELECT * FROM sreh_ext ORDER BY a; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| WITH error_log AS (SELECT gp_read_error_log('sreh_ext')) select count(*) from error_log; |
| count |
| ------- |
| 10 |
| (1 row) |
| |
| INSERT INTO sreh_target SELECT * FROM sreh_ext; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT count(*) FROM sreh_target; |
| count |
| ------- |
| 6 |
| (1 row) |
| |
| TRUNCATE sreh_target; |
| -- |
| -- error logs - do the same thing again. this time error logs exist and should get data appended |
| -- |
| SELECT * FROM sreh_ext ORDER BY a; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| a | b | c |
| ----+----+---- |
| 1 | 1 | 1 |
| 5 | 5 | 5 |
| 6 | 6 | 6 |
| 9 | 9 | 9 |
| 10 | 10 | 10 |
| 14 | 14 | 14 |
| (6 rows) |
| |
| SELECT linenum, rawdata FROM gp_read_error_log('sreh_ext') ORDER BY linenum; |
| linenum | rawdata |
| ---------+--------- |
| 2 | 2 |
| 2 | 2 |
| 2 | 2 |
| 3 | 3| |
| 3 | 3| |
| 3 | 3| |
| 4 | 4|4|4|4 |
| 4 | 4|4|4|4 |
| 4 | 4|4|4|4 |
| 7 | 7|7|bad |
| 7 | 7|7|bad |
| 7 | 7|7|bad |
| 8 | bad|8|8 |
| 8 | bad|8|8 |
| 8 | bad|8|8 |
| 11 | eleven |
| 11 | eleven |
| 11 | eleven |
| 12 | 12|||| |
| 12 | 12|||| |
| 12 | 12|||| |
| 13 | 13|,13, |
| 13 | 13|,13, |
| 13 | 13|,13, |
| 15 | 15 |
| 15 | 15 |
| 15 | 15 |
| 16 | sixteen |
| 16 | sixteen |
| 16 | sixteen |
| (30 rows) |
| |
| INSERT INTO sreh_target SELECT * FROM sreh_ext; |
| NOTICE: found 10 data formatting errors (10 or more input rows), rejected related input data |
| SELECT count(*) FROM sreh_target; |
| count |
| ------- |
| 6 |
| (1 row) |
| |
| TRUNCATE sreh_target; |
| DROP EXTERNAL TABLE sreh_ext; |
| DROP EXTERNAL TABLE sreh_ext_err_tbl; |
| -- |
| -- constraint errors - data is rolled back from both target and error tables (CHECK) |
| -- |
| CREATE TABLE sreh_constr(a int, b int, c int check (c < 10)); |
| NOTICE: Table doesn't have 'DISTRIBUTED BY' clause -- Using column named 'a' as the Apache Cloudberry data distribution key for this table. |
| HINT: The 'DISTRIBUTED BY' clause determines the distribution of data. Make sure column(s) chosen are the optimal data distribution key to minimize skew. |
| CREATE EXTERNAL TABLE sreh_ext(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data1.data' ) |
| FORMAT 'text' (delimiter '|') |
| LOG ERRORS |
| SEGMENT REJECT LIMIT 1000; |
| INSERT INTO sreh_constr SELECT * FROM sreh_ext; |
| ERROR: new row for relation "sreh_constr" violates check constraint "sreh_constr_c_check" (seg0 172.17.0.2:25432 pid=161224) |
| DETAIL: Failing row contains (14, 14, 14). |
| SELECT linenum, rawdata FROM gp_read_error_log('sreh_constr') ORDER BY linenum; |
| linenum | rawdata |
| ---------+--------- |
| (0 rows) |
| |
| SELECT COUNT(*) FROM sreh_constr; -- should be empty |
| count |
| ------- |
| 0 |
| (1 row) |
| |
| -- |
| -- test PERCENT reject limit logic with threshold 100. |
| -- (we don't test the default threshold (300) yet. we could if needed). |
| -- bad records in bad_data3.data (300 rows): 5,15,55,56,107,136,160,213,214,286 |
| -- in here we assume that gpfdist will distribute all the data to one QE (since |
| -- it is much smaller than the data buffer it uses) so the percent calculation |
| -- should always be the same regardless of number of QE's in the system. |
| -- |
| set gp_reject_percent_threshold = 100; |
| CREATE EXTERNAL TABLE sreh_ext_10percent(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' ) |
| FORMAT 'text' (delimiter '|') |
| SEGMENT REJECT LIMIT 10 PERCENT; |
| SELECT count(*) FROM sreh_ext_10percent; -- pass |
| NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data |
| count |
| ------- |
| 291 |
| (1 row) |
| |
| CREATE EXTERNAL TABLE sreh_ext_2percent(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' ) |
| FORMAT 'text' (delimiter '|') |
| SEGMENT REJECT LIMIT 2 PERCENT; |
| SELECT count(*) FROM sreh_ext_2percent; -- fail |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a (seg0 slice1 @hostname@:11001 pid=68456) |
| CONTEXT: External table sreh_ext_2percent, line 107 of gpfdist://@hostname@:8080/bad_data3.data, column a |
| -- |
| -- test PERCENT reject limit logic with custom threshold 10 (only practical for test purposes) |
| -- |
| set gp_reject_percent_threshold = 10; |
| SELECT count(*) FROM sreh_ext_10percent; -- fail |
| ERROR: segment reject limit reached, aborting operation |
| DETAIL: Last error was: invalid input syntax for type integer: "BAD", column a (seg0 slice1 @hostname@:11001 pid=68456) |
| CONTEXT: External table sreh_ext_10percent, line 15 of gpfdist://@hostname@:8080/bad_data3.data, column a |
| CREATE EXTERNAL TABLE sreh_ext_20percent(a int, b int, c int) |
| LOCATION ('gpfdist://@hostname@:8080/bad_data3.data' ) |
| FORMAT 'text' (delimiter '|') |
| SEGMENT REJECT LIMIT 20 PERCENT; |
| SELECT count(*) FROM sreh_ext_20percent; -- pass |
| NOTICE: found 9 data formatting errors (9 or more input rows), rejected related input data |
| count |
| ------- |
| 291 |
| (1 row) |
| |
| set gp_reject_percent_threshold = 300; -- reset |
| -- cleanup |
| select * from gpfdist_sreh_stop; |
| x |
| ------------- |
| stopping... |
| (1 row) |
| |
| DROP EXTERNAL WEB TABLE gpfdist_sreh_stop; |
| DROP EXTERNAL WEB TABLE gpfdist_sreh_start; |
| DROP EXTERNAL TABLE sreh_ext; |
| DROP EXTERNAL TABLE sreh_ext_2percent; |
| DROP EXTERNAL TABLE sreh_ext_10percent; |
| DROP EXTERNAL TABLE sreh_ext_20percent; |
| DROP TABLE sreh_target; |
| DROP TABLE sreh_constr; |