blob: 3beceb6b2f36be9280a7ec9cc83afbb3245cd0a0 [file]
--
-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
--
-- Regression test for https://github.com/apache/cloudberry/issues/1425
-- COPY FROM should correctly count encoding errors as single rejected rows,
-- not double-count them. Also, encoding error SREH should work when
-- transcoding is required.
--
-- ===================================================================
-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
--
-- The file has 3 lines:
-- line 1: valid
-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
-- line 3: valid
--
-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
-- and 1 < 2. Before the fix, the error was double-counted (counted as 2),
-- which would cause the reject limit to be reached on the next error check.
-- ===================================================================
CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
-- Verify that valid rows (lines 1 and 3) were imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
(2 rows)
-- Verify that exactly 1 error was logged (not 2).
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
1
(1 row)
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)
TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 2: Non-transcoding with multiple bad lines
--
-- The file has 5 lines: lines 2 and 4 are bad.
-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
-- All 3 valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
5 | good5
(3 rows)
-- Exactly 2 errors should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
2
(1 row)
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)
TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 3: Non-transcoding, reject limit reached correctly
--
-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
-- rejectcount (2) >= rejectlimit (2).
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid byte sequence for encoding "UTF8": 0xfe
CONTEXT: COPY copy_enc_err, line 3
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)
-- ===================================================================
-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
--
-- The file has 3 lines with data that claims to be EUC_CN:
-- line 1: valid ASCII (valid in EUC_CN)
-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
-- line 3: valid ASCII (valid in EUC_CN)
--
-- Before the fix, this would error with:
-- "Data validation error: since the source data need transcoding
-- sreh can not handle yet."
-- After the fix, it should skip line 2 and import lines 1 and 3.
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
-- Valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
(2 rows)
-- Exactly 1 error should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
1
(1 row)
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)
TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 5: Transcoding with multiple bad lines
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
5 | good5
(3 rows)
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
2
(1 row)
-- Cleanup
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)
DROP TABLE copy_enc_err;