diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c index 5d1ff1b53c8..6bf309478aa 100644 --- a/src/backend/commands/copyfromparse.c +++ b/src/backend/commands/copyfromparse.c @@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate) */ if (cstate->input_reached_error) { - /* so far, we only support no transcoding conversion error handling */ - if (cstate->cdbsreh && !cstate->need_transcoding) + if (cstate->cdbsreh) { MemoryContext oldcontext = CurrentMemoryContext; PG_TRY(); @@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo, void RemoveInvalidDataInBuf(CopyFromState cstate) { - int nbytes; int scanidx; if (cstate->errMode == ALL_OR_NOTHING) @@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate) if (!cstate->need_transcoding) { + int nbytes; + /* * According to `BeginCopyFrom`, if not need_transcoding these two * pointer share one memory space. @@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate) /* leave a hint to identify find eol after next raw page read */ cstate->find_eol_with_rawreading = true; } - - /* reset input buf, so we can redo conversion/verification */ - cstate->input_reached_error = false; - cstate->input_buf_index = 0; - cstate->input_buf_len = 0; - - /* reset line_buf */ - resetStringInfo(&cstate->line_buf); - cstate->line_buf_valid = false; - cstate->cdbsreh->rejectcount++; } else { - ereport(ERROR, (errmsg("Data validation error: since the source data " - "need transcoding sreh can not handle yet."))); + /* + * Transcoding case: raw_buf and input_buf are separate buffers. + * Skip the bad line in raw_buf by finding the next EOL. No need to + * memmove raw_buf here; CopyLoadRawBuf() will compact it when more + * raw data is needed. + */ + if (FindEolInUnverifyRawBuf(cstate, &scanidx)) + { + cstate->raw_buf_index += scanidx; + } + else + { + /* Current page can not find eol, to skip current raw buffer */ + cstate->raw_buf_len = 0; + cstate->raw_buf_index = 0; + + /* leave a hint to identify find eol after next raw page read */ + cstate->find_eol_with_rawreading = true; + } } + + /* reset input buf, so we can redo conversion/verification */ + cstate->input_reached_error = false; + cstate->input_buf_index = 0; + cstate->input_buf_len = 0; + + /* reset line_buf */ + resetStringInfo(&cstate->line_buf); + cstate->line_buf_valid = false; } static bool diff --git a/src/test/regress/data/copy_enc_err_euccn.data b/src/test/regress/data/copy_enc_err_euccn.data new file mode 100644 index 00000000000..a73dd217cd3 --- /dev/null +++ b/src/test/regress/data/copy_enc_err_euccn.data @@ -0,0 +1,3 @@ +1|good1 +2|bad¡ +3|good3 diff --git a/src/test/regress/data/copy_enc_err_euccn_multi.data b/src/test/regress/data/copy_enc_err_euccn_multi.data new file mode 100644 index 00000000000..8a5a30ae25e --- /dev/null +++ b/src/test/regress/data/copy_enc_err_euccn_multi.data @@ -0,0 +1,5 @@ +1|good1 +2|bad¡ +3|good3 +4|bad¡ +5|good5 diff --git a/src/test/regress/data/copy_enc_err_utf8.data b/src/test/regress/data/copy_enc_err_utf8.data new file mode 100644 index 00000000000..e8297cc05ca --- /dev/null +++ b/src/test/regress/data/copy_enc_err_utf8.data @@ -0,0 +1,3 @@ +1|good1 +2|bad +3|good3 diff --git a/src/test/regress/data/copy_enc_err_utf8_multi.data b/src/test/regress/data/copy_enc_err_utf8_multi.data new file mode 100644 index 00000000000..9544e7d3c13 --- /dev/null +++ b/src/test/regress/data/copy_enc_err_utf8_multi.data @@ -0,0 +1,5 @@ +1|good1 +2|bad +3|good3 +4|badþ +5|good5 diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore index 927156dbc3f..c837ca324d5 100644 --- a/src/test/regress/expected/.gitignore +++ b/src/test/regress/expected/.gitignore @@ -69,3 +69,4 @@ /tag.out /ao_unique_index_partition.out /bfv_copy.out +/copy_encoding_error.out diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule index 039e8d7e9c4..3c8f7965b28 100755 --- a/src/test/regress/greenplum_schedule +++ b/src/test/regress/greenplum_schedule @@ -35,6 +35,7 @@ test: gp_dispatch_keepalives # copy command # copy form a file with different EOL test: copy_eol +test: copy_encoding_error test: dedupset diff --git a/src/test/regress/input/copy_encoding_error.source b/src/test/regress/input/copy_encoding_error.source new file mode 100644 index 00000000000..6302442b761 --- /dev/null +++ b/src/test/regress/input/copy_encoding_error.source @@ -0,0 +1,107 @@ +-- +-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT. +-- +-- Regression test for https://github.com/apache/cloudberry/issues/1425 +-- COPY FROM should correctly count encoding errors as single rejected rows, +-- not double-count them. Also, encoding error SREH should work when +-- transcoding is required. +-- + +-- =================================================================== +-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database) +-- +-- The file has 3 lines: +-- line 1: valid +-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline) +-- line 3: valid +-- +-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row, +-- and 1 < 2. Before the fix, the error was double-counted (counted as 2), +-- which would cause the reject limit to be reached on the next error check. +-- =================================================================== + +CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a); + +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; + +-- Verify that valid rows (lines 1 and 3) were imported. +SELECT * FROM copy_enc_err ORDER BY a; + +-- Verify that exactly 1 error was logged (not 2). +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + +SELECT gp_truncate_error_log('copy_enc_err'); +TRUNCATE copy_enc_err; + +-- =================================================================== +-- Test 2: Non-transcoding with multiple bad lines +-- +-- The file has 5 lines: lines 2 and 4 are bad. +-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors. +-- =================================================================== + +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; + +-- All 3 valid rows should be imported. +SELECT * FROM copy_enc_err ORDER BY a; + +-- Exactly 2 errors should be logged. +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + +SELECT gp_truncate_error_log('copy_enc_err'); +TRUNCATE copy_enc_err; + +-- =================================================================== +-- Test 3: Non-transcoding, reject limit reached correctly +-- +-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because +-- rejectcount (2) >= rejectlimit (2). +-- =================================================================== + +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; + +SELECT gp_truncate_error_log('copy_enc_err'); + +-- =================================================================== +-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database) +-- +-- The file has 3 lines with data that claims to be EUC_CN: +-- line 1: valid ASCII (valid in EUC_CN) +-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows) +-- line 3: valid ASCII (valid in EUC_CN) +-- +-- Before the fix, this would error with: +-- "Data validation error: since the source data need transcoding +-- sreh can not handle yet." +-- After the fix, it should skip line 2 and import lines 1 and 3. +-- =================================================================== + +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|' + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; + +-- Valid rows should be imported. +SELECT * FROM copy_enc_err ORDER BY a; + +-- Exactly 1 error should be logged. +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + +SELECT gp_truncate_error_log('copy_enc_err'); +TRUNCATE copy_enc_err; + +-- =================================================================== +-- Test 5: Transcoding with multiple bad lines +-- =================================================================== + +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|' + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; + +SELECT * FROM copy_enc_err ORDER BY a; + +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + +-- Cleanup +SELECT gp_truncate_error_log('copy_enc_err'); +DROP TABLE copy_enc_err; diff --git a/src/test/regress/output/copy_encoding_error.source b/src/test/regress/output/copy_encoding_error.source new file mode 100644 index 00000000000..3beceb6b2f3 --- /dev/null +++ b/src/test/regress/output/copy_encoding_error.source @@ -0,0 +1,161 @@ +-- +-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT. +-- +-- Regression test for https://github.com/apache/cloudberry/issues/1425 +-- COPY FROM should correctly count encoding errors as single rejected rows, +-- not double-count them. Also, encoding error SREH should work when +-- transcoding is required. +-- +-- =================================================================== +-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database) +-- +-- The file has 3 lines: +-- line 1: valid +-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline) +-- line 3: valid +-- +-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row, +-- and 1 < 2. Before the fix, the error was double-counted (counted as 2), +-- which would cause the reject limit to be reached on the next error check. +-- =================================================================== +CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a); +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; +NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data +-- Verify that valid rows (lines 1 and 3) were imported. +SELECT * FROM copy_enc_err ORDER BY a; + a | b +---+------- + 1 | good1 + 3 | good3 +(2 rows) + +-- Verify that exactly 1 error was logged (not 2). +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + error_count +------------- + 1 +(1 row) + +SELECT gp_truncate_error_log('copy_enc_err'); + gp_truncate_error_log +----------------------- + t +(1 row) + +TRUNCATE copy_enc_err; +-- =================================================================== +-- Test 2: Non-transcoding with multiple bad lines +-- +-- The file has 5 lines: lines 2 and 4 are bad. +-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors. +-- =================================================================== +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; +NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data +-- All 3 valid rows should be imported. +SELECT * FROM copy_enc_err ORDER BY a; + a | b +---+------- + 1 | good1 + 3 | good3 + 5 | good5 +(3 rows) + +-- Exactly 2 errors should be logged. +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + error_count +------------- + 2 +(1 row) + +SELECT gp_truncate_error_log('copy_enc_err'); + gp_truncate_error_log +----------------------- + t +(1 row) + +TRUNCATE copy_enc_err; +-- =================================================================== +-- Test 3: Non-transcoding, reject limit reached correctly +-- +-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because +-- rejectcount (2) >= rejectlimit (2). +-- =================================================================== +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|' + LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; +ERROR: segment reject limit reached, aborting operation +DETAIL: Last error was: invalid byte sequence for encoding "UTF8": 0xfe +CONTEXT: COPY copy_enc_err, line 3 +SELECT gp_truncate_error_log('copy_enc_err'); + gp_truncate_error_log +----------------------- + t +(1 row) + +-- =================================================================== +-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database) +-- +-- The file has 3 lines with data that claims to be EUC_CN: +-- line 1: valid ASCII (valid in EUC_CN) +-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows) +-- line 3: valid ASCII (valid in EUC_CN) +-- +-- Before the fix, this would error with: +-- "Data validation error: since the source data need transcoding +-- sreh can not handle yet." +-- After the fix, it should skip line 2 and import lines 1 and 3. +-- =================================================================== +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|' + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS; +NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data +-- Valid rows should be imported. +SELECT * FROM copy_enc_err ORDER BY a; + a | b +---+------- + 1 | good1 + 3 | good3 +(2 rows) + +-- Exactly 1 error should be logged. +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + error_count +------------- + 1 +(1 row) + +SELECT gp_truncate_error_log('copy_enc_err'); + gp_truncate_error_log +----------------------- + t +(1 row) + +TRUNCATE copy_enc_err; +-- =================================================================== +-- Test 5: Transcoding with multiple bad lines +-- =================================================================== +COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|' + ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS; +NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data +SELECT * FROM copy_enc_err ORDER BY a; + a | b +---+------- + 1 | good1 + 3 | good3 + 5 | good5 +(3 rows) + +SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err'); + error_count +------------- + 2 +(1 row) + +-- Cleanup +SELECT gp_truncate_error_log('copy_enc_err'); + gp_truncate_error_log +----------------------- + t +(1 row) + +DROP TABLE copy_enc_err; diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore index 7cbb805e8f5..9b5f3660fa7 100644 --- a/src/test/regress/sql/.gitignore +++ b/src/test/regress/sql/.gitignore @@ -63,3 +63,4 @@ /tag.sql /ao_unique_index_partition.sql /bfv_copy.sql +/copy_encoding_error.sql