Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 31 additions & 15 deletions src/backend/commands/copyfromparse.c
Original file line number Diff line number Diff line change
Expand Up @@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate)
*/
if (cstate->input_reached_error)
{
/* so far, we only support no transcoding conversion error handling */
if (cstate->cdbsreh && !cstate->need_transcoding)
if (cstate->cdbsreh)
{
MemoryContext oldcontext = CurrentMemoryContext;
PG_TRY();
Expand Down Expand Up @@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
void
RemoveInvalidDataInBuf(CopyFromState cstate)
{
int nbytes;
int scanidx;

if (cstate->errMode == ALL_OR_NOTHING)
Expand All @@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)

if (!cstate->need_transcoding)
{
int nbytes;

/*
* According to `BeginCopyFrom`, if need_transcoding is false these two
* pointers share one memory space.
Expand All @@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
/* leave a hint to look for the eol after the next raw page read */
cstate->find_eol_with_rawreading = true;
}

/* reset input buf, so we can redo conversion/verification */
cstate->input_reached_error = false;
cstate->input_buf_index = 0;
cstate->input_buf_len = 0;

/* reset line_buf */
resetStringInfo(&cstate->line_buf);
cstate->line_buf_valid = false;
cstate->cdbsreh->rejectcount++;
}
else
{
ereport(ERROR, (errmsg("Data validation error: since the source data "
"need transcoding sreh can not handle yet.")));
/*
* Transcoding case: raw_buf and input_buf are separate buffers.
* Skip the bad line in raw_buf by finding the next EOL. No need to
* memmove raw_buf here; CopyLoadRawBuf() will compact it when more
* raw data is needed.
*/
if (FindEolInUnverifyRawBuf(cstate, &scanidx))
{
cstate->raw_buf_index += scanidx;
}
else
{
/* No eol found in the current page, so skip the whole current raw buffer */
cstate->raw_buf_len = 0;
cstate->raw_buf_index = 0;

/* leave a hint to look for the eol after the next raw page read */
cstate->find_eol_with_rawreading = true;
}
}

/* reset input buf, so we can redo conversion/verification */
cstate->input_reached_error = false;
cstate->input_buf_index = 0;
cstate->input_buf_len = 0;

/* reset line_buf */
resetStringInfo(&cstate->line_buf);
cstate->line_buf_valid = false;
}

static bool
Expand Down
3 changes: 3 additions & 0 deletions src/test/regress/data/copy_enc_err_euccn.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1|good1
2|bad¡
3|good3
5 changes: 5 additions & 0 deletions src/test/regress/data/copy_enc_err_euccn_multi.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1|good1
2|bad¡
3|good3
4|bad¡
5|good5
3 changes: 3 additions & 0 deletions src/test/regress/data/copy_enc_err_utf8.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
1|good1
2|badÂ
3|good3
5 changes: 5 additions & 0 deletions src/test/regress/data/copy_enc_err_utf8_multi.data
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1|good1
2|badÂ
3|good3
4|badþ
5|good5
1 change: 1 addition & 0 deletions src/test/regress/expected/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -69,3 +69,4 @@
/tag.out
/ao_unique_index_partition.out
/bfv_copy.out
/copy_encoding_error.out
1 change: 1 addition & 0 deletions src/test/regress/greenplum_schedule
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
# copy command
# copy from a file with different EOL
test: copy_eol
test: copy_encoding_error

test: dedupset

Expand Down
107 changes: 107 additions & 0 deletions src/test/regress/input/copy_encoding_error.source
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
--
-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
--
-- Regression test for https://github.com/apache/cloudberry/issues/1425
-- COPY FROM should correctly count encoding errors as single rejected rows,
-- not double-count them. Also, encoding error SREH should work when
-- transcoding is required.
--

-- ===================================================================
-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
--
-- The file has 3 lines:
-- line 1: valid
-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
-- line 3: valid
--
-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
-- and 1 < 2. Before the fix, the error was double-counted (counted as 2),
-- which would cause the reject limit to be reached on the next error check.
-- ===================================================================

CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);

COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;

-- Verify that valid rows (lines 1 and 3) were imported.
SELECT * FROM copy_enc_err ORDER BY a;

-- Verify that exactly 1 error was logged (not 2).
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');

SELECT gp_truncate_error_log('copy_enc_err');
TRUNCATE copy_enc_err;

-- ===================================================================
-- Test 2: Non-transcoding with multiple bad lines
--
-- The file has 5 lines: lines 2 and 4 are bad.
-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
-- ===================================================================

COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;

-- All 3 valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;

-- Exactly 2 errors should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');

SELECT gp_truncate_error_log('copy_enc_err');
TRUNCATE copy_enc_err;

-- ===================================================================
-- Test 3: Non-transcoding, reject limit reached correctly
--
-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
-- rejectcount (2) >= rejectlimit (2).
-- ===================================================================

COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;

SELECT gp_truncate_error_log('copy_enc_err');

-- ===================================================================
-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
--
-- The file has 3 lines with data that claims to be EUC_CN:
-- line 1: valid ASCII (valid in EUC_CN)
-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
-- line 3: valid ASCII (valid in EUC_CN)
--
-- Before the fix, this would error with:
-- "Data validation error: since the source data need transcoding
-- sreh can not handle yet."
-- After the fix, it should skip line 2 and import lines 1 and 3.
-- ===================================================================

COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;

-- Valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;

-- Exactly 1 error should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');

SELECT gp_truncate_error_log('copy_enc_err');
TRUNCATE copy_enc_err;

-- ===================================================================
-- Test 5: Transcoding with multiple bad lines
-- ===================================================================

COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;

SELECT * FROM copy_enc_err ORDER BY a;

SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');

-- Cleanup
SELECT gp_truncate_error_log('copy_enc_err');
DROP TABLE copy_enc_err;
161 changes: 161 additions & 0 deletions src/test/regress/output/copy_encoding_error.source
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
--
-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
--
-- Regression test for https://github.com/apache/cloudberry/issues/1425
-- COPY FROM should correctly count encoding errors as single rejected rows,
-- not double-count them. Also, encoding error SREH should work when
-- transcoding is required.
--
-- ===================================================================
-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
--
-- The file has 3 lines:
-- line 1: valid
-- line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
-- line 3: valid
--
-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
-- and 1 < 2. Before the fix, the error was double-counted (counted as 2),
-- which would cause the reject limit to be reached on the next error check.
-- ===================================================================
CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
-- Verify that valid rows (lines 1 and 3) were imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
(2 rows)

-- Verify that exactly 1 error was logged (not 2).
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
1
(1 row)

SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)

TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 2: Non-transcoding with multiple bad lines
--
-- The file has 5 lines: lines 2 and 4 are bad.
-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
-- All 3 valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
5 | good5
(3 rows)

-- Exactly 2 errors should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
2
(1 row)

SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)

TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 3: Non-transcoding, reject limit reached correctly
--
-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
-- rejectcount (2) >= rejectlimit (2).
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
ERROR: segment reject limit reached, aborting operation
DETAIL: Last error was: invalid byte sequence for encoding "UTF8": 0xfe
CONTEXT: COPY copy_enc_err, line 3
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)

-- ===================================================================
-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
--
-- The file has 3 lines with data that claims to be EUC_CN:
-- line 1: valid ASCII (valid in EUC_CN)
-- line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
-- line 3: valid ASCII (valid in EUC_CN)
--
-- Before the fix, this would error with:
-- "Data validation error: since the source data need transcoding
-- sreh can not handle yet."
-- After the fix, it should skip line 2 and import lines 1 and 3.
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
NOTICE: found 1 data formatting errors (1 or more input rows), rejected related input data
-- Valid rows should be imported.
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
(2 rows)

-- Exactly 1 error should be logged.
SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
1
(1 row)

SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)

TRUNCATE copy_enc_err;
-- ===================================================================
-- Test 5: Transcoding with multiple bad lines
-- ===================================================================
COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
NOTICE: found 2 data formatting errors (2 or more input rows), rejected related input data
SELECT * FROM copy_enc_err ORDER BY a;
a | b
---+-------
1 | good1
3 | good3
5 | good5
(3 rows)

SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
error_count
-------------
2
(1 row)

-- Cleanup
SELECT gp_truncate_error_log('copy_enc_err');
gp_truncate_error_log
-----------------------
t
(1 row)

DROP TABLE copy_enc_err;
1 change: 1 addition & 0 deletions src/test/regress/sql/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,4 @@
/tag.sql
/ao_unique_index_partition.sql
/bfv_copy.sql
/copy_encoding_error.sql
Loading