From 285c03ae9d1e9adabd3e68c69d622e169e958c30 Mon Sep 17 00:00:00 2001
From: Zhang Mingli <avamingli@gmail.com>
Date: Tue, 3 Mar 2026 08:56:25 +0800
Subject: [PATCH] Fix COPY FROM encoding error double-counting and enable SREH
 for transcoding

COPY FROM with SEGMENT REJECT LIMIT had two bugs when encountering
invalid multi-byte encoding sequences:

1. Encoding errors were double-counted: HandleCopyError() incremented
   rejectcount, then RemoveInvalidDataInBuf() incremented it again for
   the same error. This caused the reject limit to be reached twice as
   fast as expected.

2. SREH (Single Row Error Handling) was completely disabled when
   transcoding was required (file encoding != database encoding). Any
   encoding error during transcoding would raise an ERROR instead of
   skipping the bad row.

Fix by removing the duplicate rejectcount++ from RemoveInvalidDataInBuf(),
removing the !need_transcoding guard that blocked SREH for transcoding,
and adding proper buffer cleanup for the transcoding case (advance
raw_buf past the bad line using FindEolInUnverifyRawBuf).

Add regression tests covering both non-transcoding (invalid UTF-8) and
transcoding (invalid EUC_CN to UTF-8) cases with various reject limits.

Fixes https://github.com/apache/cloudberry/issues/1425
---
 src/backend/commands/copyfromparse.c          |  46 +++--
 src/test/regress/data/copy_enc_err_euccn.data |   3 +
 .../data/copy_enc_err_euccn_multi.data        |   5 +
 src/test/regress/data/copy_enc_err_utf8.data  |   3 +
 .../regress/data/copy_enc_err_utf8_multi.data |   5 +
 src/test/regress/expected/.gitignore          |   1 +
 src/test/regress/greenplum_schedule           |   1 +
 .../regress/input/copy_encoding_error.source  | 107 ++++++++++++
 .../regress/output/copy_encoding_error.source | 161 ++++++++++++++++++
 src/test/regress/sql/.gitignore               |   1 +
 10 files changed, 318 insertions(+), 15 deletions(-)
 create mode 100644 src/test/regress/data/copy_enc_err_euccn.data
 create mode 100644 src/test/regress/data/copy_enc_err_euccn_multi.data
 create mode 100644 src/test/regress/data/copy_enc_err_utf8.data
 create mode 100644 src/test/regress/data/copy_enc_err_utf8_multi.data
 create mode 100644 src/test/regress/input/copy_encoding_error.source
 create mode 100644 src/test/regress/output/copy_encoding_error.source

diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
index 5d1ff1b53c8..6bf309478aa 100644
--- a/src/backend/commands/copyfromparse.c
+++ b/src/backend/commands/copyfromparse.c
@@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate)
 		 */
 		if (cstate->input_reached_error)
 		{
-			/* so far, we only support no transcoding conversion error handling */
-			if (cstate->cdbsreh && !cstate->need_transcoding)
+			if (cstate->cdbsreh)
 			{
 				MemoryContext oldcontext = CurrentMemoryContext;
 				PG_TRY();
@@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 void
 RemoveInvalidDataInBuf(CopyFromState cstate)
 {
-	int nbytes;
 	int scanidx;
 
 	if (cstate->errMode == ALL_OR_NOTHING)
@@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 
 	if (!cstate->need_transcoding)
 	{
+		int nbytes;
+
 		/*
 		 * According to `BeginCopyFrom`, if not need_transcoding these two
 		 * pointer share one memory space.
@@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 			/* leave a hint to identify find eol after next raw page read */
 			cstate->find_eol_with_rawreading = true;
 		}
-
-		/* reset input buf, so we can redo conversion/verification */
-		cstate->input_reached_error = false;
-		cstate->input_buf_index = 0;
-		cstate->input_buf_len = 0;
-
-		/* reset line_buf */
-		resetStringInfo(&cstate->line_buf);
-		cstate->line_buf_valid = false;
-		cstate->cdbsreh->rejectcount++;
 	}
 	else
 	{
-		ereport(ERROR, (errmsg("Data validation error: since the source data "
-							   "need transcoding sreh can not handle yet.")));
+		/*
+		 * Transcoding case: raw_buf and input_buf are separate buffers.
+		 * Skip the bad line in raw_buf by finding the next EOL.  No need to
+		 * memmove raw_buf here; CopyLoadRawBuf() will compact it when more
+		 * raw data is needed.
+		 */
+		if (FindEolInUnverifyRawBuf(cstate, &scanidx))
+		{
+			cstate->raw_buf_index += scanidx;
+		}
+		else
+		{
+			/* Current page can not find eol, to skip current raw buffer */
+			cstate->raw_buf_len = 0;
+			cstate->raw_buf_index = 0;
+
+			/* leave a hint to identify find eol after next raw page read */
+			cstate->find_eol_with_rawreading = true;
+		}
 	}
+
+	/* reset input buf, so we can redo conversion/verification */
+	cstate->input_reached_error = false;
+	cstate->input_buf_index = 0;
+	cstate->input_buf_len = 0;
+
+	/* reset line_buf */
+	resetStringInfo(&cstate->line_buf);
+	cstate->line_buf_valid = false;
 }
 
 static bool
diff --git a/src/test/regress/data/copy_enc_err_euccn.data b/src/test/regress/data/copy_enc_err_euccn.data
new file mode 100644
index 00000000000..a73dd217cd3
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_euccn.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad¡
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_euccn_multi.data b/src/test/regress/data/copy_enc_err_euccn_multi.data
new file mode 100644
index 00000000000..8a5a30ae25e
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_euccn_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad¡
+3|good3
+4|bad¡
+5|good5
diff --git a/src/test/regress/data/copy_enc_err_utf8.data b/src/test/regress/data/copy_enc_err_utf8.data
new file mode 100644
index 00000000000..e8297cc05ca
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_utf8.data
@@ -0,0 +1,3 @@
+1|good1
+2|badÂ
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_utf8_multi.data b/src/test/regress/data/copy_enc_err_utf8_multi.data
new file mode 100644
index 00000000000..9544e7d3c13
--- /dev/null
+++ b/src/test/regress/data/copy_enc_err_utf8_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|badÂ
+3|good3
+4|badþ
+5|good5
diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore
index 927156dbc3f..c837ca324d5 100644
--- a/src/test/regress/expected/.gitignore
+++ b/src/test/regress/expected/.gitignore
@@ -69,3 +69,4 @@
 /tag.out
 /ao_unique_index_partition.out
 /bfv_copy.out
+/copy_encoding_error.out
diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule
index 039e8d7e9c4..3c8f7965b28 100755
--- a/src/test/regress/greenplum_schedule
+++ b/src/test/regress/greenplum_schedule
@@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
 # copy command
 # copy form a file with different EOL
 test: copy_eol
+test: copy_encoding_error
 
 test: dedupset
 
diff --git a/src/test/regress/input/copy_encoding_error.source b/src/test/regress/input/copy_encoding_error.source
new file mode 100644
index 00000000000..6302442b761
--- /dev/null
+++ b/src/test/regress/input/copy_encoding_error.source
@@ -0,0 +1,107 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+SELECT gp_truncate_error_log('copy_enc_err');
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+SELECT * FROM copy_enc_err ORDER BY a;
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/output/copy_encoding_error.source b/src/test/regress/output/copy_encoding_error.source
new file mode 100644
index 00000000000..3beceb6b2f3
--- /dev/null
+++ b/src/test/regress/output/copy_encoding_error.source
@@ -0,0 +1,161 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+ERROR:  segment reject limit reached, aborting operation
+DETAIL:  Last error was: invalid byte sequence for encoding "UTF8": 0xfe
+CONTEXT:  COPY copy_enc_err, line 3
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore
index 7cbb805e8f5..9b5f3660fa7 100644
--- a/src/test/regress/sql/.gitignore
+++ b/src/test/regress/sql/.gitignore
@@ -63,3 +63,4 @@
 /tag.sql
 /ao_unique_index_partition.sql
 /bfv_copy.sql
+/copy_encoding_error.sql