apache · avamingli · Mar 3, 2026 · Mar 3, 2026
diff --git a/src/backend/commands/copyfromparse.c b/src/backend/commands/copyfromparse.c
@@ -649,8 +649,7 @@ CopyLoadInputBuf(CopyFromState cstate)
 		 */
 		if (cstate->input_reached_error)
 		{
-			/* so far, we only support no transcoding conversion error handling */
-			if (cstate->cdbsreh && !cstate->need_transcoding)
+			if (cstate->cdbsreh)
 			{
 				MemoryContext oldcontext = CurrentMemoryContext;
 				PG_TRY();
@@ -1788,7 +1787,6 @@ CopyReadBinaryAttribute(CopyFromState cstate, FmgrInfo *flinfo,
 void
 RemoveInvalidDataInBuf(CopyFromState cstate)
 {
-	int nbytes;
 	int scanidx;
 
 	if (cstate->errMode == ALL_OR_NOTHING)
@@ -1800,6 +1798,8 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 
 	if (!cstate->need_transcoding)
 	{
+		int nbytes;
+
 		/*
 		 * According to `BeginCopyFrom`, if not need_transcoding these two
 		 * pointer share one memory space.
@@ -1826,22 +1826,38 @@ RemoveInvalidDataInBuf(CopyFromState cstate)
 			/* leave a hint to identify find eol after next raw page read */
 			cstate->find_eol_with_rawreading = true;
 		}
-
-		/* reset input buf, so we can redo conversion/verification */
-		cstate->input_reached_error = false;
-		cstate->input_buf_index = 0;
-		cstate->input_buf_len = 0;
-
-		/* reset line_buf */
-		resetStringInfo(&cstate->line_buf);
-		cstate->line_buf_valid = false;
-		cstate->cdbsreh->rejectcount++;
 	}
 	else
 	{
-		ereport(ERROR, (errmsg("Data validation error: since the source data "
-							   "need transcoding sreh can not handle yet.")));
+		/*
+		 * Transcoding case: raw_buf and input_buf are separate buffers.
+		 * Skip the bad line in raw_buf by finding the next EOL.  No need to
+		 * memmove raw_buf here; CopyLoadRawBuf() will compact it when more
+		 * raw data is needed.
+		 */
+		if (FindEolInUnverifyRawBuf(cstate, &scanidx))
+		{
+			cstate->raw_buf_index += scanidx;
+		}
+		else
+		{
+			/* Current page can not find eol, to skip current raw buffer */
+			cstate->raw_buf_len = 0;
+			cstate->raw_buf_index = 0;
+
+			/* leave a hint to identify find eol after next raw page read */
+			cstate->find_eol_with_rawreading = true;
+		}
 	}
+
+	/* reset input buf, so we can redo conversion/verification */
+	cstate->input_reached_error = false;
+	cstate->input_buf_index = 0;
+	cstate->input_buf_len = 0;
+
+	/* reset line_buf */
+	resetStringInfo(&cstate->line_buf);
+	cstate->line_buf_valid = false;
 }
 
 static bool

diff --git a/src/test/regress/data/copy_enc_err_euccn.data b/src/test/regress/data/copy_enc_err_euccn.data
@@ -0,0 +1,3 @@
+1|good1
+2|bad¡
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_euccn_multi.data b/src/test/regress/data/copy_enc_err_euccn_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|bad¡
+3|good3
+4|bad¡
+5|good5
diff --git a/src/test/regress/data/copy_enc_err_utf8.data b/src/test/regress/data/copy_enc_err_utf8.data
@@ -0,0 +1,3 @@
+1|good1
+2|badÂ
+3|good3
diff --git a/src/test/regress/data/copy_enc_err_utf8_multi.data b/src/test/regress/data/copy_enc_err_utf8_multi.data
@@ -0,0 +1,5 @@
+1|good1
+2|badÂ
+3|good3
+4|badþ
+5|good5
diff --git a/src/test/regress/expected/.gitignore b/src/test/regress/expected/.gitignore
@@ -69,3 +69,4 @@
 /tag.out
 /ao_unique_index_partition.out
 /bfv_copy.out
+/copy_encoding_error.out
diff --git a/src/test/regress/greenplum_schedule b/src/test/regress/greenplum_schedule
@@ -35,6 +35,7 @@ test: gp_dispatch_keepalives
 # copy command
 # copy form a file with different EOL
 test: copy_eol
+test: copy_encoding_error
 
 test: dedupset
 

diff --git a/src/test/regress/input/copy_encoding_error.source b/src/test/regress/input/copy_encoding_error.source
@@ -0,0 +1,107 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+SELECT gp_truncate_error_log('copy_enc_err');
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+SELECT gp_truncate_error_log('copy_enc_err');
+TRUNCATE copy_enc_err;
+
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+
+SELECT * FROM copy_enc_err ORDER BY a;
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/output/copy_encoding_error.source b/src/test/regress/output/copy_encoding_error.source
@@ -0,0 +1,161 @@
+--
+-- Test COPY FROM with invalid multi-byte encoding and SEGMENT REJECT LIMIT.
+--
+-- Regression test for https://github.com/apache/cloudberry/issues/1425
+-- COPY FROM should correctly count encoding errors as single rejected rows,
+-- not double-count them.  Also, encoding error SREH should work when
+-- transcoding is required.
+--
+-- ===================================================================
+-- Test 1: Non-transcoding case (invalid UTF-8 into UTF-8 database)
+--
+-- The file has 3 lines:
+--   line 1: valid
+--   line 2: ends with 0xC2 (incomplete 2-byte UTF-8 sequence before newline)
+--   line 3: valid
+--
+-- With SEGMENT REJECT LIMIT 2, this should succeed: only 1 error row,
+-- and 1 < 2.  Before the fix, the error was double-counted (counted as 2),
+-- which would cause the reject limit to be reached on the next error check.
+-- ===================================================================
+CREATE TABLE copy_enc_err(a int, b text) DISTRIBUTED BY (a);
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Verify that valid rows (lines 1 and 3) were imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Verify that exactly 1 error was logged (not 2).
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 2: Non-transcoding with multiple bad lines
+--
+-- The file has 5 lines: lines 2 and 4 are bad.
+-- With SEGMENT REJECT LIMIT 10, this should succeed with 2 errors.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+-- All 3 valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+-- Exactly 2 errors should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 3: Non-transcoding, reject limit reached correctly
+--
+-- 2 bad lines with SEGMENT REJECT LIMIT 2 should fail, because
+-- rejectcount (2) >= rejectlimit (2).
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_utf8_multi.data' DELIMITER '|'
+    LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+ERROR:  segment reject limit reached, aborting operation
+DETAIL:  Last error was: invalid byte sequence for encoding "UTF8": 0xfe
+CONTEXT:  COPY copy_enc_err, line 3
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+-- ===================================================================
+-- Test 4: Transcoding case (invalid EUC_CN into UTF-8 database)
+--
+-- The file has 3 lines with data that claims to be EUC_CN:
+--   line 1: valid ASCII (valid in EUC_CN)
+--   line 2: ends with 0xA1 (starts a 2-byte EUC_CN char, but \n follows)
+--   line 3: valid ASCII (valid in EUC_CN)
+--
+-- Before the fix, this would error with:
+--   "Data validation error: since the source data need transcoding
+--    sreh can not handle yet."
+-- After the fix, it should skip line 2 and import lines 1 and 3.
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 2 ROWS;
+NOTICE:  found 1 data formatting errors (1 or more input rows), rejected related input data
+-- Valid rows should be imported.
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+(2 rows)
+
+-- Exactly 1 error should be logged.
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           1
+(1 row)
+
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+TRUNCATE copy_enc_err;
+-- ===================================================================
+-- Test 5: Transcoding with multiple bad lines
+-- ===================================================================
+COPY copy_enc_err FROM '@abs_srcdir@/data/copy_enc_err_euccn_multi.data' DELIMITER '|'
+    ENCODING 'euc_cn' LOG ERRORS SEGMENT REJECT LIMIT 10 ROWS;
+NOTICE:  found 2 data formatting errors (2 or more input rows), rejected related input data
+SELECT * FROM copy_enc_err ORDER BY a;
+ a |   b   
+---+-------
+ 1 | good1
+ 3 | good3
+ 5 | good5
+(3 rows)
+
+SELECT count(*) AS error_count FROM gp_read_error_log('copy_enc_err');
+ error_count 
+-------------
+           2
+(1 row)
+
+-- Cleanup
+SELECT gp_truncate_error_log('copy_enc_err');
+ gp_truncate_error_log 
+-----------------------
+ t
+(1 row)
+
+DROP TABLE copy_enc_err;
diff --git a/src/test/regress/sql/.gitignore b/src/test/regress/sql/.gitignore
@@ -63,3 +63,4 @@
 /tag.sql
 /ao_unique_index_partition.sql
 /bfv_copy.sql
+/copy_encoding_error.sql
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+|good1
+|bad¡
+|good3