apache · tuhaihe · Feb 28, 2026 · Feb 28, 2026
diff --git a/backup/statistics.go b/backup/statistics.go
@@ -14,6 +14,11 @@ import (
 	"github.com/lib/pq"
 )
 
+const (
+	// STATISTIC_KIND_NDV_BY_SEGMENTS is the statistic kind (8) holding the sum of per-segment NDVs; it exists only in Cloudberry Database 2.1.0+.
+	STATISTIC_KIND_NDV_BY_SEGMENTS = 8
+)
+
 func PrintStatisticsStatements(statisticsFile *utils.FileWithByteCount, tocfile *toc.TOC, tables []Table, attStats map[uint32][]AttributeStatistic, tupleStats map[uint32]TupleStatistic) {
 	for _, table := range tables {
 		tupleQuery := GenerateTupleStatisticsQuery(table, tupleStats[table.Oid])
@@ -162,11 +167,11 @@ func generateAttributeSlotsQuery7(attStat AttributeStatistic) string {
 			realValues(attStat.Numbers3),
 			realValues(attStat.Numbers4),
 			realValues(attStat.Numbers5),
-			AnyValues(attStat.Values1, attStat.Type),
-			AnyValues(attStat.Values2, attStat.Type),
-			AnyValues(attStat.Values3, attStat.Type),
-			AnyValues(attStat.Values4, attStat.Type),
-			AnyValues(attStat.Values5, attStat.Type))
+			AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+			AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+			AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+			AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+			AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
 	}
 	return attributeQuery
 }
@@ -230,11 +235,11 @@ func generateAttributeSlotsQuery6(attStat AttributeStatistic) string {
 			realValues(attStat.Numbers3),
 			realValues(attStat.Numbers4),
 			realValues(attStat.Numbers5),
-			AnyValues(attStat.Values1, attStat.Type),
-			AnyValues(attStat.Values2, attStat.Type),
-			AnyValues(attStat.Values3, attStat.Type),
-			AnyValues(attStat.Values4, attStat.Type),
-			AnyValues(attStat.Values5, attStat.Type))
+			AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+			AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+			AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+			AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
+			AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
 	}
 	return attributeQuery
 }
@@ -286,10 +291,10 @@ func generateAttributeSlotsQuery4(attStat AttributeStatistic) string {
 			realValues(attStat.Numbers2),
 			realValues(attStat.Numbers3),
 			realValues(attStat.Numbers4),
-			AnyValues(attStat.Values1, attStat.Type),
-			AnyValues(attStat.Values2, attStat.Type),
-			AnyValues(attStat.Values3, attStat.Type),
-			AnyValues(attStat.Values4, attStat.Type))
+			AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
+			AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
+			AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
+			AnyValues(attStat.Values4, attStat.Type, attStat.Kind4))
 	}
 	return attributeQuery
 }
@@ -317,10 +322,20 @@ func realValues(reals pq.StringArray) string {
 /*
  * A given type is not guaranteed to have a corresponding array type, so we need
  * to use array_in() instead of casting to an array.
+ *
+ * STATISTIC_KIND_NDV_BY_SEGMENTS (8) is a special case: its slot holds
+ * floating-point values (e.g. "3.0000000596046448") rather than values of the
+ * column's native type, so the array is rebuilt with float8 elements. An integer
+ * element type such as int8 would reject any non-integral value at restore time.
+ *
+ * Returns the literal string "NULL" when the slot has no values.
  */
-func AnyValues(any pq.StringArray, typ string) string {
+func AnyValues(any pq.StringArray, typ string, kind int) string {
 	if len(any) > 0 {
+		if kind == STATISTIC_KIND_NDV_BY_SEGMENTS {
+			return fmt.Sprintf(`array_in(%s, 'float8'::regtype::oid, -1)`, SliceToPostgresArray(any))
+		}
 		return fmt.Sprintf(`array_in(%s, '%s'::regtype::oid, -1)`, SliceToPostgresArray(any), typ)
 	}
-	return fmt.Sprintf("NULL")
+	return "NULL"
 }
diff --git a/backup/statistics_test.go b/backup/statistics_test.go
@@ -235,11 +235,15 @@ WHERE oid = '"""test''schema"""."""test''table"""'::regclass::oid;`))
 	})
 	Describe("AnyValues", func() {
 		It("returns properly casted string when length of anyvalues is greater than 0", func() {
-			castedString := backup.AnyValues([]string{"1", "2"}, "int")
+			castedString := backup.AnyValues([]string{"1", "2"}, "int", 1)
 			Expect(castedString).To(Equal(`array_in('{"1","2"}', 'int'::regtype::oid, -1)`))
 		})
+		It("returns float8 casted string when kind is STATISTIC_KIND_NDV_BY_SEGMENTS", func() {
+			castedString := backup.AnyValues([]string{"2"}, "bool", backup.STATISTIC_KIND_NDV_BY_SEGMENTS)
+			Expect(castedString).To(Equal(`array_in('{"2"}', 'float8'::regtype::oid, -1)`))
+		})
 		It("returns NULL if anyvalues is of length 0", func() {
-			castedString := backup.AnyValues([]string{}, "int")
+			castedString := backup.AnyValues([]string{}, "int", 1)
 			Expect(castedString).To(Equal(`NULL`))
 		})
 	})

diff --git a/integration/statistics_queries_test.go b/integration/statistics_queries_test.go
@@ -56,6 +56,39 @@ var _ = Describe("backup integration tests", func() {
 				expectedStats5J.Collation1 = 100
 				expectedStats5J.Collation2 = 100
 			}
+			if connectionPool.Version.IsCBDB() && connectionPool.Version.AtLeast("2.1.0") {
+				// Cloudberry Database 2.1.0 introduced STATISTIC_KIND_NDV_BY_SEGMENTS (8).
+				// In this test case, due to the small data volume, this statistic is
+				// automatically placed into the 3rd slot (stakind3) by the analyze command.
+				expectedStats5I.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+				expectedStats5J.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+				expectedStats5K.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS
+
+				// Set the operator OID for this new statistic kind
+				// i (int4) uses operator 97 (=)
+				// j (text) uses operator 664 (=) and collation 100
+				// k (bool) uses operator 58 (=)
+				expectedStats5I.Operator3 = 97
+				expectedStats5J.Operator3 = 664
+				expectedStats5J.Collation3 = 100
+				expectedStats5K.Operator3 = 58
+
+				// 4 distinct rows were inserted for 'i' (int) and 'j' (text) columns
+				expectedStats5I.Values3 = []string{"4"}
+				expectedStats5J.Values3 = []string{"4"}
+
+				// Why is 'k' (bool) 3.0000000596046448 instead of 2?
+				// 1. STATISTIC_KIND_NDV_BY_SEGMENTS (8) is the SUM of local NDVs across all segments, NOT the global NDV.
+				// 2. Based on the hash distribution (using 'i' as distribution key), the rows map to segments like so:
+				//    - Seg 0 gets 3 rows: (2,b,f), (3,c,t), (4,d,f). Local NDV for 'k' on Seg 0 = 2 ('f' and 't')
+				//    - Seg 1 gets 1 row: (1,a,t). Local NDV for 'k' on Seg 1 = 1 ('t')
+				//    - Seg 2 gets 0 rows. Local NDV for 'k' on Seg 2 = 0
+				//    - Sum of Local NDVs = 2 + 1 + 0 = 3
+				// 3. The optimizer uses this to estimate intermediate rows generated during a two-stage aggregation (Partial Agg).
+				// 4. The value is a floating-point estimate, not an exact integer count: it reads back as 3 + 2^-24 (3.0000000596046448).
+				//    Widening a float4 to float8 is exact (and 3.0 is exact in float4), so the excess presumably comes from the NDV computation itself.
+				expectedStats5K.Values3 = []string{"3.0000000596046448"}
+			}
 
 			// The order in which the stavalues1 values is returned is not guaranteed to be deterministic
 			sort.Strings(tableAttStatsI.Values1)