diff --git a/backup/statistics.go b/backup/statistics.go index 5c63866b..5c1f0e86 100644 --- a/backup/statistics.go +++ b/backup/statistics.go @@ -14,6 +14,11 @@ import ( "github.com/lib/pq" ) +const ( + // STATISTIC_KIND_NDV_BY_SEGMENTS is the stakind value (8) of the NDV-by-segments statistic slot, which is specific to Cloudberry Database 2.1.0+ + STATISTIC_KIND_NDV_BY_SEGMENTS = 8 +) + func PrintStatisticsStatements(statisticsFile *utils.FileWithByteCount, tocfile *toc.TOC, tables []Table, attStats map[uint32][]AttributeStatistic, tupleStats map[uint32]TupleStatistic) { for _, table := range tables { tupleQuery := GenerateTupleStatisticsQuery(table, tupleStats[table.Oid]) @@ -162,11 +167,11 @@ func generateAttributeSlotsQuery7(attStat AttributeStatistic) string { realValues(attStat.Numbers3), realValues(attStat.Numbers4), realValues(attStat.Numbers5), - AnyValues(attStat.Values1, attStat.Type), - AnyValues(attStat.Values2, attStat.Type), - AnyValues(attStat.Values3, attStat.Type), - AnyValues(attStat.Values4, attStat.Type), - AnyValues(attStat.Values5, attStat.Type)) + AnyValues(attStat.Values1, attStat.Type, attStat.Kind1), + AnyValues(attStat.Values2, attStat.Type, attStat.Kind2), + AnyValues(attStat.Values3, attStat.Type, attStat.Kind3), + AnyValues(attStat.Values4, attStat.Type, attStat.Kind4), + AnyValues(attStat.Values5, attStat.Type, attStat.Kind5)) } return attributeQuery } @@ -230,11 +235,11 @@ func generateAttributeSlotsQuery6(attStat AttributeStatistic) string { realValues(attStat.Numbers3), realValues(attStat.Numbers4), realValues(attStat.Numbers5), - AnyValues(attStat.Values1, attStat.Type), - AnyValues(attStat.Values2, attStat.Type), - AnyValues(attStat.Values3, attStat.Type), - AnyValues(attStat.Values4, attStat.Type), - AnyValues(attStat.Values5, attStat.Type)) + AnyValues(attStat.Values1, attStat.Type, attStat.Kind1), + AnyValues(attStat.Values2, attStat.Type, attStat.Kind2), + AnyValues(attStat.Values3, attStat.Type, attStat.Kind3), + AnyValues(attStat.Values4, attStat.Type, attStat.Kind4), + 
AnyValues(attStat.Values5, attStat.Type, attStat.Kind5)) } return attributeQuery } @@ -286,10 +291,10 @@ func generateAttributeSlotsQuery4(attStat AttributeStatistic) string { realValues(attStat.Numbers2), realValues(attStat.Numbers3), realValues(attStat.Numbers4), - AnyValues(attStat.Values1, attStat.Type), - AnyValues(attStat.Values2, attStat.Type), - AnyValues(attStat.Values3, attStat.Type), - AnyValues(attStat.Values4, attStat.Type)) + AnyValues(attStat.Values1, attStat.Type, attStat.Kind1), + AnyValues(attStat.Values2, attStat.Type, attStat.Kind2), + AnyValues(attStat.Values3, attStat.Type, attStat.Kind3), + AnyValues(attStat.Values4, attStat.Type, attStat.Kind4)) } return attributeQuery } @@ -317,10 +322,15 @@ func realValues(reals pq.StringArray) string { /* * A given type is not guaranteed to have a corresponding array type, so we need * to use array_in() instead of casting to an array. + * STATISTIC_KIND_NDV_BY_SEGMENTS (8) is a special case: its values are emitted as an int8 array rather than the column's native type. + * NOTE(review): the integration test in this change expects "3.0000000596046448" for this kind, which int8 input would reject; confirm the slot's element type. 
*/ -func AnyValues(any pq.StringArray, typ string) string { +func AnyValues(any pq.StringArray, typ string, kind int) string { if len(any) > 0 { + if kind == STATISTIC_KIND_NDV_BY_SEGMENTS { + return fmt.Sprintf(`array_in(%s, 'int8'::regtype::oid, -1)`, SliceToPostgresArray(any)) + } return fmt.Sprintf(`array_in(%s, '%s'::regtype::oid, -1)`, SliceToPostgresArray(any), typ) } - return fmt.Sprintf("NULL") + return "NULL" } diff --git a/backup/statistics_test.go b/backup/statistics_test.go index 5d58c8e2..4aed8b03 100644 --- a/backup/statistics_test.go +++ b/backup/statistics_test.go @@ -235,11 +235,15 @@ WHERE oid = '"""test''schema"""."""test''table"""'::regclass::oid;`)) }) Describe("AnyValues", func() { It("returns properly casted string when length of anyvalues is greater than 0", func() { - castedString := backup.AnyValues([]string{"1", "2"}, "int") + castedString := backup.AnyValues([]string{"1", "2"}, "int", 1) Expect(castedString).To(Equal(`array_in('{"1","2"}', 'int'::regtype::oid, -1)`)) }) + It("returns int8 casted string when kind is STATISTIC_KIND_NDV_BY_SEGMENTS", func() { + castedString := backup.AnyValues([]string{"2"}, "bool", backup.STATISTIC_KIND_NDV_BY_SEGMENTS) + Expect(castedString).To(Equal(`array_in('{"2"}', 'int8'::regtype::oid, -1)`)) + }) It("returns NULL if anyvalues is of length 0", func() { - castedString := backup.AnyValues([]string{}, "int") + castedString := backup.AnyValues([]string{}, "int", 1) Expect(castedString).To(Equal(`NULL`)) }) }) diff --git a/integration/statistics_queries_test.go b/integration/statistics_queries_test.go index a6932d61..b7de66e6 100644 --- a/integration/statistics_queries_test.go +++ b/integration/statistics_queries_test.go @@ -56,6 +56,39 @@ var _ = Describe("backup integration tests", func() { expectedStats5J.Collation1 = 100 expectedStats5J.Collation2 = 100 } + if connectionPool.Version.IsCBDB() && connectionPool.Version.AtLeast("2.1.0") { + // Cloudberry Database 2.1.0 introduced 
STATISTIC_KIND_NDV_BY_SEGMENTS (8). + // In this test case, due to the small data volume, this statistic is + // automatically placed into the 3rd slot (stakind3) by the analyze command. + expectedStats5I.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS + expectedStats5J.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS + expectedStats5K.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS + + // Set the operator OID for this new statistic kind + // i (int4) uses operator 97 (=) + // j (text) uses operator 664 (=) and collation 100 + // k (bool) uses operator 58 (=) + expectedStats5I.Operator3 = 97 + expectedStats5J.Operator3 = 664 + expectedStats5J.Collation3 = 100 + expectedStats5K.Operator3 = 58 + + // 4 distinct rows were inserted for 'i' (int) and 'j' (text) columns + expectedStats5I.Values3 = []string{"4"} + expectedStats5J.Values3 = []string{"4"} + + // Why is 'k' (bool) 3.0000000596046448 instead of 2? + // 1. STATISTIC_KIND_NDV_BY_SEGMENTS (8) is the SUM of local NDVs across all segments, NOT the global NDV. + // 2. Based on the hash distribution (using 'i' as distribution key), the rows map to segments like so: + // - Seg 0 gets 3 rows: (2,b,f), (3,c,t), (4,d,f). Local NDV for 'k' on Seg 0 = 2 ('f' and 't') + // - Seg 1 gets 1 row: (1,a,t). Local NDV for 'k' on Seg 1 = 1 ('t') + // - Seg 2 gets 0 rows. Local NDV for 'k' on Seg 2 = 0 + // - Sum of Local NDVs = 2 + 1 + 0 = 3 + // 3. The optimizer uses this to estimate intermediate rows generated during a two-stage aggregation (Partial Agg). + // 4. The value is stored internally as a float4 (single precision) to save space, and when retrieved, it is converted + // back to double precision (float8), giving 3.0000000596046448. NOTE(review): 3 + 2^-24 is not a float4 value; confirm the source of the tail. + expectedStats5K.Values3 = []string{"3.0000000596046448"} + } // The order in which the stavalues1 values is returned is not guaranteed to be deterministic sort.Strings(tableAttStatsI.Values1)