Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions backup/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,11 @@ import (
"github.com/lib/pq"
)

// STATISTIC_KIND_NDV_BY_SEGMENTS is the statistic kind number (8) that exists
// only in Cloudberry Database 2.1.0 and later.
const STATISTIC_KIND_NDV_BY_SEGMENTS = 8

func PrintStatisticsStatements(statisticsFile *utils.FileWithByteCount, tocfile *toc.TOC, tables []Table, attStats map[uint32][]AttributeStatistic, tupleStats map[uint32]TupleStatistic) {
for _, table := range tables {
tupleQuery := GenerateTupleStatisticsQuery(table, tupleStats[table.Oid])
Expand Down Expand Up @@ -162,11 +167,11 @@ func generateAttributeSlotsQuery7(attStat AttributeStatistic) string {
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
realValues(attStat.Numbers5),
AnyValues(attStat.Values1, attStat.Type),
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type),
AnyValues(attStat.Values5, attStat.Type))
AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
}
return attributeQuery
}
Expand Down Expand Up @@ -230,11 +235,11 @@ func generateAttributeSlotsQuery6(attStat AttributeStatistic) string {
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
realValues(attStat.Numbers5),
AnyValues(attStat.Values1, attStat.Type),
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type),
AnyValues(attStat.Values5, attStat.Type))
AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
AnyValues(attStat.Values4, attStat.Type, attStat.Kind4),
AnyValues(attStat.Values5, attStat.Type, attStat.Kind5))
}
return attributeQuery
}
Expand Down Expand Up @@ -286,10 +291,10 @@ func generateAttributeSlotsQuery4(attStat AttributeStatistic) string {
realValues(attStat.Numbers2),
realValues(attStat.Numbers3),
realValues(attStat.Numbers4),
AnyValues(attStat.Values1, attStat.Type),
AnyValues(attStat.Values2, attStat.Type),
AnyValues(attStat.Values3, attStat.Type),
AnyValues(attStat.Values4, attStat.Type))
AnyValues(attStat.Values1, attStat.Type, attStat.Kind1),
AnyValues(attStat.Values2, attStat.Type, attStat.Kind2),
AnyValues(attStat.Values3, attStat.Type, attStat.Kind3),
AnyValues(attStat.Values4, attStat.Type, attStat.Kind4))
}
return attributeQuery
}
Expand Down Expand Up @@ -317,10 +322,15 @@ func realValues(reals pq.StringArray) string {
/*
* A given type is not guaranteed to have a corresponding array type, so we need
* to use array_in() instead of casting to an array.
* STATISTIC_KIND_NDV_BY_SEGMENTS (8) is a special case which stores an array of
* int8 values rather than the column's native type.
*/
func AnyValues(any pq.StringArray, typ string) string {
func AnyValues(any pq.StringArray, typ string, kind int) string {
if len(any) > 0 {
if kind == STATISTIC_KIND_NDV_BY_SEGMENTS {
return fmt.Sprintf(`array_in(%s, 'int8'::regtype::oid, -1)`, SliceToPostgresArray(any))
}
return fmt.Sprintf(`array_in(%s, '%s'::regtype::oid, -1)`, SliceToPostgresArray(any), typ)
}
return fmt.Sprintf("NULL")
return "NULL"
}
8 changes: 6 additions & 2 deletions backup/statistics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -235,11 +235,15 @@ WHERE oid = '"""test''schema"""."""test''table"""'::regclass::oid;`))
})
// Unit specs for backup.AnyValues, covering the regular slot kinds, the
// int8-typed STATISTIC_KIND_NDV_BY_SEGMENTS slot, and the empty-slot case.
Describe("AnyValues", func() {
	It("returns properly casted string when length of anyvalues is greater than 0", func() {
		// Kind 1 is an ordinary slot kind, so the column's own type is used.
		castedString := backup.AnyValues([]string{"1", "2"}, "int", 1)
		Expect(castedString).To(Equal(`array_in('{"1","2"}', 'int'::regtype::oid, -1)`))
	})
	It("returns int8 casted string when kind is STATISTIC_KIND_NDV_BY_SEGMENTS", func() {
		// The column type ("bool") must be ignored in favour of int8 for this kind.
		castedString := backup.AnyValues([]string{"2"}, "bool", backup.STATISTIC_KIND_NDV_BY_SEGMENTS)
		Expect(castedString).To(Equal(`array_in('{"2"}', 'int8'::regtype::oid, -1)`))
	})
	It("returns NULL if anyvalues is of length 0", func() {
		castedString := backup.AnyValues([]string{}, "int", 1)
		Expect(castedString).To(Equal(`NULL`))
	})
})
Expand Down
33 changes: 33 additions & 0 deletions integration/statistics_queries_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,39 @@ var _ = Describe("backup integration tests", func() {
expectedStats5J.Collation1 = 100
expectedStats5J.Collation2 = 100
}
if connectionPool.Version.IsCBDB() && connectionPool.Version.AtLeast("2.1.0") {
// Cloudberry Database 2.1.0 introduced STATISTIC_KIND_NDV_BY_SEGMENTS (8).
// In this test case, due to the small data volume, this statistic is
// automatically placed into the 3rd slot (stakind3) by the analyze command.
expectedStats5I.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS
expectedStats5J.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS
expectedStats5K.Kind3 = backup.STATISTIC_KIND_NDV_BY_SEGMENTS

// Set the operator OID for this new statistic kind
// i (int4) uses operator 97 (=)
// j (text) uses operator 664 (=) and collation 100
// k (bool) uses operator 58 (=)
expectedStats5I.Operator3 = 97
expectedStats5J.Operator3 = 664
expectedStats5J.Collation3 = 100
expectedStats5K.Operator3 = 58

// 4 distinct rows were inserted for 'i' (int) and 'j' (text) columns
expectedStats5I.Values3 = []string{"4"}
expectedStats5J.Values3 = []string{"4"}

// Why is 'k' (bool) 3.0000000596046448 instead of 2?
// 1. STATISTIC_KIND_NDV_BY_SEGMENTS (8) is the SUM of local NDVs across all segments, NOT the global NDV.
// 2. Based on the hash distribution (using 'i' as distribution key), the rows map to segments like so:
// - Seg 0 gets 3 rows: (2,b,f), (3,c,t), (4,d,f). Local NDV for 'k' on Seg 0 = 2 ('f' and 't')
// - Seg 1 gets 1 row: (1,a,t). Local NDV for 'k' on Seg 1 = 1 ('t')
// - Seg 2 gets 0 rows. Local NDV for 'k' on Seg 2 = 0
// - Sum of Local NDVs = 2 + 1 + 0 = 3
// 3. The optimizer uses this to estimate intermediate rows generated during a two-stage aggregation (Partial Agg).
// 4. The value is stored internally as a float4 (single precision) to save space, and when retrieved,
// it is converted back to double precision (float8), resulting in the slight precision loss (3.0000000596046448).
expectedStats5K.Values3 = []string{"3.0000000596046448"}
}

// The order in which the stavalues1 values are returned is not guaranteed to be deterministic
sort.Strings(tableAttStatsI.Values1)
Expand Down