From 2e5b2fe9e8f4479380dad5a87aa9f47f14ffbcf1 Mon Sep 17 00:00:00 2001 From: Dianjin Wang Date: Thu, 5 Feb 2026 10:20:02 +0800 Subject: [PATCH 01/10] Update go-libs url to `apache/cloudberry-go-libs` (#47) * Update go-libs url to `apache/cloudberry-go-libs` * Run go mod tidy to update go.mod and go.sum file * Update cluster_test.go * Remove the nil check for `failedCommand` in GenerateOutput function. After replacing greenplum-db/go-libs with apache/cloudberry-go-libs, the `FailedCommands` field type changed from `[]*cluster.ShellCommand` (pointer slice) to `[]cluster.ShellCommand` (value slice). In Go, struct values cannot be compared to nil, which caused the build error: ``` invalid operation: failedCommand == nil (mismatched types cluster.ShellCommand and untyped nil) ``` This nil check is no longer needed because: 1. Value-type slice elements are always valid structs, never nil 2. The NewRemoteOutput function only appends commands with non-nil errors to FailedCommands, so empty iterations won't occur --- cli/README.md | 2 +- cli/cmd/cluster.go | 13 +++++-------- cli/cmd/cluster_test.go | 28 +++++++++++----------------- cli/cmd/cmd_suite_test.go | 4 ++-- cli/cmd/pxf.go | 2 +- cli/cmd/root.go | 2 +- cli/go.mod | 10 +++++----- cli/go.sum | 16 ++++++++-------- 8 files changed, 34 insertions(+), 43 deletions(-) diff --git a/cli/README.md b/cli/README.md index 39ee217ed..27d4aa765 100644 --- a/cli/README.md +++ b/cli/README.md @@ -34,7 +34,7 @@ go install github.com/go-delve/delve/cmd/dlv@latest ``` config max-string-len 1000 -break vendor/github.com/greenplum-db/gp-common-go-libs/cluster/cluster.go:351 +break vendor/github.com/apache/cloudberry-go-libs/cluster/cluster.go:351 continue print commandList ``` diff --git a/cli/cmd/cluster.go b/cli/cmd/cluster.go index 68a7b3012..ee3c6b749 100644 --- a/cli/cmd/cluster.go +++ b/cli/cmd/cluster.go @@ -6,9 +6,9 @@ import ( "os" "strings" - "github.com/greenplum-db/gp-common-go-libs/cluster" - 
"github.com/greenplum-db/gp-common-go-libs/dbconn" - "github.com/greenplum-db/gp-common-go-libs/gplog" + "github.com/apache/cloudberry-go-libs/cluster" + "github.com/apache/cloudberry-go-libs/dbconn" + "github.com/apache/cloudberry-go-libs/gplog" "github.com/spf13/cobra" "github.com/blang/semver" ) @@ -111,9 +111,6 @@ func GenerateOutput(cmd *command, clusterData *ClusterData) error { } response := "" for _, failedCommand := range clusterData.Output.FailedCommands { - if failedCommand == nil { - continue - } host := failedCommand.Host errorMessage := failedCommand.Stderr if len(errorMessage) == 0 { @@ -138,8 +135,8 @@ func doSetup() (*ClusterData, error) { connection := dbconn.NewDBConnFromEnvironment("postgres") err := connection.Connect(1) if err != nil { - gplog.Error(fmt.Sprintf("ERROR: Could not connect to GPDB.\n%s\n"+ - "Please make sure that your Greenplum database is running and you are on the coordinator node.", err.Error())) + gplog.Error(fmt.Sprintf("ERROR: Could not connect to Cloudberry.\n%s\n"+ + "Please make sure that your Apache Cloudberry is running and you are on the coordinator node.", err.Error())) return nil, err } diff --git a/cli/cmd/cluster_test.go b/cli/cmd/cluster_test.go index ee078821e..3fe4d7981 100644 --- a/cli/cmd/cluster_test.go +++ b/cli/cmd/cluster_test.go @@ -3,7 +3,7 @@ package cmd_test import ( "pxf-cli/cmd" - "github.com/greenplum-db/gp-common-go-libs/cluster" + "github.com/apache/cloudberry-go-libs/cluster" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -145,11 +145,7 @@ var _ = Describe("GenerateOutput()", func() { BeforeEach(func() { clusterData.Output = &cluster.RemoteOutput{ NumErrors: 0, - FailedCommands: []*cluster.ShellCommand{ - nil, - nil, - nil, - }, + FailedCommands: []cluster.ShellCommand{}, Commands: []cluster.ShellCommand{ { Host: "mdw", @@ -234,8 +230,8 @@ var _ = Describe("GenerateOutput()", func() { } clusterData.Output = &cluster.RemoteOutput{ NumErrors: 1, - FailedCommands: []*cluster.ShellCommand{ - &failedCommand, + FailedCommands: []cluster.ShellCommand{ + failedCommand, }, Commands: []cluster.ShellCommand{ { @@ -358,8 +354,8 @@ stderr line three` } clusterData.Output = &cluster.RemoteOutput{ NumErrors: 1, - FailedCommands: []*cluster.ShellCommand{ - &failedCommand, + FailedCommands: []cluster.ShellCommand{ + failedCommand, }, Commands: []cluster.ShellCommand{ { @@ -393,8 +389,8 @@ stderr line three` } clusterData.Output = &cluster.RemoteOutput{ NumErrors: 1, - FailedCommands: []*cluster.ShellCommand{ - &failedCommand, + FailedCommands: []cluster.ShellCommand{ + failedCommand, }, Commands: []cluster.ShellCommand{ { @@ -422,9 +418,7 @@ stderr line three` BeforeEach(func() { clusterDataWithOneHost.Output = &cluster.RemoteOutput{ NumErrors: 0, - FailedCommands: []*cluster.ShellCommand{ - nil, - }, + FailedCommands: []cluster.ShellCommand{}, Commands: []cluster.ShellCommand{ { Host: "mdw", @@ -496,8 +490,8 @@ stderr line three` } clusterDataWithOneHost.Output = &cluster.RemoteOutput{ NumErrors: 1, - FailedCommands: []*cluster.ShellCommand{ - &failedCommand, + FailedCommands: []cluster.ShellCommand{ + failedCommand, }, Commands: []cluster.ShellCommand{ failedCommand, diff --git a/cli/cmd/cmd_suite_test.go b/cli/cmd/cmd_suite_test.go index 191728d9c..37cae08c2 100644 --- a/cli/cmd/cmd_suite_test.go +++ b/cli/cmd/cmd_suite_test.go @@ -4,8 +4,8 @@ import ( "os/user" "testing" - "github.com/greenplum-db/gp-common-go-libs/operating" - 
"github.com/greenplum-db/gp-common-go-libs/testhelper" + "github.com/apache/cloudberry-go-libs/operating" + "github.com/apache/cloudberry-go-libs/testhelper" "github.com/onsi/gomega/gbytes" . "github.com/onsi/ginkgo/v2" diff --git a/cli/cmd/pxf.go b/cli/cmd/pxf.go index e7850ed7a..88e21ec9c 100644 --- a/cli/cmd/pxf.go +++ b/cli/cmd/pxf.go @@ -8,7 +8,7 @@ import ( "os" "strings" - "github.com/greenplum-db/gp-common-go-libs/cluster" + "github.com/apache/cloudberry-go-libs/cluster" ) type envVar string diff --git a/cli/cmd/root.go b/cli/cmd/root.go index 7117789ee..dc47bb8c3 100644 --- a/cli/cmd/root.go +++ b/cli/cmd/root.go @@ -3,7 +3,7 @@ package cmd import ( "os" - "github.com/greenplum-db/gp-common-go-libs/gplog" + "github.com/apache/cloudberry-go-libs/gplog" "github.com/spf13/cobra" ) diff --git a/cli/go.mod b/cli/go.mod index 6be152701..e05e448f4 100644 --- a/cli/go.mod +++ b/cli/go.mod @@ -3,7 +3,8 @@ module pxf-cli go 1.21.3 require ( - github.com/greenplum-db/gp-common-go-libs v1.0.16 + github.com/apache/cloudberry-go-libs v1.0.12-0.20250910014224-fc376e8a1056 + github.com/blang/semver v3.5.1+incompatible github.com/onsi/ginkgo/v2 v2.13.0 github.com/onsi/gomega v1.28.0 github.com/pkg/errors v0.9.1 @@ -12,7 +13,6 @@ require ( require ( github.com/DATA-DOG/go-sqlmock v1.5.0 // indirect - github.com/blang/semver v3.5.1+incompatible // indirect github.com/go-logr/logr v1.2.4 // indirect github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect github.com/google/go-cmp v0.6.0 // indirect @@ -28,9 +28,9 @@ require ( github.com/jackc/pgx/v4 v4.18.2 // indirect github.com/jmoiron/sqlx v1.3.5 // indirect github.com/spf13/pflag v1.0.3 // indirect - golang.org/x/crypto v0.20.0 // indirect - golang.org/x/net v0.21.0 // indirect - golang.org/x/sys v0.17.0 // indirect + golang.org/x/crypto v0.21.0 // indirect + golang.org/x/net v0.23.0 // indirect + golang.org/x/sys v0.18.0 // indirect golang.org/x/text v0.14.0 // indirect golang.org/x/tools v0.12.0 // 
indirect gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/cli/go.sum b/cli/go.sum index 0e25b4f9c..3bd88f834 100644 --- a/cli/go.sum +++ b/cli/go.sum @@ -7,6 +7,8 @@ github.com/Masterminds/semver/v3 v3.1.1/go.mod h1:VPu/7SZ7ePZ3QOrcuXROw5FAcLl4a0 github.com/OneOfOne/xxhash v1.2.2/go.mod h1:HSdplMjZKSmBqAxg5vPj2TmRDmfkzw+cTzAElWljhcU= github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc= github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= +github.com/apache/cloudberry-go-libs v1.0.12-0.20250910014224-fc376e8a1056 h1:ycrFztmYATpidbSAU1rw60XuhuDxgBHtLD3Sueu947c= +github.com/apache/cloudberry-go-libs v1.0.12-0.20250910014224-fc376e8a1056/go.mod h1:lfHWkNYsno/lV+Nee0OoCmlOlBz5yvT6EW8WQEOUI5c= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= github.com/beorn7/perks v1.0.0/go.mod h1:KWe93zE9D1o94FZ5RNwFwVgaQK1VOXiVxmqh+CedLV8= @@ -66,8 +68,6 @@ github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ= -github.com/greenplum-db/gp-common-go-libs v1.0.16 h1:3YcbbSHZ5CEDesRXbSD08BDHcr88xwu73GYWmv5wXsw= -github.com/greenplum-db/gp-common-go-libs v1.0.16/go.mod h1:3vYQDev2Dke3W16fLYrApd/isXoi/lHspdbsqOJqRx0= github.com/grpc-ecosystem/go-grpc-middleware v1.0.0/go.mod h1:FiyG127CGDf3tlThmgyCl78X/SZQqEOJBCDaAfeWzPs= github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0/go.mod h1:8NvIoxWQoOIhqOTXgfV/d3M/q6VIi02HzZEHgUlZvzk= github.com/grpc-ecosystem/grpc-gateway 
v1.9.0/go.mod h1:vNeuVxBJEsws4ogUvrchl83t/GYV9WGTSLVdBhOQFDY= @@ -238,8 +238,8 @@ golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20201203163018-be400aefbc4c/go.mod h1:jdWPYTVW3xRLrWPugEBEK3UY2ZEsg3UU495nc5E+M+I= golang.org/x/crypto v0.0.0-20210616213533-5ff15b29337e/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.20.0 h1:jmAMJJZXr5KiCw05dfYK9QnqaqKLYXijU23lsEdcQqg= -golang.org/x/crypto v0.20.0/go.mod h1:Xwo95rrVNIoSMx9wa1JroENMToLWn3RNVrTBpLHgZPQ= +golang.org/x/crypto v0.21.0 h1:X31++rzVUdKhX5sWmSOFZxx8UW/ldWx55cbf08iNAMA= +golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= @@ -256,8 +256,8 @@ golang.org/x/net v0.0.0-20190522155817-f3200d17e092/go.mod h1:HSz+uSET+XFnRR8LxR golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20190813141303-74dc4d7220e7/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= -golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= -golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/sync 
v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -279,8 +279,8 @@ golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= -golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4= +golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201117132131-f5c789dd3221/go.mod h1:Nr5EML6q2oocZ2LXRh80K7BxOlk5/8JxuGnuhpl+muw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= From 54e342626e1e938723b4405e181e72ebfbba280d Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Thu, 5 Feb 2026 09:03:41 +0500 Subject: [PATCH 02/10] Upgrade Apache ORC library in PXF * Bump Apache `ORC` library to 1.7.11 In theory, we can bump up to 1.9.8. However pxf-hbase tests fail with new protobuf-3.x. For now, let's move fast rather than struggle in dependency hell. * Bump `ORC`'s dependencies to support `zstd` compression and tests for compression. Surprisingly, according to documentation, `zstd` should work well. However, old `aircompressor:0.8` doesn't have the ZStd codec, and attempting to use it will lead to an error. 
--- .../automation/features/orc/OrcReadTest.java | 18 ------------------ server/build.gradle | 2 +- server/gradle.properties | 4 ++-- .../hdfs/orc/ORCVectorizedAccessorTest.java | 18 ++++++++++++++++++ 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/orc/OrcReadTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/orc/OrcReadTest.java index 68e86d00c..c462aeb0e 100644 --- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/orc/OrcReadTest.java +++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/orc/OrcReadTest.java @@ -157,30 +157,12 @@ public void orcReadMultiDimensionalLists() throws Exception { runSqlTest("features/orc/read/multidim_list_types"); } - /* - * FDW fails for the data that contain a NUL-byte (i.e. '\/u000'"). This behaviour is different from external-table but same as GPDB Heap - * FDW Failure: invalid byte sequence for encoding "UTF8": 0x00 - * - * GPDB also throws the same error when copying the data containing a NUL-byte - * - * postgres=# copy test from '/Users/pandeyhi/Documents/bad_data.txt' ; - * ERROR: invalid byte sequence for encoding "UTF8": 0x00 - * TODO Do we need to do some changes to make sure the external-table behaves the same way as GPDB/FDW? 
- * - */ - @FailsWithFDW @Test(groups = {"features", "gpdb", "security", "hcfs"}) public void orcReadStringsContainingNullByte() throws Exception { prepareReadableExternalTable("pxf_orc_null_in_string", ORC_NULL_IN_STRING_COLUMNS, hdfsPath + ORC_NULL_IN_STRING); runSqlTest("features/orc/read/null_in_string"); } - // @Test(groups = {"features", "gpdb", "security", "hcfs"}) - // public void orcReadStringsContainingNullByte() throws Exception { - // prepareReadableExternalTable("pxf_orc_null_in_string", ORC_NULL_IN_STRING_COLUMNS, hdfsPath + ORC_NULL_IN_STRING); - // runTincTest("pxf.features.orc.read.null_in_string.runTest"); - // } - private void prepareReadableExternalTable(String name, String[] fields, String path) throws Exception { prepareReadableExternalTable(name, fields, path, false); } diff --git a/server/build.gradle b/server/build.gradle index cafaec6b2..499a0b721 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -91,7 +91,7 @@ configure(javaProjects) { dependency("commons-io:commons-io:2.7") dependency("commons-lang:commons-lang:2.6") dependency("commons-logging:commons-logging:1.1.3") - dependency("io.airlift:aircompressor:0.8") + dependency("io.airlift:aircompressor:0.27") dependency("javax.jdo:jdo-api:3.0.1") dependency("joda-time:joda-time:2.8.1") dependency("net.sf.opencsv:opencsv:2.3") diff --git a/server/gradle.properties b/server/gradle.properties index 772b69c43..eb6191df4 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -20,7 +20,7 @@ apiVersion=0 license=ASL 2.0 hadoopVersion=2.10.2 hiveVersion=2.3.8 -hiveStorageApiVersion=2.7.2 +hiveStorageApiVersion=2.7.3 hbaseVersion=1.3.2 junitVersion=4.11 parquetVersion=1.11.1 @@ -28,4 +28,4 @@ awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true org.gradle.parallel=false -orcVersion=1.6.13 +orcVersion=1.7.11 diff --git a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/orc/ORCVectorizedAccessorTest.java 
b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/orc/ORCVectorizedAccessorTest.java index 6890e6aef..0fcf06252 100644 --- a/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/orc/ORCVectorizedAccessorTest.java +++ b/server/pxf-hdfs/src/test/java/org/apache/cloudberry/pxf/plugins/hdfs/orc/ORCVectorizedAccessorTest.java @@ -148,6 +148,24 @@ public void testOpenForWrite_SnappyCompression() throws IOException { runTestScenario_OpenForWrite(CompressionKind.SNAPPY, true); } + @Test + public void testOpenForWrite_ZstdCompression() throws IOException { + context.addOption("COMPRESSION_CODEC", "zstd"); + runTestScenario_OpenForWrite(CompressionKind.ZSTD, true); + } + + @Test + public void testOpenForWrite_Lz4Compression() throws IOException { + context.addOption("COMPRESSION_CODEC", "lz4"); + runTestScenario_OpenForWrite(CompressionKind.LZ4, true); + } + + @Test + public void testOpenForWrite_LZOCompression() throws IOException { + context.addOption("COMPRESSION_CODEC", "lzo"); + runTestScenario_OpenForWrite(CompressionKind.LZO, true); + } + @Test public void testOpenForWrite_OrcWriteTimeZoneUTCMissing() throws IOException { runTestScenario_OpenForWrite(CompressionKind.ZLIB, true); From 44b9dad2c247edf9f8f7557156abf272450842ae Mon Sep 17 00:00:00 2001 From: Ed Espino Date: Thu, 29 Aug 2024 00:41:27 -0700 Subject: [PATCH 03/10] Release support Cherry-pick from 609435fa28ffeb2f659c6779c3ed90f150a5cb44 --- Makefile | 18 +++-- package/cloudberry-pxf.spec | 135 ++++++++++++++++++++++++++++++++++++ version | 2 +- 3 files changed, 150 insertions(+), 5 deletions(-) create mode 100644 package/cloudberry-pxf.spec diff --git a/Makefile b/Makefile index a9d5d0848..6d1430382 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,8 @@ SOURCE_EXTENSION_DIR = external-table TARGET_EXTENSION_DIR = gpextable LICENSE ?= ASL 2.0 -VENDOR ?= Open Source +VENDOR ?= Cloudberry Open Source +RELEASE ?= 1 default: all @@ -122,8 +123,17 @@ rpm: make -C cli stage make 
-C server stage set -e ;\ - PXF_MAIN_VERSION=$${PXF_VERSION//-SNAPSHOT/} ;\ - if [[ $${PXF_VERSION} == *"-SNAPSHOT" ]]; then PXF_RELEASE=SNAPSHOT; else PXF_RELEASE=1; fi ;\ + GP_MAJOR_VERSION=$$(cat $(SOURCE_EXTENSION_DIR)/build/metadata/gp_major_version) ;\ + PXF_FULL_VERSION=$${PXF_VERSION} ;\ + PXF_MAIN_VERSION=$$(echo $${PXF_FULL_VERSION} | sed -E 's/(-SNAPSHOT|-rc[0-9]+)$$//') ;\ + if [[ $${PXF_FULL_VERSION} == *"-SNAPSHOT" ]]; then \ + PXF_RELEASE=SNAPSHOT; \ + elif [[ $${PXF_FULL_VERSION} =~ -rc([0-9]+)$$ ]]; then \ + PXF_RELEASE="rc$${BASH_REMATCH[1]}"; \ + else \ + PXF_RELEASE=1; \ + fi ;\ + rm -rf build/rpmbuild ;\ mkdir -p build/rpmbuild/{BUILD,RPMS,SOURCES,SPECS} ;\ cp -a build/stage/$${PXF_PACKAGE_NAME}/pxf/* build/rpmbuild/SOURCES ;\ cp package/*.spec build/rpmbuild/SPECS/ ;\ @@ -133,7 +143,7 @@ rpm: --define "pxf_release $${PXF_RELEASE}" \ --define "license ${LICENSE}" \ --define "vendor ${VENDOR}" \ - -bb $${PWD}/build/rpmbuild/SPECS/pxf-cbdb$${GP_MAJOR_VERSION}.spec + -bb $${PWD}/build/rpmbuild/SPECS/cloudberry-pxf.spec rpm-tar: rpm rm -rf build/{stagerpm,distrpm} diff --git a/package/cloudberry-pxf.spec b/package/cloudberry-pxf.spec new file mode 100644 index 000000000..95542b684 --- /dev/null +++ b/package/cloudberry-pxf.spec @@ -0,0 +1,135 @@ +# Disable repacking of jars, since it takes forever +%define __jar_repack %{nil} + +# Disable build-id in rpm +%define _build_id_links none +# Disable automatic dependency processing both for requirements and provides +AutoReqProv: no + +Name: cloudberry-pxf +Version: %{pxf_version} +Release: %{pxf_release}%{?dist} + +Summary: Cloudberry PXF (Platform Extension Framework) for advanced data access +License: %{license} +URL: https://cloudberrydb.org +Vendor: %{vendor} +Group: Applications/Databases + +Prefix: /usr/local/%{name}-%{version} + +# Java server can be installed on a new node, only bash is needed for +# management scripts + +Requires: bash + +# Require Cloudberry Database - .so file makes sense 
only when +# installing on Cloudberry node, so inherit Cloudberry's dependencies +# implicitly + +Requires: cloudberry-db + +# Weak dependencies either OpenJDK 8 or 11 +Suggests: java-1.8.0-openjdk +Suggests: java-11-openjdk + +%description +Cloudberry PXF (Platform Extension Framework) is an advanced data +access framework that provides connectivity to a wide range of data +sources. It enables high-speed, parallel data access across +distributed systems, making it an essential component for performing +advanced analytics with the Cloudberry Database. PXF seamlessly +integrates and efficiently queries external data sources, including, +but not limited to: + +- HDFS files +- Hive tables +- HBase tables +- Databases that support JDBC +- Cloud-based data stores such as Amazon S3 and Google Cloud Storage (GCS) + +Supported file formats include, but are not limited to: + +- Text files (e.g., CSV, TSV) +- Sequence files +- Avro files +- Parquet files +- ORC files +- RCFile (Record Columnar File) +- JSON files +- Avro Object Container Files + +Whether accessing structured, semi-structured, or unstructured data, +PXF ensures that users can efficiently interact with a diverse set of +data environments and file formats. The examples provided above +represent only a subset of the broad range of sources and formats +supported by PXF. + +For more information, visit the official Cloudberry Database website +at https://cloudberrydb.org. + +%prep +# If the pxf_version macro is not defined, it gets interpreted as a literal string, need %% to escape it +if [ %{pxf_version} = '%%{pxf_version}' ] ; then + echo "The macro (variable) pxf_version must be supplied as rpmbuild ... 
--define='pxf_version [VERSION]'" + exit 1 +fi + +%install +%__mkdir -p %{buildroot}/%{prefix} +%__cp -R %{_sourcedir}/* %{buildroot}/%{prefix} + +# Create symlink +%__ln_s %{prefix} %{buildroot}/usr/local/%{name} + +%post +sed -i "s|directory =.*|directory = '${RPM_INSTALL_PREFIX}/gpextable/'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" +sed -i "s|module_pathname =.*|module_pathname = '${RPM_INSTALL_PREFIX}/gpextable/pxf'|g" "${RPM_INSTALL_PREFIX}/gpextable/pxf.control" + +# Change ownership to gpadmin.gpadmin if the gpadmin user exists +if id "gpadmin" &>/dev/null; then + chown -R gpadmin:gpadmin ${RPM_INSTALL_PREFIX} +fi + +%files +%{prefix} +/usr/local/%{name} + +# If a file is not marked as a config file, or if a file has not been altered +# since installation, then it will be silently replaced by the version from the +# RPM. + +# If a config file has been edited on disk, but is not actually different from +# the file in the RPM then the edited version will be silently left in place. + +# When a config file has been edited and is different from the file in +# the RPM, then the behavior is the following: +# - %config(noreplace): The edited version will be left in place, and the new +# version will be installed with an .rpmnew suffix. +# - %config: The new file will be installed, and the old edited version +# will be renamed with an .rpmsave suffix. + +# Configuration directories/files +%config(noreplace) %{prefix}/conf/pxf-application.properties +%config(noreplace) %{prefix}/conf/pxf-env.sh +%config(noreplace) %{prefix}/conf/pxf-log4j2.xml +%config(noreplace) %{prefix}/conf/pxf-profiles.xml + +%pre +# cleanup files and directories created by 'pxf init' command +# only applies for old installations (pre 6.0.0) +%__rm -f "${RPM_INSTALL_PREFIX}/conf/pxf-private.classpath" +%__rm -rf "${RPM_INSTALL_PREFIX}/pxf-service" + +%posttrans +# PXF v5 RPM installation removes the run directory during the %preun step.
+# The lack of run directory prevents PXF v6+ from starting up. +# %posttrans of the new package is the only step that runs after the %preun +# of the old package +%{__install} -d -m 700 "${RPM_INSTALL_PREFIX}/run" + +%preun +# Remove symlink on uninstall +if [ $1 -eq 0 ] ; then + %__rm -f /usr/local/%{name} +fi diff --git a/version b/version index 3eefcb9dd..dc1e644a1 100644 --- a/version +++ b/version @@ -1 +1 @@ -1.0.0 +1.6.0 From 88405c86a0513a593b007fb13cbe4bd3e75a9b2d Mon Sep 17 00:00:00 2001 From: Dianjin Wang Date: Wed, 4 Feb 2026 14:28:50 +0800 Subject: [PATCH 04/10] Update Makefile and cloudberry-pxf for Apache Cloudberry --- Makefile | 2 +- package/cloudberry-pxf.spec | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Makefile b/Makefile index 6d1430382..caaac20ed 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ SOURCE_EXTENSION_DIR = external-table TARGET_EXTENSION_DIR = gpextable LICENSE ?= ASL 2.0 -VENDOR ?= Cloudberry Open Source +VENDOR ?= Apache Cloudberry (Incubating) RELEASE ?= 1 default: all diff --git a/package/cloudberry-pxf.spec b/package/cloudberry-pxf.spec index 95542b684..44a144e6e 100644 --- a/package/cloudberry-pxf.spec +++ b/package/cloudberry-pxf.spec @@ -10,9 +10,9 @@ Name: cloudberry-pxf Version: %{pxf_version} Release: %{pxf_release}%{?dist} -Summary: Cloudberry PXF (Platform Extension Framework) for advanced data access +Summary: Apache Cloudberry PXF (Platform Extension Framework) for advanced data access License: %{license} -URL: https://cloudberrydb.org +URL: https://cloudberry.apache.org Vendor: %{vendor} Group: Applications/Databases @@ -23,7 +23,7 @@ Prefix: /usr/local/%{name}-%{version} Requires: bash -# Require Cloudberry Database - .so file makes sense only when +# Require Apache Cloudberry - .so file makes sense only when # installing on Cloudberry node, so inherit Cloudberry's dependencies # implicitly @@ -34,11 +34,11 @@ Suggests: java-1.8.0-openjdk Suggests: java-11-openjdk 
%description -Cloudberry PXF (Platform Extension Framework) is an advanced data +Apache Cloudberry PXF (Platform Extension Framework) is an advanced data access framework that provides connectivity to a wide range of data sources. It enables high-speed, parallel data access across distributed systems, making it an essential component for performing -advanced analytics with the Cloudberry Database. PXF seamlessly +advanced analytics with the Apache Cloudberry. PXF seamlessly integrates and efficiently queries external data sources, including, but not limited to: @@ -65,8 +65,8 @@ data environments and file formats. The examples provided above represent only a subset of the broad range of sources and formats supported by PXF. -For more information, visit the official Cloudberry Database website -at https://cloudberrydb.org. +For more information, visit the official Apache Cloudberry website +at https://cloudberry.apache.org. %prep # If the pxf_version macro is not defined, it gets interpreted as a literal string, need %% to escape it From 29cb92d1f99e9d1d04f5342c3e08e289265cf0e3 Mon Sep 17 00:00:00 2001 From: Dianjin Wang Date: Thu, 5 Feb 2026 15:13:43 +0800 Subject: [PATCH 05/10] Remove dev files from source dir * Remove obsoleted files under `dev` dir * Move `dev/start_minio.bash` to `ci/docker/pxf-cbdb-dev/ubuntu/script/start_minio.bash` See: https://github.com/apache/cloudberry-pxf/issues/48 --- README.md | 52 ++--- .../pxf-cbdb-dev/ubuntu/script/entrypoint.sh | 2 +- .../ubuntu/script/entrypoint_kerberos.sh | 2 +- .../ubuntu/script}/start_minio.bash | 0 dev/.gitignore | 1 - dev/README.md | 17 -- dev/bootstrap.bash | 42 ---- dev/build_and_install_gpdb.bash | 6 - dev/build_gpdb.bash | 13 -- dev/configure_singlecluster.bash | 189 ----------------- dev/init_greenplum.bash | 41 ---- dev/init_hadoop.bash | 198 ------------------ dev/install_gpdb.bash | 5 - dev/install_greenplum.bash | 40 ---- dev/install_pxf.bash | 25 --- dev/smoke_shortcut.sh | 39 ---- dev/start.bash 
| 20 -- 17 files changed, 22 insertions(+), 670 deletions(-) rename {dev => ci/docker/pxf-cbdb-dev/ubuntu/script}/start_minio.bash (100%) delete mode 100644 dev/.gitignore delete mode 100644 dev/README.md delete mode 100755 dev/bootstrap.bash delete mode 100755 dev/build_and_install_gpdb.bash delete mode 100755 dev/build_gpdb.bash delete mode 100755 dev/configure_singlecluster.bash delete mode 100755 dev/init_greenplum.bash delete mode 100755 dev/init_hadoop.bash delete mode 100755 dev/install_gpdb.bash delete mode 100755 dev/install_greenplum.bash delete mode 100755 dev/install_pxf.bash delete mode 100755 dev/smoke_shortcut.sh delete mode 100755 dev/start.bash diff --git a/README.md b/README.md index 03489d1c7..26bbc1c4c 100755 --- a/README.md +++ b/README.md @@ -51,7 +51,8 @@ To build PXF, you must have: Assuming you have installed Cloudberry into `/usr/local/cloudberrydb` directory, run its environment script: ``` - source /usr/local/cloudberrydb/greenplum_path.sh + source /usr/local/cloudberrydb/greenplum_path.sh # For Cloudberry 2.0 + source /usr/local/cloudberrydb/cloudberry-env.sh # For Cloudberry 2.1+ ``` 3. JDK 1.8 or JDK 11 to compile/run @@ -171,34 +172,25 @@ cp ${PXF_HOME}/templates/*-site.xml ${PXF_BASE}/servers/default > [!Note] > Since the docker container will house all Single cluster Hadoop, Cloudberry and PXF, we recommend that you have at least 4 cpus and 6GB memory allocated to Docker. These settings are available under docker preferences. -The quick and easy is to download the Cloudberry RPM from GitHub and move it into the `/downloads` folder. Then run `./dev/start.bash` to get a docker image with a running Cloudberry, Hadoop cluster and an installed PXF. +We provide a Docker-based development environment that includes Cloudberry, Hadoop, and PXF. See [automation/README.Docker.md](automation/README.Docker.md) for detailed instructions. -#### Setup Cloudberry in the Docker image - -Configure, build and install Cloudberry. 
This will be needed only when you use the container for the first time with Cloudberry source. +**Quick Start:** ```bash -~/workspace/pxf/dev/build_gpdb.bash -sudo mkdir /usr/local/cloudberry-db-devel -sudo chown gpadmin:gpadmin /usr/local/cloudberry-db-devel -~/workspace/pxf/dev/install_gpdb.bash -``` +# Build and start the development container +docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml build +docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml up -d -For subsequent minor changes to Cloudberry source you can simply do the following: -```bash -~/workspace/pxf/dev/install_gpdb.bash -``` +# Enter the container and run setup +docker exec -it pxf-cbdb-dev bash -c \ + "cd /home/gpadmin/workspace/cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu && ./script/entrypoint.sh" -Run all the instructions below and run GROUP=smoke (in one script): -```bash -~/workspace/pxf/dev/smoke_shortcut.sh -``` +# Run tests +docker exec -it pxf-cbdb-dev bash -c \ + "cd /home/gpadmin/workspace/cloudberry-pxf/ci/docker/pxf-cbdb-dev/ubuntu && ./script/run_tests.sh" -Create Cloudberry Cluster -```bash -source /usr/local/cloudberrydb-db-devel/greenplum_path.sh -make -C ~/workspace/cbdb create-demo-cluster -source ~/workspace/cbdb/gpAux/gpdemo/gpdemo-env.sh +# Stop and clean up +docker compose -f ci/docker/pxf-cbdb-dev/ubuntu/docker-compose.yml down -v ``` #### Setup Hadoop @@ -206,9 +198,7 @@ Hdfs will be needed to demonstrate functionality. You can choose to start additi Setup [User Impersonation](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Superusers.html) prior to starting the hadoop components (this allows the `gpadmin` user to access hadoop data). -```bash -~/workspace/pxf/dev/configure_singlecluster.bash -``` +The Docker development environment automatically configures Hadoop. For manual setup, see [automation/README.Docker.md](automation/README.Docker.md). 
Setup and start HDFS ```bash @@ -233,13 +223,11 @@ popd ``` #### Setup Minio (optional) -Minio is an S3-API compatible local storage solution. The development docker image comes with Minio software pre-installed. To start the Minio server, run the following script: -```bash -source ~/workspace/pxf/dev/start_minio.bash -``` +Minio is an S3-API compatible local storage solution. The development docker image comes with Minio software pre-installed. The Minio server is started automatically by the Docker development environment. + After the server starts, you can access Minio UI at `http://localhost:9000` from the host OS. Use `admin` for the access key and `password` for the secret key when connecting to your local Minio instance. -The script also sets `PROTOCOL=minio` so that the automation framework will use the local Minio server when running S3 automation tests. If later you would like to run Hadoop HDFS tests, unset this variable with `unset PROTOCOL` command. +To run S3 automation tests, set `PROTOCOL=minio`. If later you would like to run Hadoop HDFS tests, unset this variable with the `unset PROTOCOL` command. #### Setup PXF @@ -330,7 +318,7 @@ no JDK set for Gradle. Just cancel and retry. It goes away the second time. - Download bin_gpdb (from any of the pipelines) - Download pxf_tarball (from any of the pipelines) -These instructions allow you to run a Kerberized cluster +These instructions allow you to run a Kerberized cluster. See [automation/README.Docker.md](automation/README.Docker.md) for detailed Kerberos setup instructions.
```bash docker run --rm -it \ diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint.sh b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint.sh index ed7396d83..80cbb35a8 100755 --- a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint.sh +++ b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint.sh @@ -468,7 +468,7 @@ start_hive_services() { deploy_minio() { log "deploying MinIO" - bash "${REPO_DIR}/dev/start_minio.bash" + bash "${PXF_SCRIPTS}/start_minio.bash" } main() { diff --git a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh index 8469fc333..f64fabeea 100755 --- a/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh +++ b/ci/docker/pxf-cbdb-dev/ubuntu/script/entrypoint_kerberos.sh @@ -293,7 +293,7 @@ setup_ssl_material() { deploy_minio() { log "deploying MinIO (for S3 tests)" - bash "${REPO_ROOT}/dev/start_minio.bash" + bash "${PXF_SCRIPTS}/start_minio.bash" } configure_pxf_s3() { diff --git a/dev/start_minio.bash b/ci/docker/pxf-cbdb-dev/ubuntu/script/start_minio.bash similarity index 100% rename from dev/start_minio.bash rename to ci/docker/pxf-cbdb-dev/ubuntu/script/start_minio.bash diff --git a/dev/.gitignore b/dev/.gitignore deleted file mode 100644 index eb2194299..000000000 --- a/dev/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dataproc_env_files/ diff --git a/dev/README.md b/dev/README.md deleted file mode 100644 index e3aa2a6dd..000000000 --- a/dev/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# Profiling - -### Visual VM Profiling - -To perform memory profiling add the following line to PXF's environment settings (`pxf/conf/pxf-env.sh`) on the machine where we want to debug: - -``` -export CATALINA_OPTS="-Dcom.sun.management.jmxremote=true -Dcom.sun.management.jmxremote.rmi.port=9090 -Dcom.sun.management.jmxremote.port=9090 -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.local.only=false 
-Djava.rmi.server.hostname=127.0.0.1" -``` - -### JProfiler - -To perform memory profiling in JProfiler add the following setting to your `PXF_JVM_OPTS`: - -``` -export PXF_JVM_OPTS="-Xmx2g -Xms1g -agentpath:/Applications/JProfiler.app/Contents/Resources/app/bin/macos/libjprofilerti.jnilib=port=8849" -``` diff --git a/dev/bootstrap.bash b/dev/bootstrap.bash deleted file mode 100755 index a4e10a279..000000000 --- a/dev/bootstrap.bash +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -# setup environment for gpadmin -#export PS1="[\u@\h \W]\$ " -#export HADOOP_ROOT=~/workspace/singlecluster -#export PXF_JVM_OPTS="-Xmx512m -Xms256m" -#export BUILD_PARAMS="-x test" - -export JAVA_HOME=/etc/alternatives/java_sdk - -# install and init Greenplum as gpadmin user -su - gpadmin -c ${SCRIPT_DIR}/install_greenplum.bash - -# now GPHOME should be discoverable by .pxfrc -source ~gpadmin/.pxfrc -chown -R gpadmin:gpadmin ${GPHOME} - -# remove existing PXF, if any, that could come pre-installed with Greenplum RPM -if [[ -d ${GPHOME}/pxf ]]; then - echo; echo "=====> Removing PXF installed with GPDB <====="; echo - rm -rf ${GPHOME}/pxf - rm ${GPHOME}/lib/postgresql/pxf.so - rm ${GPHOME}/share/postgresql/extension/pxf.control - rm ${GPHOME}/share/postgresql/extension/pxf*.sql -fi - -# prepare PXF_HOME for PXF installation -mkdir -p ${PXF_HOME} -chown -R gpadmin:gpadmin ${PXF_HOME} - -# configure and start Hadoop single cluster -chmod a+w /singlecluster -SLAVES=1 ${SCRIPT_DIR}/init_hadoop.bash - -su - gpadmin -c " - source ~/.pxfrc && - env && - ${SCRIPT_DIR}/init_greenplum.bash && - ${SCRIPT_DIR}/install_pxf.bash - " diff --git a/dev/build_and_install_gpdb.bash b/dev/build_and_install_gpdb.bash deleted file mode 100755 index 76d6cdfc8..000000000 --- a/dev/build_and_install_gpdb.bash +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash - -CWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -${CWDIR}/build_gpdb.bash 
-${CWDIR}/install_gpdb.bash diff --git a/dev/build_gpdb.bash b/dev/build_gpdb.bash deleted file mode 100755 index 6f299b149..000000000 --- a/dev/build_gpdb.bash +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -pushd ~/workspace/gpdb -make clean -./configure \ - --enable-debug \ - --with-perl \ - --with-python \ - --with-libxml \ - --disable-orca \ - --prefix=/usr/local/greenplum-db-devel -make -j8 -popd \ No newline at end of file diff --git a/dev/configure_singlecluster.bash b/dev/configure_singlecluster.bash deleted file mode 100755 index 9c7e72751..000000000 --- a/dev/configure_singlecluster.bash +++ /dev/null @@ -1,189 +0,0 @@ -#!/bin/bash - ->~/workspace/singlecluster/hadoop/etc/hadoop/core-site.xml cat < - - - - - - - - fs.defaultFS - hdfs://0.0.0.0:8020 - - - ipc.ping.interval - 900000 - - - hadoop.proxyuser.gpadmin.hosts - * - - - hadoop.proxyuser.gpadmin.groups - * - - - hadoop.security.authorization - true - - - hbase.security.authorization - true - - - hbase.rpc.protection - authentication - - - hbase.coprocessor.master.classes - org.apache.hadoop.hbase.security.access.AccessController - - - hbase.coprocessor.region.classes - org.apache.hadoop.hbase.security.access.AccessController,org.apache.hadoop.hbase.security.access.SecureBulkLoadEndpoint - - - hbase.coprocessor.regionserver.classes - org.apache.hadoop.hbase.security.access.AccessController - - -EOF - ->~/workspace/singlecluster/hbase/conf/hbase-site.xml cat < - - - - - hbase.rootdir - hdfs://0.0.0.0:8020/hbase - - - dfs.replication - 3 - - - dfs.support.append - true - - - hbase.cluster.distributed - true - - - hbase.zookeeper.quorum - 127.0.0.1 - - - hbase.zookeeper.property.clientPort - 2181 - - - hadoop.proxyuser.gpadmin.hosts - * - - - hadoop.proxyuser.gpadmin.groups - * - - - hadoop.security.authorization - true - - - hbase.security.authorization - true - - - hbase.rpc.protection - authentication - - - hbase.coprocessor.master.classes - org.apache.hadoop.hbase.security.access.AccessController - 
- - hbase.coprocessor.region.classes - org.apache.hadoop.hbase.security.access.AccessController,org.apache.hadoop.hbase.security.access.SecureBulkLoadEndpoint - - - hbase.coprocessor.regionserver.classes - org.apache.hadoop.hbase.security.access.AccessController - - -EOF - ->~/workspace/singlecluster/hive/conf/hive-site.xml cat < - - hive.metastore.warehouse.dir - /hive/warehouse - - - hive.metastore.uris - thrift://localhost:9083 - - - hive.server2.enable.impersonation - true - Set this property to enable impersonation in Hive Server 2 - - - hive.server2.enable.doAs - false - Set this property to enable impersonation in Hive Server 2 - - - hive.execution.engine - mr - Chooses execution engine. Options are: mr(default), tez, or spark - - - hive.metastore.schema.verification - false - Modify schema instead of reporting error - - - datanucleus.autoCreateTables - True - - - hive.metastore.integral.jdo.pushdown - True - - -EOF diff --git a/dev/init_greenplum.bash b/dev/init_greenplum.bash deleted file mode 100755 index abffc34fc..000000000 --- a/dev/init_greenplum.bash +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -GPHOME=${GPHOME:=/usr/local/greenplum-db} -PYTHONHOME='' source "${GPHOME}/greenplum_path.sh" - -# Create config and data dirs. -data_dirs=(~gpadmin/data{1..3}/primary) -dirs=(~gpadmin/{gpconfigs,data/master} "${data_dirs[@]}") -mkdir -p "${dirs[@]}" - -sed -e "s/MASTER_HOSTNAME=mdw/MASTER_HOSTNAME=\$(hostname -f)/g" \ - -e "s|declare -a DATA_DIRECTORY.*|declare -a DATA_DIRECTORY=( ${data_dirs[*]} )|g" \ - -e "s|MASTER_DIRECTORY=.*|MASTER_DIRECTORY=~gpadmin/data/master|g" \ - "${GPHOME}/docs/cli_help/gpconfigs/gpinitsystem_config" >~gpadmin/gpconfigs/gpinitsystem_config -chmod +w ~gpadmin/gpconfigs/gpinitsystem_config - -#Script to start segments and create directories. -hostname -f >/tmp/hosts.txt - -# gpinitsystem fails in concourse environment without this "ping" workaround. "[FATAL]:-Unknown host..." 
-sudo chmod u+s /bin/ping - -pgrep sshd || sudo /usr/sbin/sshd -gpssh-exkeys -f /tmp/hosts.txt - -# 5X gpinitsystem returns 1 exit code on warnings. -# so we ignore return code of 1, but otherwise we fail -set +e -gpinitsystem -a -c ~gpadmin/gpconfigs/gpinitsystem_config -h /tmp/hosts.txt --su_password=changeme -(( $? > 1 )) && exit 1 -set -e - -echo 'host all all 0.0.0.0/0 password' >>~gpadmin/data/master/gpseg-1/pg_hba.conf - -# reload pg_hba.conf -MASTER_DATA_DIRECTORY=~gpadmin/data/master/gpseg-1 gpstop -u - -sleep 3 -psql -d template1 -c "CREATE DATABASE gpadmin;" \ No newline at end of file diff --git a/dev/init_hadoop.bash b/dev/init_hadoop.bash deleted file mode 100755 index 843173625..000000000 --- a/dev/init_hadoop.bash +++ /dev/null @@ -1,198 +0,0 @@ -#!/usr/bin/env bash - ->/singlecluster/hadoop/etc/hadoop/core-site.xml cat < - - - - - - - - fs.defaultFS - hdfs://0.0.0.0:8020 - - - ipc.ping.interval - 900000 - - - hadoop.proxyuser.gpadmin.hosts - * - - - hadoop.proxyuser.gpadmin.groups - * - - - hadoop.security.authorization - true - - - hbase.security.authorization - true - - - hbase.rpc.protection - authentication - - - hbase.coprocessor.master.classes - org.apache.hadoop.hbase.security.access.AccessController - - - hbase.coprocessor.region.classes - org.apache.hadoop.hbase.security.access.AccessController,org.apache.hadoop.hbase.security.access.SecureBulkLoadEndpoint - - - hbase.coprocessor.regionserver.classes - org.apache.hadoop.hbase.security.access.AccessController - - -EOF - ->/singlecluster/hbase/conf/hbase-site.xml cat < - - - - - hbase.rootdir - hdfs://0.0.0.0:8020/hbase - - - dfs.replication - 3 - - - dfs.support.append - true - - - hbase.cluster.distributed - true - - - hbase.zookeeper.quorum - 127.0.0.1 - - - hbase.zookeeper.property.clientPort - 2181 - - - hadoop.proxyuser.gpadmin.hosts - * - - - hadoop.proxyuser.gpadmin.groups - * - - - hadoop.security.authorization - true - - - hbase.security.authorization - true - - - 
hbase.rpc.protection - authentication - - - hbase.coprocessor.master.classes - org.apache.hadoop.hbase.security.access.AccessController - - - hbase.coprocessor.region.classes - org.apache.hadoop.hbase.security.access.AccessController,org.apache.hadoop.hbase.security.access.SecureBulkLoadEndpoint - - - hbase.coprocessor.regionserver.classes - org.apache.hadoop.hbase.security.access.AccessController - - -EOF - ->/singlecluster/hive/conf/hive-site.xml cat < - - hive.metastore.warehouse.dir - /hive/warehouse - - - hive.metastore.uris - thrift://localhost:9083 - - - hive.server2.enable.impersonation - true - Set this property to enable impersonation in Hive Server 2 - - - hive.server2.enable.doAs - false - Set this property to enable impersonation in Hive Server 2 - - - hive.execution.engine - mr - Chooses execution engine. Options are: mr(default), tez, or spark - - - hive.metastore.schema.verification - false - Modify schema instead of reporting error - - - datanucleus.autoCreateTables - True - - - hive.metastore.integral.jdo.pushdown - True - - -EOF - -pushd /singlecluster/bin > /dev/null - echo y | ./init-gphd.sh - ./start-hdfs.sh - ./start-yarn.sh - ./start-hive.sh - ./start-zookeeper.sh - ./start-hbase.sh -popd > /dev/null diff --git a/dev/install_gpdb.bash b/dev/install_gpdb.bash deleted file mode 100755 index a4a662640..000000000 --- a/dev/install_gpdb.bash +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/bash - -pushd ~/workspace/gpdb -make -j4 install -popd diff --git a/dev/install_greenplum.bash b/dev/install_greenplum.bash deleted file mode 100755 index c485912f1..000000000 --- a/dev/install_greenplum.bash +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -CWDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -pushd ${CWDIR}/../downloads > /dev/null - -# CentOS releases contain a /etc/redhat-release which is symlinked to /etc/centos-release -if [[ -f /etc/redhat-release ]]; then - major_version=$(cat /etc/redhat-release | tr -dc '0-9.'|cut -d \. 
-f1) - ARTIFACT_OS="rhel${major_version}" - LATEST_RPM=$(ls greenplum*${ARTIFACT_OS}*.rpm | sort -r | head -1) - - if [[ -z $LATEST_RPM ]]; then - echo "ERROR: No greenplum RPM found in ${PWD}" - popd > /dev/null - exit 1 - fi - - echo "Installing GPDB from ${LATEST_RPM} ..." - sudo rpm --quiet -ivh "${LATEST_RPM}" - -elif [[ -f /etc/debian_version ]]; then - ARTIFACT_OS="ubuntu" - LATEST_DEB=$(ls *greenplum*ubuntu*.deb | sort -r | head -1) - - if [[ -z $LATEST_DEB ]]; then - echo "ERROR: No greenplum DEB found in ${PWD}" - popd > /dev/null - exit 1 - fi - - echo "Installing GPDB from ${LATEST_DEB} ..." - # apt-get wants a full path - sudo apt-get install -qq "${PWD}/${LATEST_DEB}" -else - echo "Unsupported operating system '$(source /etc/os-release && echo "${PRETTY_NAME}")'. Exiting..." - exit 1 -fi - -popd > /dev/null diff --git a/dev/install_pxf.bash b/dev/install_pxf.bash deleted file mode 100755 index 97adcb5de..000000000 --- a/dev/install_pxf.bash +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env bash - -function display() { - echo - echo "=====> $1 <=====" - echo -} - -display "Compiling and Installing PXF" -make -C ~gpadmin/workspace/pxf install - -display "Initializing PXF" -pxf init - -display "Starting PXF" -pxf start - -display "Setting up default PXF server" -cp "${PXF_HOME}"/templates/*-site.xml "${PXF_HOME}"/servers/default - -display "Registering PXF Greenplum extension" -psql -d template1 -c "create extension pxf" - -#cd ~/workspace/pxf/automation -#make GROUP=smoke \ No newline at end of file diff --git a/dev/smoke_shortcut.sh b/dev/smoke_shortcut.sh deleted file mode 100755 index 34c23a5d7..000000000 --- a/dev/smoke_shortcut.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash -set -e - -~/workspace/pxf/dev/install_gpdb.bash - -source /usr/local/greenplum-db-devel/greenplum_path.sh -make -C ~/workspace/gpdb create-demo-cluster -source ~/workspace/gpdb/gpAux/gpdemo/gpdemo-env.sh - -~/workspace/pxf/dev/configure_singlecluster.bash - -pushd 
~/workspace/singlecluster/bin - echo y | ./init-gphd.sh - ./start-hdfs.sh - ./start-yarn.sh - ./start-hive.sh - ./start-zookeeper.sh - ./start-hbase.sh -popd - -make -C ~/workspace/pxf install -export PXF_BASE=$PXF_HOME -export PXF_JVM_OPTS="-Xmx512m -Xms256m" -$PXF_HOME/bin/pxf init -$PXF_HOME/bin/pxf start - -cp "${PXF_BASE}"/templates/*-site.xml "${PXF_BASE}"/servers/default - -if [ -d ~/workspace/gpdb/gpAux/extensions/pxf ]; then - PXF_EXTENSIONS_DIR=gpAux/extensions/pxf -else - PXF_EXTENSIONS_DIR=gpcontrib/pxf -fi - -make -C ~/workspace/gpdb/${PXF_EXTENSIONS_DIR} installcheck -psql -d template1 -c "create extension pxf" - -cd ~/workspace/pxf/automation -make GROUP=smoke diff --git a/dev/start.bash b/dev/start.bash deleted file mode 100755 index ced0e2407..000000000 --- a/dev/start.bash +++ /dev/null @@ -1,20 +0,0 @@ -#!/usr/bin/env bash - -if [[ -z ${GCR_PROJECT} ]]; then - echo "Please set GCR_PROJECT variable to the name of your Google Container Registry project" - exit 1 -fi - -docker run --rm -it \ - -p 5432:5432 \ - -p 5888:5888 \ - -p 8000:8000 \ - -p 5005:5005 \ - -p 8020:8020 \ - -p 9000:9000 \ - -p 9090:9090 \ - -p 50070:50070 \ - -w /home/gpadmin/workspace \ - -v ~/workspace/pxf:/home/gpadmin/workspace/pxf \ - gcr.io/${GCR_PROJECT}/gpdb-pxf-dev/gpdb6-centos7-test-pxf-hdp2:latest /bin/bash -c \ - "/home/gpadmin/workspace/pxf/dev/bootstrap.bash && su - gpadmin" \ No newline at end of file From 5d4a666bfc8b6162ee0723cd4f0806cf07f22c0e Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Wed, 4 Feb 2026 14:33:04 +0500 Subject: [PATCH 06/10] parquet-1.15.2 --- server/build.gradle | 3 ++- server/gradle.properties | 2 +- server/pxf-hdfs/build.gradle | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/server/build.gradle b/server/build.gradle index 499a0b721..a1b6275c2 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -164,7 +164,7 @@ configure(javaProjects) { } // Parquet dependencies - 
dependency("org.apache.parquet:parquet-format:2.7.0") + dependency("org.apache.parquet:parquet-format:2.11.0") dependencySet(group:"org.apache.parquet", version:"${parquetVersion}") { entry("parquet-column") entry("parquet-common") @@ -173,6 +173,7 @@ configure(javaProjects) { entry("parquet-hadoop") entry("parquet-jackson") entry("parquet-pig") + entry("parquet-format-structures") } // Thrift dependencies diff --git a/server/gradle.properties b/server/gradle.properties index eb6191df4..960ef7c9f 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 hbaseVersion=1.3.2 junitVersion=4.11 -parquetVersion=1.11.1 +parquetVersion=1.15.2 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true diff --git a/server/pxf-hdfs/build.gradle b/server/pxf-hdfs/build.gradle index 673e528ea..9705fb6ff 100644 --- a/server/pxf-hdfs/build.gradle +++ b/server/pxf-hdfs/build.gradle @@ -38,6 +38,7 @@ dependencies { implementation("org.apache.hadoop:hadoop-hdfs") { transitive = false } implementation("org.apache.hadoop:hadoop-hdfs-client") { transitive = false } implementation("org.apache.parquet:parquet-format") { transitive = false } + implementation("org.apache.parquet:parquet-format-structures") { transitive = false } implementation("org.apache.parquet:parquet-column") { transitive = false } implementation("org.apache.parquet:parquet-common") { transitive = false } implementation("org.apache.parquet:parquet-encoding") { transitive = false } From 7d17b9d208633472b5ca5906e5a2a81b30de52cd Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Wed, 4 Feb 2026 16:06:13 +0500 Subject: [PATCH 07/10] parquet-1.14.4 --- server/gradle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/gradle.properties b/server/gradle.properties index 960ef7c9f..0696a23e7 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 
hiveStorageApiVersion=2.7.3 hbaseVersion=1.3.2 junitVersion=4.11 -parquetVersion=1.15.2 +parquetVersion=1.14.4 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true From e72f3de5da108cdcf143784ba30ab0d1e5440539 Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Wed, 4 Feb 2026 20:19:50 +0500 Subject: [PATCH 08/10] 1.13.1 --- server/gradle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/gradle.properties b/server/gradle.properties index 0696a23e7..2a9a452be 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 hbaseVersion=1.3.2 junitVersion=4.11 -parquetVersion=1.14.4 +parquetVersion=1.13.1 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true From 9edd2e860a7c036cc6f475d6a2ea4ebeb4426237 Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Thu, 5 Feb 2026 14:33:36 +0500 Subject: [PATCH 09/10] 1.12.3 =/ --- server/gradle.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/gradle.properties b/server/gradle.properties index 2a9a452be..42da880a3 100644 --- a/server/gradle.properties +++ b/server/gradle.properties @@ -23,7 +23,7 @@ hiveVersion=2.3.8 hiveStorageApiVersion=2.7.3 hbaseVersion=1.3.2 junitVersion=4.11 -parquetVersion=1.13.1 +parquetVersion=1.12.3 awsJavaSdk=1.12.261 springBootVersion=2.7.18 org.gradle.daemon=true From c50e2ce6f82d515a0970d36e58b8e370e3b94d5f Mon Sep 17 00:00:00 2001 From: Nikolay Antonov Date: Thu, 5 Feb 2026 23:10:15 +0500 Subject: [PATCH 10/10] Add tests for Parquet compression --- .../features/parquet/ParquetWriteTest.java | 15 +++++++++++++++ docs/content/hdfs_parquet.html.md.erb | 4 ++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java index 
358d2233c..18218eac1 100644 --- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java +++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java @@ -195,6 +195,21 @@ public void parquetWritePrimitivesGZipClassName() throws Exception { runWritePrimitivesScenario("pxf_parquet_write_primitives_gzip_classname", "pxf_parquet_read_primitives_gzip_classname", "parquet_write_primitives_gzip_classname", new String[]{"COMPRESSION_CODEC=org.apache.hadoop.io.compress.GzipCodec"}); } + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesSnappy() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_snappy", "pxf_parquet_read_primitives_snappy", "parquet_write_primitives_snappy", new String[]{"COMPRESSION_CODEC=snappy"}); + } + + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesUncompressed() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_uncompressed", "pxf_parquet_read_primitives_uncompressed", "parquet_write_primitives_uncompressed", new String[]{"COMPRESSION_CODEC=uncompressed"}); + } + + @Test(groups = {"features", "gpdb", "security", "hcfs"}) + public void parquetWritePrimitivesZStd() throws Exception { + runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"}); + } + // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly. 
@Test(groups = {"features", "gpdb", "security", "hcfs"}) public void parquetWriteUndefinedPrecisionNumeric() throws Exception { diff --git a/docs/content/hdfs_parquet.html.md.erb b/docs/content/hdfs_parquet.html.md.erb index 26ee4817e..9ad05b785 100644 --- a/docs/content/hdfs_parquet.html.md.erb +++ b/docs/content/hdfs_parquet.html.md.erb @@ -23,7 +23,7 @@ under the License. Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store. -PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `lzo`. +PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`. PXF currently supports reading and writing primitive Parquet data types only. @@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write | Write Option | Value Description | |-------|-------------------------------------| -| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lzo`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. | +| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed`. If this option is not provided, PXF compresses the data using `snappy` compression. | | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. | | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page.
The default page size is `1 * 1024 * 1024` bytes. | | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. |