diff --git a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
index 18218eac..0bd9b611 100644
--- a/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
+++ b/automation/src/test/java/org/apache/cloudberry/pxf/automation/features/parquet/ParquetWriteTest.java
@@ -210,6 +210,11 @@ public void parquetWritePrimitivesZStd() throws Exception {
         runWritePrimitivesScenario("pxf_parquet_write_primitives_zstd", "pxf_parquet_read_primitives_zstd", "parquet_write_primitives_zstd", new String[]{"COMPRESSION_CODEC=zstd"});
     }
 
+    @Test(groups = {"features", "gpdb", "security", "hcfs"})
+    public void parquetWritePrimitivesLZ4_RAW() throws Exception {
+        runWritePrimitivesScenario("pxf_parquet_write_primitives_lz4_raw", "pxf_parquet_read_primitives_lz4_raw", "parquet_write_primitives_lz4_raw", new String[]{"COMPRESSION_CODEC=lz4_raw"});
+    }
+
     // Numeric precision not defined, test writing data precision in [1, 38]. All the data should be written correctly.
     @Test(groups = {"features", "gpdb", "security", "hcfs"})
     public void parquetWriteUndefinedPrecisionNumeric() throws Exception {
diff --git a/ci/singlecluster/Dockerfile b/ci/singlecluster/Dockerfile
index 8e8c4621..abb60e3c 100644
--- a/ci/singlecluster/Dockerfile
+++ b/ci/singlecluster/Dockerfile
@@ -12,14 +12,14 @@ RUN sudo apt-get update && \
 ENV HADOOP_VERSION=3.1.2
 ENV HIVE_VERSION=3.1.3
 ENV ZOOKEEPER_VERSION=3.5.9
-ENV HBASE_VERSION=2.0.6
+ENV HBASE_VERSION=2.3.7
 ENV TEZ_VERSION=0.9.2
 
 # checksums from archive.apache.org
 ENV HADOOP_SHA512="0e0ee817c89b3c4eb761eca7f16640742a83b0e99b6fda26c1bee2baabedad93aab86e252bf5f1e2381c6d464bc4003d10c7cc0f61b2062f4c59732ca24d1bd9"
 ENV HIVE_SHA256="0c9b6a6359a7341b6029cc9347435ee7b379f93846f779d710b13f795b54bb16"
 ENV ZOOKEEPER_SHA512="0e5a64713abc6f36d961dd61a06f681868171a9d9228366e512a01324806d263e05508029c94d8e18307811867cdc39d848e736c252bf56c461273ef74c66a45"
-ENV HBASE_SHA512="a0e10904ecf7f059b77bc0ce704254046a978126db720cc7e55dc53b87097715da64b8391fe3cc94348bc432871ad8f29891dc8df1ea052eb628da0fdca97c93"
+ENV HBASE_SHA512="1032521025660daa70260cdc931f52a26c87596be444451fe1fa88b526ede55e9d6b4220e91ff6f7422bec11f30d64fa6745e95a9c36971fdb1a264a2c745693"
 ENV TEZ_SHA512="a2d94bd9fa778d42a8bac9d9da8e263e469ddfef93968b06434716554995f490231de5607541ac236e770aa0158b64250c38bc1cd57dbfa629fea705f2ffa2f5"
 
 # faster mirror:
@@ -63,7 +63,7 @@ RUN mkdir -p $ZOOKEEPER_ROOT && \
 RUN mkdir -p $HBASE_ROOT && \
     curl -fSL "$HBASE_URL" -o hbase.tar.gz && \
     echo "$HBASE_SHA512 hbase.tar.gz" | sha512sum -c && \
-    tar xvf hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" && \
+    tar xvf hbase.tar.gz -C $HBASE_ROOT --strip-components 1 --exclude="docs/*" --exclude="lib/*-tests.jar" --exclude="lib/shaded-clients" && \
    rm hbase.tar.gz
 
 RUN mkdir -p $TEZ_ROOT && \
diff --git a/ci/singlecluster/README.HDP3.md b/ci/singlecluster/README.HDP3.md
index 16506de1..3a906e50 100644
--- a/ci/singlecluster/README.HDP3.md
+++ b/ci/singlecluster/README.HDP3.md
@@ -7,7 +7,7 @@ It contains the following versions:
 - Hadoop 3.3.6
 - Hive 3.1.3
 - Zookeeper 3.5.9
-- HBase 2.0.6
+- HBase 2.3.7
 - Tez 0.9.2
 
 This version of Single cluster requires users to make some manual changes to the configuration files once the tarball has been unpacked (see Initialization steps below).
diff --git a/docs/content/hdfs_parquet.html.md.erb b/docs/content/hdfs_parquet.html.md.erb
index 9ad05b78..856d9fbf 100644
--- a/docs/content/hdfs_parquet.html.md.erb
+++ b/docs/content/hdfs_parquet.html.md.erb
@@ -23,7 +23,7 @@ under the License.
 Use the PXF HDFS connector to read and write Parquet-format data. This section describes how to read and write HDFS files that are stored in Parquet format, including how to create, query, and insert into external tables that reference files in the HDFS data store.
 
-PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, and `zstd`.
+PXF supports reading or writing Parquet files compressed with these codecs: `snappy`, `gzip`, `lz4_raw`, and `zstd`.
 
 PXF currently supports reading and writing primitive Parquet data types only.
 
@@ -182,7 +182,7 @@ The PXF `hdfs:parquet` profile supports encoding- and compression-related write
 
 | Write Option | Value Description |
 |-------|-------------------------------------|
-| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `zstd`, and `uncompressed` . If this option is not provided, PXF compresses the data using `snappy` compression. |
+| COMPRESSION_CODEC | The compression codec alias. Supported compression codecs for writing Parquet data include: `snappy`, `gzip`, `lz4_raw`, `zstd`, and `uncompressed`. If this option is not provided, PXF compresses the data using `snappy` compression. |
 | ROWGROUP_SIZE | A Parquet file consists of one or more row groups, a logical partitioning of the data into rows. `ROWGROUP_SIZE` identifies the size (in bytes) of the row group. The default row group size is `8 * 1024 * 1024` bytes. |
 | PAGE_SIZE | A row group consists of column chunks that are divided up into pages. `PAGE_SIZE` is the size (in bytes) of such a page. The default page size is `1 * 1024 * 1024` bytes. |
 | ENABLE\_DICTIONARY | A boolean value that specifies whether or not to enable dictionary encoding. The default value is `true`; dictionary encoding is enabled when PXF writes Parquet files. |
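For context on the new `lz4_raw` option (not part of the patch itself): parquet-mr exposes this codec as `CompressionCodecName.LZ4_RAW` starting with 1.13, and the bump to parquet 1.15.2 later in this patch makes it available to PXF. Below is a minimal, hypothetical sketch of writing an LZ4_RAW-compressed file directly with parquet-mr; the class name, schema, and output path are illustrative only.

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Hypothetical standalone example, not PXF code: write one row group
// compressed with the LZ4_RAW codec using the parquet-mr example API.
public class Lz4RawWriteSketch {
    public static void main(String[] args) throws Exception {
        MessageType schema = MessageTypeParser.parseMessageType(
                "message example { required int32 id; required binary name (UTF8); }");
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/example_lz4_raw.parquet")) // illustrative path
                .withConf(new Configuration())
                .withType(schema)
                .withCompressionCodec(CompressionCodecName.LZ4_RAW) // requires parquet-mr >= 1.13
                .build()) {
            // Write a single record; PXF would stream rows from the external table instead.
            writer.write(new SimpleGroupFactory(schema).newGroup()
                    .append("id", 1)
                    .append("name", "alpha"));
        }
    }
}
```

LZ4_RAW is the frame-less LZ4 codec standardized by parquet-format; it is distinct from the older Hadoop-framed `lz4` codec, which is why it appears as a separate alias.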
diff --git a/server/build.gradle b/server/build.gradle
index a1b6275c..3eb6b087 100644
--- a/server/build.gradle
+++ b/server/build.gradle
@@ -90,15 +90,17 @@ configure(javaProjects) {
             dependency("commons-configuration:commons-configuration:1.10")
             dependency("commons-io:commons-io:2.7")
             dependency("commons-lang:commons-lang:2.6")
+            dependency("org.apache.commons:commons-lang3:3.9")
             dependency("commons-logging:commons-logging:1.1.3")
-            dependency("io.airlift:aircompressor:0.27")
+            dependency("io.airlift:aircompressor:2.0.2")
             dependency("javax.jdo:jdo-api:3.0.1")
             dependency("joda-time:joda-time:2.8.1")
             dependency("net.sf.opencsv:opencsv:2.3")
             dependency("org.antlr:antlr-runtime:3.5.2")
             dependency("org.apache.commons:commons-compress:1.20")
+            dependency("org.apache.commons:commons-crypto:1.0.0")
             dependency("org.apache.htrace:htrace-core:3.1.0-incubating")
-            dependency("org.apache.htrace:htrace-core4:4.0.1-incubating")
+            dependency("org.apache.htrace:htrace-core4:4.2.0-incubating")
             dependency("org.apache.zookeeper:zookeeper:3.4.6")
             dependency("org.codehaus.woodstox:stax2-api:3.1.4")
@@ -120,7 +122,7 @@ configure(javaProjects) {
             dependency("org.threeten:threeten-extra:1.5.0")
             dependency("org.tukaani:xz:1.8")
             dependency("org.wildfly.openssl:wildfly-openssl:1.0.7.Final")
-            dependency("org.xerial.snappy:snappy-java:1.1.10.4")
+            dependency("org.xerial.snappy:snappy-java:1.1.10.7")
 
             // Hadoop dependencies
             dependencySet(group:"org.apache.hadoop", version:"${hadoopVersion}") {
@@ -139,11 +141,28 @@ configure(javaProjects) {
             // HBase dependencies
             dependencySet(group:"org.apache.hbase", version:"${hbaseVersion}") {
-                entry("hbase-annotations")
                 entry("hbase-client")
                 entry("hbase-common")
                 entry("hbase-protocol")
+                entry("hbase-protocol-shaded")
+                entry("hbase-logging")
+                entry("hbase-hadoop-compat")
+                entry("hbase-hadoop2-compat")
+                entry("hbase-metrics-api")
+                entry("hbase-metrics")
             }
+            dependencySet(group:"org.apache.hbase.thirdparty", version:"3.3.0") {
+                entry("hbase-shaded-protobuf")
+                entry("hbase-shaded-miscellaneous")
+                entry("hbase-shaded-gson")
+                entry("hbase-shaded-netty")
+                entry("hbase-unsafe")
+            }
+            dependency("org.apache.yetus:audience-annotations:0.5.0")
+            dependency("io.opentelemetry:opentelemetry-api:1.49.0")
+            dependency("io.opentelemetry:opentelemetry-context:1.49.0")
+            dependency("io.opentelemetry.semconv:opentelemetry-semconv:1.29.0-alpha")
+            dependency("io.dropwizard.metrics:metrics-core:3.2.6")
 
             // Hive dependencies
             dependency("org.apache.hive:hive-storage-api:${hiveStorageApiVersion}")
@@ -193,7 +212,7 @@ configure(javaProjects) {
                 entry("avro")
                 entry("avro-mapred")
             }
-            // Zstd support for Avro
+            // Zstd support for Avro/Parquet
             dependency("com.github.luben:zstd-jni:1.5.7-6")
 
             // Jackson 1.x dependencies
@@ -237,7 +256,7 @@ configure(javaProjects) {
         options.compilerArgs += [
                 "-g", "-Xlint:varargs", "-Xlint:cast", "-Xlint:classfile", "-Xlint:dep-ann", "-Xlint:divzero", "-Xlint:empty",
                 "-Xlint:finally", "-Xlint:overrides", "-Xlint:path", "-Xlint:-processing", "-Xlint:static",
-                "-Xlint:try", "-Xlint:fallthrough", "-Xlint:deprecation", "-Xlint:unchecked", "-Xlint:-options", "-Werror"
+                "-Xlint:try", "-Xlint:fallthrough", "-Xlint:unchecked", "-Xlint:-options", "-Werror"
         ]
     }
@@ -245,7 +264,7 @@ configure(javaProjects) {
         options.compilerArgs += [
                 "-g", "-Xlint:varargs", "-Xlint:cast", "-Xlint:classfile", "-Xlint:dep-ann", "-Xlint:divzero", "-Xlint:empty",
                 "-Xlint:finally", "-Xlint:overrides", "-Xlint:path", "-Xlint:-processing", "-Xlint:static",
-                "-Xlint:try", "-Xlint:fallthrough", "-Xlint:deprecation", "-Xlint:unchecked", "-Xlint:-options", "-Werror"
+                "-Xlint:try", "-Xlint:fallthrough", "-Xlint:unchecked", "-Xlint:-options", "-Werror"
         ]
     }
diff --git a/server/gradle.properties b/server/gradle.properties
index 42da880a..e0f41634 100644
--- a/server/gradle.properties
+++ b/server/gradle.properties
@@ -21,9 +21,9 @@ license=ASL 2.0
 hadoopVersion=2.10.2
 hiveVersion=2.3.8
 hiveStorageApiVersion=2.7.3
-hbaseVersion=1.3.2
+hbaseVersion=2.3.7
 junitVersion=4.11
-parquetVersion=1.12.3
+parquetVersion=1.15.2
 awsJavaSdk=1.12.261
 springBootVersion=2.7.18
 org.gradle.daemon=true
diff --git a/server/pxf-hbase/build.gradle b/server/pxf-hbase/build.gradle
index 026d602b..5924b8f2 100644
--- a/server/pxf-hbase/build.gradle
+++ b/server/pxf-hbase/build.gradle
@@ -14,23 +14,47 @@ dependencies {
      *******************************/
     compileOnly("com.google.code.findbugs:annotations")
-    compileOnly("org.apache.hbase:hbase-annotations")
 
     /*******************************
-     * Implementation Dependencies
+     * Project Dependencies
      *******************************/
     implementation(project(':pxf-api'))
-    implementation("com.google.protobuf:protobuf-java")
     implementation("commons-collections:commons-collections")
-    implementation("org.apache.hbase:hbase-client") { transitive = false }
-    implementation("org.apache.hbase:hbase-common") { transitive = false }
-    implementation("org.apache.hbase:hbase-protocol") { transitive = false }
-    implementation("org.apache.htrace:htrace-core") { transitive = false }
-    implementation("org.apache.zookeeper:zookeeper") { transitive = false }
-    implementation("io.netty:netty-common") { transitive = false }
-    implementation("io.netty:netty-transport") { transitive = false }
-    implementation("com.yammer.metrics:metrics-core") { transitive = false }
+
+    /*******************************
+     * HBase
+     *******************************/
+
+    implementation("org.apache.hbase:hbase-client") { transitive = false }
+    implementation("org.apache.hbase.thirdparty:hbase-shaded-protobuf") { transitive = false }
+    implementation("org.apache.hbase:hbase-common") { transitive = false }
+    implementation("org.apache.hbase:hbase-logging") { transitive = false }
+    implementation("org.apache.hbase.thirdparty:hbase-shaded-miscellaneous") { transitive = false }
+    implementation("org.apache.hbase.thirdparty:hbase-shaded-gson") { transitive = false }
+    implementation("org.apache.hbase.thirdparty:hbase-shaded-netty") { transitive = false }
+    implementation("org.apache.commons:commons-lang3") { transitive = false }
+    implementation("org.apache.commons:commons-crypto") { transitive = false }
+    implementation("org.apache.hadoop:hadoop-common") { transitive = false }
+    implementation("org.apache.hadoop:hadoop-auth") { transitive = false }
+    implementation("org.apache.hbase:hbase-hadoop-compat") { transitive = false }
+    implementation("org.apache.hbase:hbase-metrics-api") { transitive = false }
+    implementation("org.apache.hbase:hbase-metrics") { transitive = false }
+    implementation("org.apache.hbase:hbase-hadoop2-compat") { transitive = false }
+    implementation("org.apache.hbase:hbase-protocol-shaded") { transitive = false }
+    implementation("org.apache.hbase:hbase-protocol") { transitive = false }
+    implementation("com.google.protobuf:protobuf-java") { transitive = false }
+    implementation("org.apache.zookeeper:zookeeper") { transitive = false }
+    implementation("io.netty:netty-common") { transitive = false }
+    implementation("io.netty:netty-transport") { transitive = false }
+    // skip JRuby - it is part of the interactive shell
+    // implementation("org.jruby.jcodings:jcodings:1.0.58") { transitive = false }
+    // implementation("org.jruby.joni:joni:2.2.1") { transitive = false }
+    implementation("org.apache.yetus:audience-annotations") { transitive = false }
+    implementation("io.opentelemetry:opentelemetry-api") { transitive = false }
+    implementation("io.opentelemetry:opentelemetry-context") { transitive = false }
+    implementation("io.opentelemetry.semconv:opentelemetry-semconv") { transitive = false }
+    implementation("io.dropwizard.metrics:metrics-core:3.2.6") { transitive = false }
 
     implementation("org.springframework.boot:spring-boot-starter-log4j2")
@@ -39,7 +63,6 @@ dependencies {
      *******************************/
     testCompileOnly("com.google.code.findbugs:annotations")
-    testCompileOnly("org.apache.hbase:hbase-annotations")
 
     testImplementation("com.esotericsoftware:minlog")
     testImplementation("com.esotericsoftware:reflectasm")
     testImplementation('org.springframework.boot:spring-boot-starter-test')
diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java
index 21d5d42d..774fb9c2 100644
--- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java
+++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/HBaseDataFragmenter.java
@@ -81,7 +81,6 @@ public FragmentStats getFragmentStats() {
     public List<Fragment> getFragments() throws Exception {
         // check that Zookeeper and HBase master are available
-        HBaseAdmin.checkHBaseAvailable(configuration);
         connection = ConnectionFactory.createConnection(configuration);
         Admin hbaseAdmin = connection.getAdmin();
         if (!HBaseUtilities.isTableAvailable(hbaseAdmin, context.getDataSource())) {
diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java
index 3627ca97..6dedaf85 100644
--- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java
+++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseDoubleComparator.java
@@ -19,11 +19,11 @@
  * under the License.
  */
 
-import com.google.protobuf.ByteString;
-import com.google.protobuf.InvalidProtocolBufferException;
+import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
+import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
 import org.apache.hadoop.hbase.exceptions.DeserializationException;
 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
-import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
 import org.apache.hadoop.hbase.util.Bytes;
 
 public class HBaseDoubleComparator extends ByteArrayComparable {
diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java
index bf26b003..a5291de6 100644
--- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java
+++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseFloatComparator.java
@@ -19,11 +19,11 @@
  * under the License.
  */
 
-import com.google.protobuf.ByteString;
-import com.google.protobuf.InvalidProtocolBufferException;
+import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
+import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
 import org.apache.hadoop.hbase.exceptions.DeserializationException;
 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
-import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
 import org.apache.hadoop.hbase.util.Bytes;
 
 public class HBaseFloatComparator extends ByteArrayComparable{
diff --git a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java
index ffc0a10e..a1e589c5 100644
--- a/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java
+++ b/server/pxf-hbase/src/main/java/org/apache/cloudberry/pxf/plugins/hbase/utilities/HBaseIntegerComparator.java
@@ -23,11 +23,11 @@
 import org.apache.hadoop.hbase.exceptions.DeserializationException;
 import org.apache.hadoop.hbase.filter.ByteArrayComparable;
 import org.apache.hadoop.hbase.filter.SubstringComparator;
-import org.apache.hadoop.hbase.protobuf.generated.ComparatorProtos;
+import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
 import org.apache.hadoop.hbase.util.Bytes;
 
-import com.google.protobuf.ByteString;
-import com.google.protobuf.InvalidProtocolBufferException;
+import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
+import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
 
 /**
  * This is a Filter comparator for HBase It is external to PXF HBase code.
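For context on the comparator changes above (not part of the patch itself): with HBase 2.x the custom comparators serialize through HBase's relocated protobuf classes (`org.apache.hbase.thirdparty.com.google.protobuf.*` and `org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos`) instead of plain `com.google.protobuf.*`. Below is a minimal, self-contained sketch of that `toByteArray`/`parseFrom` pattern, assuming the standard `ByteArrayComparable` contract; the class is illustrative only and is not a PXF class.

```java
import org.apache.hadoop.hbase.exceptions.DeserializationException;
import org.apache.hadoop.hbase.filter.ByteArrayComparable;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ComparatorProtos;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;
import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;

// Hypothetical example of an HBase 2.x custom comparator that serializes
// itself via the relocated (shaded) protobuf classes used in this patch.
public class ShadedFloatComparatorSketch extends ByteArrayComparable {

    public ShadedFloatComparatorSketch(float value) {
        super(Bytes.toBytes(value));
    }

    @Override
    public byte[] toByteArray() {
        // Wrap the raw comparator bytes in the shaded ByteArrayComparable message.
        return ComparatorProtos.ByteArrayComparable.newBuilder()
                .setValue(ByteString.copyFrom(getValue()))
                .build()
                .toByteArray();
    }

    // Region servers reconstruct the comparator from the serialized proto.
    public static ShadedFloatComparatorSketch parseFrom(byte[] bytes)
            throws DeserializationException {
        try {
            ComparatorProtos.ByteArrayComparable proto =
                    ComparatorProtos.ByteArrayComparable.parseFrom(bytes);
            return new ShadedFloatComparatorSketch(Bytes.toFloat(proto.getValue().toByteArray()));
        } catch (InvalidProtocolBufferException e) {
            throw new DeserializationException(e);
        }
    }

    @Override
    public int compareTo(byte[] value, int offset, int length) {
        // Compare the stored float against the cell value at the given offset.
        return Float.compare(Bytes.toFloat(getValue()), Bytes.toFloat(value, offset));
    }
}
```

Only the package of the protobuf types changes relative to the pre-2.x code; the serialized form should be unchanged, since the shaded classes are generated from the same proto definitions.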