From ddd028833ddd2df7d3b122e79a66f1bddea5fe7d Mon Sep 17 00:00:00 2001 From: guhaiyan0221 <142767528+guhaiyan0221@users.noreply.github.com> Date: Fri, 6 Mar 2026 19:14:49 +0800 Subject: [PATCH 1/3] [perf][cp] Support fast path for Spark comparison function in join filter (#344) --- bolt/functions/lib/SIMDComparisonUtil.h | 60 +++++++++ bolt/functions/sparksql/Comparisons.cpp | 23 ++-- bolt/functions/sparksql/DecimalCompare.cpp | 6 +- .../benchmarks/SIMDCompareBenchmark.cpp | 22 +++- .../sparksql/tests/ComparisonsTest.cpp | 47 ++++++- .../sparksql/tests/DecimalCompareTest.cpp | 118 ++++++++++++++++++ 6 files changed, 252 insertions(+), 24 deletions(-) diff --git a/bolt/functions/lib/SIMDComparisonUtil.h b/bolt/functions/lib/SIMDComparisonUtil.h index 634278dda..f9ac91904 100644 --- a/bolt/functions/lib/SIMDComparisonUtil.h +++ b/bolt/functions/lib/SIMDComparisonUtil.h @@ -29,7 +29,9 @@ */ #pragma once +#include "bolt/expression/EvalCtx.h" #include "bolt/expression/VectorFunction.h" +#include "bolt/vector/DecodedVector.h" namespace bytedance::bolt::functions { @@ -140,6 +142,10 @@ void applyAutoSimdComparisonInternal( }); } } + +inline bool isDictEncoding(const VectorPtr& arg) { + return arg->encoding() == VectorEncoding::Simple::DICTIONARY; +} } // namespace detail template < @@ -238,6 +244,7 @@ template void applyAutoSimdComparison( const SelectivityVector& rows, std::vector& args, + exec::EvalCtx& context, VectorPtr& result, Args... cmpArgs) { const Compare cmp; @@ -258,6 +265,42 @@ void applyAutoSimdComparison( } }, result); + } else if (args[0]->isFlatEncoding() && detail::isDictEncoding(args[1])) { + const A* __restrict rawA = + args[0]->asUnchecked>()->template rawValues(); + DecodedVector decodedB(*args[1], rows); + const B* __restrict rawB = decodedB.data(); + const vector_size_t* __restrict indexB = decodedB.indices(); + detail::applyAutoSimdComparisonInternal( + rows, + rawA, + rawB, + [&](const A* __restrict rawA, const B* __restrict rawB, int i) { + if constexpr (sizeof...(cmpArgs) > 0) { + return Compare::apply(rawA[i], rawB[indexB[i]], cmpArgs...); + } else { + return cmp(rawA[i], rawB[indexB[i]]); + } + }, + result); + } else if (detail::isDictEncoding(args[0]) && args[1]->isFlatEncoding()) { + DecodedVector decodedA(*args[0], rows); + const A* __restrict rawA = decodedA.data(); + const vector_size_t* __restrict indexA = decodedA.indices(); + const B* __restrict rawB = + args[1]->asUnchecked>()->template rawValues(); + detail::applyAutoSimdComparisonInternal( + rows, + rawA, + rawB, + [&](const A* __restrict rawA, const B* __restrict rawB, int i) { + if constexpr (sizeof...(cmpArgs) > 0) { + return Compare::apply(rawA[indexA[i]], rawB[i], cmpArgs...); + } else { + return cmp(rawA[indexA[i]], rawB[i]); + } + }, + result); } else if (args[0]->isConstantEncoding() && args[1]->isFlatEncoding()) { const A constA = args[0]->asUnchecked>()->valueAt(0); const A* __restrict rawA = &constA; @@ -313,4 +356,21 @@ void applyAutoSimdComparison( BOLT_UNREACHABLE(); } } + +template +bool shouldApplyAutoSimdComparison( + const SelectivityVector& rows, + std::vector& args) { + if (rows.end() - rows.begin() > 64) { + if ((args[0]->isFlatEncoding() || args[0]->isConstantEncoding()) && + (args[1]->isFlatEncoding() || args[1]->isConstantEncoding())) { + return true; + } else if (args[0]->isFlatEncoding() && detail::isDictEncoding(args[1])) { + return true; + } else if (detail::isDictEncoding(args[0]) && args[1]->isFlatEncoding()) { + return true; + } + } + return false; +} } // namespace bytedance::bolt::functions diff --git a/bolt/functions/sparksql/Comparisons.cpp b/bolt/functions/sparksql/Comparisons.cpp index 32b5a4999..8479cb477 100644 --- a/bolt/functions/sparksql/Comparisons.cpp +++ b/bolt/functions/sparksql/Comparisons.cpp @@ -55,21 +55,20 @@ class ComparisonFunction final : public exec::VectorFunction { const Cmp cmp; context.ensureWritable(rows, BOOLEAN(), result); result->clearNulls(rows); - if ((args[0]->isFlatEncoding() || args[0]->isConstantEncoding()) && - (args[1]->isFlatEncoding() || args[1]->isConstantEncoding())) { - if constexpr ( - kind == TypeKind::TINYINT || kind == TypeKind::SMALLINT || - kind == TypeKind::INTEGER || kind == TypeKind::BIGINT) { - if (rows.isAllSelected()) { - applySimdComparison(rows, args, result); - return; - } - } - if (rows.end() - rows.begin() > 64) { - applyAutoSimdComparison(rows, args, result); + if constexpr ( + kind == TypeKind::TINYINT || kind == TypeKind::SMALLINT || + kind == TypeKind::INTEGER || kind == TypeKind::BIGINT) { + if ((args[0]->isFlatEncoding() || args[0]->isConstantEncoding()) && + (args[1]->isFlatEncoding() || args[1]->isConstantEncoding()) && + rows.isAllSelected()) { + applySimdComparison(rows, args, result); return; } } + if (shouldApplyAutoSimdComparison(rows, args)) { + applyAutoSimdComparison(rows, args, context, result); + return; + } auto* flatResult = result->asUnchecked>(); if (args[0]->isFlatEncoding() && args[1]->isFlatEncoding()) { diff --git a/bolt/functions/sparksql/DecimalCompare.cpp b/bolt/functions/sparksql/DecimalCompare.cpp index b5290d673..9953d4c3a 100644 --- a/bolt/functions/sparksql/DecimalCompare.cpp +++ b/bolt/functions/sparksql/DecimalCompare.cpp @@ -129,11 +129,9 @@ class DecimalCompareFunction : public exec::VectorFunction { exec::EvalCtx& context, VectorPtr& result) const override { prepareResults(rows, resultType, context, result); - if ((args[0]->isFlatEncoding() || args[0]->isConstantEncoding()) && - (args[1]->isFlatEncoding() || args[1]->isConstantEncoding()) && - rows.end() - rows.begin() > 64) { + if (shouldApplyAutoSimdComparison(rows, args)) { applyAutoSimdComparison( - rows, args, result, deltaScale_, need256_); + rows, args, context, result, deltaScale_, need256_); return; } diff --git a/bolt/functions/sparksql/benchmarks/SIMDCompareBenchmark.cpp b/bolt/functions/sparksql/benchmarks/SIMDCompareBenchmark.cpp index 9000287e9..272a9d168 100644 --- a/bolt/functions/sparksql/benchmarks/SIMDCompareBenchmark.cpp +++ b/bolt/functions/sparksql/benchmarks/SIMDCompareBenchmark.cpp @@ -33,6 +33,7 @@ #include "bolt/benchmarks/ExpressionBenchmarkBuilder.h" #include "bolt/functions/sparksql/Register.h" +#include "bolt/vector/fuzzer/VectorFuzzer.h" using namespace bytedance; @@ -55,12 +56,29 @@ int main(int argc, char** argv) { ExpressionBenchmarkBuilder benchmarkBuilder; for (auto nullRatio : {0.0, 0.1, 0.9}) { for (auto& inputType : inputTypes) { + VectorFuzzer::Options opts; + opts.vectorSize = 10000; + opts.nullRatio = nullRatio; + VectorFuzzer fuzzer(opts, benchmarkBuilder.pool()); + std::vector childrenVectors = { + fuzzer.fuzzDictionary(fuzzer.fuzzFlat(inputType)), + fuzzer.fuzzFlat(inputType)}; benchmarkBuilder .addBenchmarkSet( fmt::format( - "{}#{}\%null", inputType->toString(), nullRatio * 100), + "Dict#{}#{}\%null", inputType->toString(), nullRatio * 100), + fuzzer.fuzzRow( + std::move(childrenVectors), {"c0", "c1"}, opts.vectorSize)) + .addExpression("equalto", "equalto(c0, c1)") + .addExpression("greaterthan", "greaterthan(c0, c1)") + .addExpression("greaterthanorequal", "greaterthanorequal(c0, c1)") + .withIterations(100); + benchmarkBuilder + .addBenchmarkSet( + fmt::format( + "Flat#{}#{}\%null", inputType->toString(), nullRatio * 100), ROW({"c0", "c1"}, {inputType, inputType})) - .withFuzzerOptions({.vectorSize = 10000, .nullRatio = nullRatio}) + .withFuzzerOptions(opts) .addExpression("equalto", "equalto(c0, c1)") .addExpression("greaterthan", "greaterthan(c0, c1)") .addExpression("greaterthanorequal", "greaterthanorequal(c0, c1)") diff --git a/bolt/functions/sparksql/tests/ComparisonsTest.cpp b/bolt/functions/sparksql/tests/ComparisonsTest.cpp index 0d2c9c203..b848353df 100644 --- a/bolt/functions/sparksql/tests/ComparisonsTest.cpp +++ b/bolt/functions/sparksql/tests/ComparisonsTest.cpp @@ -30,6 +30,7 @@ #include "bolt/common/base/tests/GTestUtils.h" #include "bolt/functions/sparksql/tests/SparkFunctionBaseTest.h" +#include "bolt/vector/DecodedVector.h" #include namespace bytedance::bolt::functions::sparksql::test { @@ -94,6 +95,8 @@ class ComparisonsTest : public SparkFunctionBaseTest { auto right = rVector->template as>()->rawValues(); auto constVector = fuzzer.fuzzConstant(type); auto constant = constVector->template as>()->value(); + auto lDictVector = fuzzer.fuzzDictionary(lVector); + auto rDictVector = fuzzer.fuzzDictionary(rVector); SelectivityVector rows(size); rows.setValidRange(0, unSelectedRows, false); @@ -112,10 +115,9 @@ class ComparisonsTest : public SparkFunctionBaseTest { // Flat, Const std::vector rConstVectors = {lVector, constVector}; rowVector = fuzzer.fuzzRow(std::move(rConstVectors), {"c0", "c1"}, size); - result = - evaluate>("greaterthanorequal(c0, c1)", rowVector); + result = evaluate>("equalto(c0, c1)", rowVector); for (auto i = unSelectedRows; i < size; i++) { - expectedResult[i] = left[i] >= constant; + expectedResult[i] = left[i] == constant; } bolt::test::assertEqualVectors( makeFlatVector(expectedResult), result, rows); @@ -123,10 +125,43 @@ class ComparisonsTest : public SparkFunctionBaseTest { // Const, Flat std::vector lConstVectors = {constVector, rVector}; rowVector = fuzzer.fuzzRow(std::move(lConstVectors), {"c0", "c1"}, size); - result = - evaluate>("greaterthanorequal(c0, c1)", rowVector); + result = evaluate>("lessthan(c0, c1)", rowVector); + for (auto i = unSelectedRows; i < size; i++) { + expectedResult[i] = constant < right[i]; + } + bolt::test::assertEqualVectors( + makeFlatVector(expectedResult), result, rows); + + // Dict(Flat), Flat + childrenVectors = {lDictVector, rVector}; + rowVector = fuzzer.fuzzRow(std::move(childrenVectors), {"c0", "c1"}, size); + result = evaluate>("lessthanorequal(c0, c1)", rowVector); + DecodedVector lDecodedVector(*lDictVector); + for (auto i = unSelectedRows; i < size; i++) { + expectedResult[i] = lDecodedVector.valueAt(i) <= right[i]; + } + bolt::test::assertEqualVectors( + makeFlatVector(expectedResult), result, rows); + + // Flat, Dict(Flat) + childrenVectors = {lVector, rDictVector}; + rowVector = fuzzer.fuzzRow(std::move(childrenVectors), {"c0", "c1"}, size); + result = evaluate>("greaterthan(c0, c1)", rowVector); + DecodedVector rDecodedVector(*rDictVector); + for (auto i = unSelectedRows; i < size; i++) { + expectedResult[i] = left[i] > rDecodedVector.valueAt(i); + } + bolt::test::assertEqualVectors( + makeFlatVector(expectedResult), result, rows); + + // Dict(Dict(Flat)), Flat + auto lDictDictVector = fuzzer.fuzzDictionary(lDictVector); + childrenVectors = {lDictDictVector, rVector}; + rowVector = fuzzer.fuzzRow(std::move(childrenVectors), {"c0", "c1"}, size); + result = evaluate>("lessthanorequal(c0, c1)", rowVector); + DecodedVector decodedVector(*lDictDictVector); for (auto i = unSelectedRows; i < size; i++) { - expectedResult[i] = constant >= right[i]; + expectedResult[i] = decodedVector.valueAt(i) <= right[i]; } bolt::test::assertEqualVectors( makeFlatVector(expectedResult), result, rows); diff --git a/bolt/functions/sparksql/tests/DecimalCompareTest.cpp b/bolt/functions/sparksql/tests/DecimalCompareTest.cpp index 00a519bad..558a9b3bb 100644 --- a/bolt/functions/sparksql/tests/DecimalCompareTest.cpp +++ b/bolt/functions/sparksql/tests/DecimalCompareTest.cpp @@ -183,6 +183,26 @@ TEST_F(DecimalCompareTest, gt) { makeFlatVector( 130, [](auto row) { return row % 2 == 0 ? false : true; }), row); + + // Dict(Flat), Flat + testCompareExpr( + "decimal_greaterthan(c0, c1)", + { + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + }, + makeFlatVector( + 140, [](auto row) { return row % 2 == 0 ? false : true; })); } TEST_F(DecimalCompareTest, gte) { @@ -318,6 +338,25 @@ TEST_F(DecimalCompareTest, gte) { }, makeFlatVector(130, [](auto /*row*/) { return true; }), row); + + // Dict(Flat), Flat + testCompareExpr( + "decimal_greaterthanorequal(c0, c1)", + { + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + }, + makeFlatVector(140, [](auto /*row*/) { return true; })); } TEST_F(DecimalCompareTest, eq) { @@ -455,6 +494,26 @@ TEST_F(DecimalCompareTest, eq) { makeFlatVector( 130, [](auto row) { return row % 2 == 0 ? true : false; }), row); + + // Flat, Dict(Flat) + testCompareExpr( + "decimal_equalto(c0, c1)", + { + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + }, + makeFlatVector( + 140, [](auto row) { return row % 2 == 0 ? true : false; })); } TEST_F(DecimalCompareTest, neq) { @@ -591,6 +650,26 @@ TEST_F(DecimalCompareTest, neq) { makeFlatVector( 130, [](auto row) { return row % 2 == 0 ? false : true; }), row); + + // Dict(Flat), Flat + testCompareExpr( + "decimal_notequalto(c0, c1)", + { + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + }, + makeFlatVector( + 140, [](auto row) { return row % 2 == 0 ? false : true; })); } TEST_F(DecimalCompareTest, lt) { @@ -726,6 +805,25 @@ TEST_F(DecimalCompareTest, lt) { }, makeFlatVector(130, [](auto /*row*/) { return false; }), row); + + // Dict(Flat), Flat + testCompareExpr( + "decimal_lessthan(c0, c1)", + { + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + }, + makeFlatVector(140, [](auto /*row*/) { return false; })); } TEST_F(DecimalCompareTest, lte) { @@ -863,6 +961,26 @@ TEST_F(DecimalCompareTest, lte) { makeFlatVector( 130, [](auto row) { return row % 2 == 0 ? true : false; }), row); + + // Dict(Flat), Flat + testCompareExpr( + "decimal_lessthanorequal(c0, c1)", + { + wrapInDictionary( + makeIndices(140, [](auto row) { return row % 2; }), + makeFlatVector( + 2, + [](auto row) { return row % 2 == 0 ? 1000 : 3000; }, + nullptr, + DECIMAL(6, 2))), + makeFlatVector( + 140, + [](auto row) { return row % 2 == 0 ? 100 : 130; }, + nullptr, + DECIMAL(5, 1)), + }, + makeFlatVector( + 140, [](auto row) { return row % 2 == 0 ? true : false; })); } } // namespace From 37d39e7b64bde334c19d30f689a3861a4b6071c5 Mon Sep 17 00:00:00 2001 From: Gu Haiyan Date: Fri, 6 Mar 2026 19:30:11 +0800 Subject: [PATCH 2/3] [fix] fix ABFS compilation error --- CMakeLists.txt | 5 +---- Makefile | 3 ++- bolt/CMakeLists.txt | 1 + .../hive/storage_adapters/abfs/AbfsFileSystem.cpp | 2 -- conanfile.py | 12 ++++++++++++ 5 files changed, 16 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6c6266540..8dac155df 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -263,7 +263,6 @@ if(${BOLT_BUILD_BENCHMARKS}) set(BOLT_BUILD_TEST_UTILS ON) set(BOLT_BUILD_TESTING OFF) set(BOLT_ENABLE_EXAMPLES OFF) - set(BOLT_ENABLE_ABFS OFF) set(BOLT_ENABLE_SUBSTRAIT OFF) set(BOLT_CODEGEN_SUPPORT OFF) endif() @@ -369,9 +368,7 @@ if(BOLT_ENABLE_ABFS) if(AZURESDK_ROOT_DIR) list(APPEND CMAKE_PREFIX_PATH ${AZURESDK_ROOT_DIR}) endif() - # files-datalake is built on blobs - find_package(azure-storage-blobs-cpp CONFIG REQUIRED) - find_package(azure-storage-files-datalake-cpp CONFIG REQUIRED) + find_package(AzureSDK CONFIG REQUIRED) add_definitions(-DBOLT_ENABLE_ABFS) endif() diff --git a/Makefile b/Makefile index b9b276140..fee7ed47d 100644 --- a/Makefile +++ b/Makefile @@ -229,7 +229,8 @@ compile_db_all: $(MAKE) _compile_db \ BUILD_TYPE=Release \ BOLT_BUILD_BENCHMARKS="ON" \ - CONAN_OPTIONS=" -o bolt/*:spark_compatible=True -o bolt/*:enable_testutil=True -o bolt/*:enable_s3=True -o bolt/*:enable_gcs=True" \ + CONAN_OPTIONS=" -o bolt/*:spark_compatible=True -o bolt/*:enable_testutil=True -o bolt/*:enable_s3=True \ + -o bolt/*:enable_gcs=True -o bolt/*:enable_abfs=True" \ CONAN_CONFIG=" -c tools.build:skip_test=False" export_base: diff --git a/bolt/CMakeLists.txt b/bolt/CMakeLists.txt index 38225f01c..b3108e176 100644 --- a/bolt/CMakeLists.txt +++ b/bolt/CMakeLists.txt @@ -269,6 +269,7 @@ target_link_libraries(bolt_engine PUBLIC $<$:celeborn-cpp-client::celeborn-cpp-client> $<$:aws-sdk-cpp::aws-sdk-cpp> $<$:google-cloud-cpp::storage> + $<$:Azure::azure-storage-blobs> CURL::libcurl ) diff --git a/bolt/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp b/bolt/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp index 6329f5454..f2e37b6b8 100644 --- a/bolt/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp +++ b/bolt/connectors/hive/storage_adapters/abfs/AbfsFileSystem.cpp @@ -31,7 +31,6 @@ #include "bolt/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h" #include "bolt/common/file/File.h" #include "bolt/connectors/hive/HiveConfig.h" -#include "bolt/connectors/hive/storage_adapters/abfs/AbfsReadFile.h" #include "bolt/connectors/hive/storage_adapters/abfs/AbfsUtil.h" #include "bolt/core/Config.h" @@ -44,7 +43,6 @@ #include "bolt/common/file/File.h" #include "bolt/connectors/hive/HiveConfig.h" #include "bolt/connectors/hive/storage_adapters/abfs/AbfsReadFile.h" -#include "bolt/connectors/hive/storage_adapters/abfs/AbfsWriteFile.h" namespace bytedance::bolt::filesystems::abfs { using namespace Azure::Storage::Blobs; class AbfsConfig { diff --git a/conanfile.py b/conanfile.py index 1965736b3..52df5de92 100644 --- a/conanfile.py +++ b/conanfile.py @@ -86,6 +86,7 @@ class BoltConan(ConanFile): "enable_hdfs": [True, False], "enable_s3": [True, False], "enable_gcs": [True, False], + "enable_abfs": [True, False], "use_arrow_hdfs": [True, False], "enable_asan": [True, False], "enable_jit": [True, False], @@ -112,6 +113,7 @@ class BoltConan(ConanFile): "enable_hdfs": True, "enable_s3": False, "enable_gcs": False, + "enable_abfs": False, "use_arrow_hdfs": True, "enable_arrow_connector": False, "enable_jit": True, @@ -196,6 +198,12 @@ def requirements(self): transitive_headers=True, transitive_libs=True, ) + if self.options.get_safe("enable_abfs"): + self.requires( + "azure-sdk-for-cpp/1.16.1", + transitive_headers=True, + transitive_libs=True, + ) self.requires("simdjson/3.12.3", transitive_headers=True) self.requires( "sonic-cpp/1.0.2-bolt", transitive_headers=True, transitive_libs=True @@ -504,6 +512,10 @@ def generate(self): if self.options.get_safe("enable_gcs"): tc.cache_variables["BOLT_ENABLE_GCS"] = "ON" + tc.cache_variables["BOLT_ENABLE_ABFS"] = "OFF" + if self.options.get_safe("enable_abfs"): + tc.cache_variables["BOLT_ENABLE_ABFS"] = "ON" + tc.cache_variables["BOLT_FORCE_COLORED_OUTPUT"] = "ON" if self.options.enable_crc: tc.cache_variables["BOLT_ENABLE_CRC"] = "ON" From ae45428be463486849f106c55abd38f23ce0a3fd Mon Sep 17 00:00:00 2001 From: guhaiyan0221 <142767528+guhaiyan0221@users.noreply.github.com> Date: Fri, 6 Mar 2026 20:37:01 +0800 Subject: [PATCH 3/3] [feat][cp] Support ENUM type in parquet reader (#340) --- bolt/dwio/parquet/reader/ParquetReader.cpp | 8 +++++- .../parquet/tests/examples/enum_type.parquet | Bin 0 -> 391 bytes .../tests/reader/ParquetReaderTest.cpp | 25 ++++++++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 bolt/dwio/parquet/tests/examples/enum_type.parquet diff --git a/bolt/dwio/parquet/reader/ParquetReader.cpp b/bolt/dwio/parquet/reader/ParquetReader.cpp index 132bf869e..bf6d757be 100644 --- a/bolt/dwio/parquet/reader/ParquetReader.cpp +++ b/bolt/dwio/parquet/reader/ParquetReader.cpp @@ -928,10 +928,16 @@ TypePtr ReaderBase::convertType( "{} converted type can only be set for thrift::Type::(FIXED_LEN_)BYTE_ARRAY.", thrift::to_string(schemaElement.converted_type)); } + case thrift::ConvertedType::ENUM: { + BOLT_CHECK_EQ( + schemaElement.type, + thrift::Type::BYTE_ARRAY, + "ENUM converted type can only be set for value of thrift::Type::BYTE_ARRAY"); + return VARCHAR(); + } case thrift::ConvertedType::MAP: case thrift::ConvertedType::MAP_KEY_VALUE: case thrift::ConvertedType::LIST: - case thrift::ConvertedType::ENUM: case thrift::ConvertedType::TIME_MILLIS: case thrift::ConvertedType::TIME_MICROS: case thrift::ConvertedType::BSON: diff --git a/bolt/dwio/parquet/tests/examples/enum_type.parquet b/bolt/dwio/parquet/tests/examples/enum_type.parquet new file mode 100644 index 0000000000000000000000000000000000000000..90b53745ce921e8ce7212cc38104cf3e1e3b3e3b GIT binary patch literal 391 zcmYk3%}T>S5P-Ka#0aHO>4YpSm`g*I8k(g4c(5Ksq}b9IFzI&F2AZ^{sd_BpO%Sg> zhY#Svn+K0QdGQ%UJnE+P;K0uA{CqnzOt;lDK?DMa@cMK6kUrpsh|tcRjMhRu$C-Y9 z52fDOeG9!b@kfRS0Kqmi;rV^>BomN0BmpP>VSUGOyy3tT(4+1zw>X2d0#-MUT0O3h zI~@?P!kt73VZr=P42T?Bs^l}Yx{$IEAg3%-u!j;x(q+!&l5W)ORpiGk zGJ?r~jg8O^n7YZX-C#OqNIm^)Z$F5%OBT)iNvN5IX&IJQ=+FJ}K&#c5U840;!?x>H iMk^MzspnRyUGCSIXWN!(TNZ7UcQGM+m@j_1dnumberOfRows(), 3ULL); + + auto rowType = reader->typeWithId(); + EXPECT_EQ(rowType->type()->kind(), TypeKind::ROW); + EXPECT_EQ(rowType->size(), 1ULL); + + EXPECT_EQ(rowType->childAt(0)->type()->kind(), TypeKind::VARCHAR); + + auto fileSchema = ROW({"test"}, {VARCHAR()}); + auto rowReaderOpts = getReaderOpts(fileSchema); + rowReaderOpts.setScanSpec(makeScanSpec(fileSchema)); + auto rowReader = reader->createRowReader(rowReaderOpts); + + auto expected = + makeRowVector({makeFlatVector({"FOO", "BAR", "FOO"})}); + + assertReadWithReaderAndExpected(fileSchema, *rowReader, expected, *leafPool_); +} + TEST_F(ParquetReaderTest, readBinaryAsStringFromNation) { const std::string filename("nation.parquet"); const std::string sample(getExampleFilePath(filename));