From 91cdef8aa2dddc8b429d00c8a71731a3f15fa30d Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Tue, 28 Oct 2025 13:57:18 -0700 Subject: [PATCH 01/10] mac vcpkg --- ci/vcpkg/vcpkg.json | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/ci/vcpkg/vcpkg.json b/ci/vcpkg/vcpkg.json index 5a46da815c3..eefe0098ba2 100644 --- a/ci/vcpkg/vcpkg.json +++ b/ci/vcpkg/vcpkg.json @@ -90,6 +90,27 @@ } ] }, + "gandiva-llvm": { + "description": "Gandiva support", + "dependencies": [ + { + "name": "llvm", + "default-features": false, + "version>=": "21.1.1", + "features": [ + "clang", + "default-targets", + "enable-bindings", + "enable-terminfo", + "enable-zlib", + "enable-zstd", + "enable-rtti", + "lld", + "tools" + ] + } + ] + }, "gcs": { "description": "GCS support", "dependencies": [ From 4003cba24f57794b9b0342a623fab099c5589329 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 17 Nov 2025 14:55:22 -0800 Subject: [PATCH 02/10] Add cpu attrs to llvm code generation. --- cpp/src/gandiva/engine.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index c91afd7c4b7..5806413919d 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -305,6 +305,7 @@ Engine::Engine(const std::shared_ptr& conf, // LLVM 10 doesn't like the expr function name to be the same as the module name auto module_id = "gdv_module_" + std::to_string(reinterpret_cast(this)); module_ = std::make_unique(module_id, *context_); + module_->setDataLayout(target_machine_->createDataLayout()); } Engine::~Engine() {} From bdb46f5b639bf0dae097bd8425b3962584d4e3d9 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Mon, 17 Nov 2025 16:48:15 -0800 Subject: [PATCH 03/10] unit test --- cpp/src/gandiva/CMakeLists.txt | 1 + cpp/src/gandiva/target_datalayout_test.cc | 205 ++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 cpp/src/gandiva/target_datalayout_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index a7f5f9dacff..44ebfb2c413 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -276,6 +276,7 @@ add_gandiva_test(internals-test hash_utils_test.cc gdv_function_stubs_test.cc interval_holder_test.cc + target_datalayout_test.cc tests/test_util.cc EXTRA_LINK_LIBS re2::re2 diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc new file mode 100644 index 00000000000..b5421ca25ea --- /dev/null +++ b/cpp/src/gandiva/target_datalayout_test.cc @@ -0,0 +1,205 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "gandiva/llvm_generator.h" +#include "gandiva/tests/test_util.h" + +namespace gandiva { + +class TestTargetDataLayout : public ::testing::Test { + protected: + void SetUp() override {} +}; + +// Test that verifies the target data layout string representation +// is consistent with the CPU architecture. This test is portable +// across different architectures. +TEST_F(TestTargetDataLayout, VerifyDataLayoutForArchitecture) { + // Create an LLVM generator with default configuration + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); + + // Get the module from the generator + llvm::Module* module = generator->module(); + ASSERT_NE(module, nullptr); + + // Get the data layout from the module + const llvm::DataLayout& data_layout = module->getDataLayout(); + std::string data_layout_str = data_layout.getStringRepresentation(); + + // Verify that the data layout string is not empty + EXPECT_FALSE(data_layout_str.empty()); + + // Get the host CPU architecture information + std::string host_cpu = llvm::sys::getHostCPUName().str(); + std::string triple = llvm::sys::getDefaultTargetTriple(); + + // Log the information for debugging + std::cout << "Host CPU: " << host_cpu << std::endl; + std::cout << "Target Triple: " << triple << std::endl; + std::cout << "Data Layout: " << data_layout_str << std::endl; + + // Verify basic data layout properties based on architecture + // The data layout string should contain endianness information + // 'e' for little-endian, 'E' for big-endian + EXPECT_TRUE(data_layout_str[0] == 'e' || data_layout_str[0] == 'E') + << "Data layout should start with endianness specifier"; + + // Check for pointer size information in the data layout + // Most modern architectures use 64-bit pointers + if (triple.find("x86_64") != std::string::npos || + triple.find("aarch64") != std::string::npos || + triple.find("arm64") != std::string::npos) { + // For 64-bit architectures, expect pointer size to be 64 bits + EXPECT_NE(data_layout_str.find("p:64:64"), std::string::npos) + << "Expected 64-bit pointer size for 64-bit architecture"; + } else if (triple.find("i386") != std::string::npos || + triple.find("i686") != std::string::npos || + triple.find("arm") != std::string::npos) { + // For 32-bit architectures, expect pointer size to be 32 bits + EXPECT_NE(data_layout_str.find("p:32:32"), std::string::npos) + << "Expected 32-bit pointer size for 32-bit architecture"; + } + + // Verify that integer alignments are present + EXPECT_NE(data_layout_str.find("i"), std::string::npos) + << "Data layout should contain integer type information"; + + // Verify endianness consistency + if (data_layout_str[0] == 'e') { + std::cout << "Architecture is little-endian" << std::endl; + } else if (data_layout_str[0] == 'E') { + std::cout << "Architecture is big-endian" << std::endl; + } + + // Additional verification: Check that the data layout is valid + // by querying specific properties + EXPECT_GT(data_layout.getPointerSize(), 0u) + << "Pointer size should be greater than 0"; + EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()) + << "Data layout should have a defined endianness"; +} + +// Test that verifies data layout consistency across multiple generator instances +TEST_F(TestTargetDataLayout, VerifyDataLayoutConsistency) { + // Create two LLVM generators with the same configuration + ASSERT_OK_AND_ASSIGN(auto generator1, LLVMGenerator::Make(TestConfiguration(), false)); + ASSERT_OK_AND_ASSIGN(auto generator2, LLVMGenerator::Make(TestConfiguration(), false)); + + // Get the data layout from both modules + const llvm::DataLayout& data_layout1 = generator1->module()->getDataLayout(); + const llvm::DataLayout& data_layout2 = generator2->module()->getDataLayout(); + + std::string data_layout_str1 = data_layout1.getStringRepresentation(); + std::string data_layout_str2 = data_layout2.getStringRepresentation(); + + // Verify that both generators produce the same data layout + EXPECT_EQ(data_layout_str1, data_layout_str2) + << "Data layout should be consistent across generator instances"; + + // Verify that pointer sizes match + EXPECT_EQ(data_layout1.getPointerSize(), data_layout2.getPointerSize()) + << "Pointer sizes should match"; + + // Verify that endianness matches + EXPECT_EQ(data_layout1.isLittleEndian(), data_layout2.isLittleEndian()) + << "Endianness should match"; +} + +// Test that verifies specific data layout properties for common types +TEST_F(TestTargetDataLayout, VerifyTypeAlignments) { + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); + + llvm::Module* module = generator->module(); + const llvm::DataLayout& data_layout = module->getDataLayout(); + + // Get the LLVM context and types + llvm::LLVMContext* context = generator->context(); + LLVMTypes* types = generator->types(); + + // Verify alignment for common integer types + EXPECT_GT(data_layout.getABITypeAlign(types->i8_type()).value(), 0u) + << "i8 type should have non-zero alignment"; + EXPECT_GT(data_layout.getABITypeAlign(types->i32_type()).value(), 0u) + << "i32 type should have non-zero alignment"; + EXPECT_GT(data_layout.getABITypeAlign(types->i64_type()).value(), 0u) + << "i64 type should have non-zero alignment"; + + // Verify alignment for pointer types + EXPECT_GT(data_layout.getABITypeAlign(types->i8_ptr_type()).value(), 0u) + << "i8* type should have non-zero alignment"; + + // Verify type sizes + EXPECT_EQ(data_layout.getTypeAllocSize(types->i8_type()), 1u) + << "i8 type should be 1 byte"; + EXPECT_EQ(data_layout.getTypeAllocSize(types->i32_type()), 4u) + << "i32 type should be 4 bytes"; + EXPECT_EQ(data_layout.getTypeAllocSize(types->i64_type()), 8u) + << "i64 type should be 8 bytes"; + + // Log the data layout string for reference + std::cout << "Data Layout: " << data_layout.getStringRepresentation() << std::endl; +} + +// Test that verifies data layout for different CPU configurations +TEST_F(TestTargetDataLayout, VerifyDataLayoutForHostCPU) { + // Create a configuration that targets the host CPU + auto config = TestConfiguration(); + + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); + + llvm::Module* module = generator->module(); + const llvm::DataLayout& data_layout = module->getDataLayout(); + std::string data_layout_str = data_layout.getStringRepresentation(); + + // Get host information + std::string host_cpu = llvm::sys::getHostCPUName().str(); + std::string triple = llvm::sys::getDefaultTargetTriple(); + + std::cout << "Testing with Host CPU: " << host_cpu << std::endl; + std::cout << "Target Triple: " << triple << std::endl; + std::cout << "Data Layout: " << data_layout_str << std::endl; + + // Verify that the data layout is appropriate for the host + EXPECT_FALSE(data_layout_str.empty()); + + // Check architecture-specific properties + if (triple.find("x86_64") != std::string::npos) { + // x86_64 should be little-endian + EXPECT_EQ(data_layout_str[0], 'e') << "x86_64 should be little-endian"; + // x86_64 should have 64-bit pointers + EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) + << "x86_64 should have 64-bit pointers"; + } else if (triple.find("aarch64") != std::string::npos || + triple.find("arm64") != std::string::npos) { + // ARM64 should be little-endian (most common configuration) + EXPECT_EQ(data_layout_str[0], 'e') << "ARM64 should be little-endian"; + // ARM64 should have 64-bit pointers + EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) + << "ARM64 should have 64-bit pointers"; + } + + // Verify that the data layout is valid by checking basic properties + EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()); + EXPECT_GT(data_layout.getPointerSize(), 0u); +} + +} // namespace gandiva + From a3833204e96f7c7b4f208eaebf01a46059378611 Mon Sep 17 00:00:00 2001 From: "logan.riggs@gmail.com" Date: Tue, 18 Nov 2025 01:25:08 +0000 Subject: [PATCH 04/10] Update test --- cpp/src/gandiva/target_datalayout_test.cc | 145 +--------------------- 1 file changed, 2 insertions(+), 143 deletions(-) diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc index b5421ca25ea..f5c1c8b9262 100644 --- a/cpp/src/gandiva/target_datalayout_test.cc +++ b/cpp/src/gandiva/target_datalayout_test.cc @@ -17,7 +17,7 @@ #include #include -#include +#include #include "gandiva/llvm_generator.h" #include "gandiva/tests/test_util.h" @@ -56,150 +56,9 @@ TEST_F(TestTargetDataLayout, VerifyDataLayoutForArchitecture) { std::cout << "Target Triple: " << triple << std::endl; std::cout << "Data Layout: " << data_layout_str << std::endl; - // Verify basic data layout properties based on architecture - // The data layout string should contain endianness information - // 'e' for little-endian, 'E' for big-endian - EXPECT_TRUE(data_layout_str[0] == 'e' || data_layout_str[0] == 'E') - << "Data layout should start with endianness specifier"; - - // Check for pointer size information in the data layout - // Most modern architectures use 64-bit pointers - if (triple.find("x86_64") != std::string::npos || - triple.find("aarch64") != std::string::npos || - triple.find("arm64") != std::string::npos) { - // For 64-bit architectures, expect pointer size to be 64 bits - EXPECT_NE(data_layout_str.find("p:64:64"), std::string::npos) - << "Expected 64-bit pointer size for 64-bit architecture"; - } else if (triple.find("i386") != std::string::npos || - triple.find("i686") != std::string::npos || - triple.find("arm") != std::string::npos) { - // For 32-bit architectures, expect pointer size to be 32 bits - EXPECT_NE(data_layout_str.find("p:32:32"), std::string::npos) - << "Expected 32-bit pointer size for 32-bit architecture"; - } - - // Verify that integer alignments are present - EXPECT_NE(data_layout_str.find("i"), std::string::npos) - << "Data layout should contain integer type information"; - - // Verify endianness consistency - if (data_layout_str[0] == 'e') { - std::cout << "Architecture is little-endian" << std::endl; - } else if (data_layout_str[0] == 'E') { - std::cout << "Architecture is big-endian" << std::endl; - } - - // Additional verification: Check that the data layout is valid - // by querying specific properties - EXPECT_GT(data_layout.getPointerSize(), 0u) - << "Pointer size should be greater than 0"; - EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()) - << "Data layout should have a defined endianness"; -} - -// Test that verifies data layout consistency across multiple generator instances -TEST_F(TestTargetDataLayout, VerifyDataLayoutConsistency) { - // Create two LLVM generators with the same configuration - ASSERT_OK_AND_ASSIGN(auto generator1, LLVMGenerator::Make(TestConfiguration(), false)); - ASSERT_OK_AND_ASSIGN(auto generator2, LLVMGenerator::Make(TestConfiguration(), false)); - - // Get the data layout from both modules - const llvm::DataLayout& data_layout1 = generator1->module()->getDataLayout(); - const llvm::DataLayout& data_layout2 = generator2->module()->getDataLayout(); - - std::string data_layout_str1 = data_layout1.getStringRepresentation(); - std::string data_layout_str2 = data_layout2.getStringRepresentation(); - - // Verify that both generators produce the same data layout - EXPECT_EQ(data_layout_str1, data_layout_str2) - << "Data layout should be consistent across generator instances"; - - // Verify that pointer sizes match - EXPECT_EQ(data_layout1.getPointerSize(), data_layout2.getPointerSize()) - << "Pointer sizes should match"; - - // Verify that endianness matches - EXPECT_EQ(data_layout1.isLittleEndian(), data_layout2.isLittleEndian()) - << "Endianness should match"; -} - -// Test that verifies specific data layout properties for common types -TEST_F(TestTargetDataLayout, VerifyTypeAlignments) { - ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); - - llvm::Module* module = generator->module(); - const llvm::DataLayout& data_layout = module->getDataLayout(); - - // Get the LLVM context and types - llvm::LLVMContext* context = generator->context(); - LLVMTypes* types = generator->types(); - - // Verify alignment for common integer types - EXPECT_GT(data_layout.getABITypeAlign(types->i8_type()).value(), 0u) - << "i8 type should have non-zero alignment"; - EXPECT_GT(data_layout.getABITypeAlign(types->i32_type()).value(), 0u) - << "i32 type should have non-zero alignment"; - EXPECT_GT(data_layout.getABITypeAlign(types->i64_type()).value(), 0u) - << "i64 type should have non-zero alignment"; - - // Verify alignment for pointer types - EXPECT_GT(data_layout.getABITypeAlign(types->i8_ptr_type()).value(), 0u) - << "i8* type should have non-zero alignment"; - - // Verify type sizes - EXPECT_EQ(data_layout.getTypeAllocSize(types->i8_type()), 1u) - << "i8 type should be 1 byte"; - EXPECT_EQ(data_layout.getTypeAllocSize(types->i32_type()), 4u) - << "i32 type should be 4 bytes"; - EXPECT_EQ(data_layout.getTypeAllocSize(types->i64_type()), 8u) - << "i64 type should be 8 bytes"; - - // Log the data layout string for reference - std::cout << "Data Layout: " << data_layout.getStringRepresentation() << std::endl; -} - -// Test that verifies data layout for different CPU configurations -TEST_F(TestTargetDataLayout, VerifyDataLayoutForHostCPU) { - // Create a configuration that targets the host CPU - auto config = TestConfiguration(); - - ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); - - llvm::Module* module = generator->module(); - const llvm::DataLayout& data_layout = module->getDataLayout(); - std::string data_layout_str = data_layout.getStringRepresentation(); - - // Get host information - std::string host_cpu = llvm::sys::getHostCPUName().str(); - std::string triple = llvm::sys::getDefaultTargetTriple(); - - std::cout << "Testing with Host CPU: " << host_cpu << std::endl; - std::cout << "Target Triple: " << triple << std::endl; - std::cout << "Data Layout: " << data_layout_str << std::endl; - - // Verify that the data layout is appropriate for the host + // Verify that the data layout string is not empty EXPECT_FALSE(data_layout_str.empty()); - // Check architecture-specific properties - if (triple.find("x86_64") != std::string::npos) { - // x86_64 should be little-endian - EXPECT_EQ(data_layout_str[0], 'e') << "x86_64 should be little-endian"; - // x86_64 should have 64-bit pointers - EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) - << "x86_64 should have 64-bit pointers"; - } else if (triple.find("aarch64") != std::string::npos || - triple.find("arm64") != std::string::npos) { - // ARM64 should be little-endian (most common configuration) - EXPECT_EQ(data_layout_str[0], 'e') << "ARM64 should be little-endian"; - // ARM64 should have 64-bit pointers - EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) - << "ARM64 should have 64-bit pointers"; } - - // Verify that the data layout is valid by checking basic properties - EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()); - EXPECT_GT(data_layout.getPointerSize(), 0u); -} - } // namespace gandiva From f41b1fba5328ed31396c143441ab9014a74cc78d Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Wed, 10 Dec 2025 15:28:50 -0800 Subject: [PATCH 05/10] Only build llvm in release mode --- ci/vcpkg/overlay/llvm/portfile.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ci/vcpkg/overlay/llvm/portfile.cmake b/ci/vcpkg/overlay/llvm/portfile.cmake index b6d5bdacd02..71f8f6d710f 100644 --- a/ci/vcpkg/overlay/llvm/portfile.cmake +++ b/ci/vcpkg/overlay/llvm/portfile.cmake @@ -2,6 +2,8 @@ set(VCPKG_POLICY_ALLOW_EMPTY_FOLDERS enabled) vcpkg_check_linkage(ONLY_STATIC_LIBRARY) +# Only build release configuration to speed up build time +set(VCPKG_BUILD_TYPE release) # [BOLT] Allow to compile with MSVC (#151189) vcpkg_download_distfile( From e0900961c23fc7967c0349a0e331f9022e8da351 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 12 Dec 2025 18:32:33 -0800 Subject: [PATCH 06/10] Revert "Update test" This reverts commit a3833204e96f7c7b4f208eaebf01a46059378611. --- cpp/src/gandiva/target_datalayout_test.cc | 145 +++++++++++++++++++++- 1 file changed, 143 insertions(+), 2 deletions(-) diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc index f5c1c8b9262..b5421ca25ea 100644 --- a/cpp/src/gandiva/target_datalayout_test.cc +++ b/cpp/src/gandiva/target_datalayout_test.cc @@ -17,7 +17,7 @@ #include #include -#include +#include #include "gandiva/llvm_generator.h" #include "gandiva/tests/test_util.h" @@ -56,9 +56,150 @@ TEST_F(TestTargetDataLayout, VerifyDataLayoutForArchitecture) { std::cout << "Target Triple: " << triple << std::endl; std::cout << "Data Layout: " << data_layout_str << std::endl; - // Verify that the data layout string is not empty + // Verify basic data layout properties based on architecture + // The data layout string should contain endianness information + // 'e' for little-endian, 'E' for big-endian + EXPECT_TRUE(data_layout_str[0] == 'e' || data_layout_str[0] == 'E') + << "Data layout should start with endianness specifier"; + + // Check for pointer size information in the data layout + // Most modern architectures use 64-bit pointers + if (triple.find("x86_64") != std::string::npos || + triple.find("aarch64") != std::string::npos || + triple.find("arm64") != std::string::npos) { + // For 64-bit architectures, expect pointer size to be 64 bits + EXPECT_NE(data_layout_str.find("p:64:64"), std::string::npos) + << "Expected 64-bit pointer size for 64-bit architecture"; + } else if (triple.find("i386") != std::string::npos || + triple.find("i686") != std::string::npos || + triple.find("arm") != std::string::npos) { + // For 32-bit architectures, expect pointer size to be 32 bits + EXPECT_NE(data_layout_str.find("p:32:32"), std::string::npos) + << "Expected 32-bit pointer size for 32-bit architecture"; + } + + // Verify that integer alignments are present + EXPECT_NE(data_layout_str.find("i"), std::string::npos) + << "Data layout should contain integer type information"; + + // Verify endianness consistency + if (data_layout_str[0] == 'e') { + std::cout << "Architecture is little-endian" << std::endl; + } else if (data_layout_str[0] == 'E') { + std::cout << "Architecture is big-endian" << std::endl; + } + + // Additional verification: Check that the data layout is valid + // by querying specific properties + EXPECT_GT(data_layout.getPointerSize(), 0u) + << "Pointer size should be greater than 0"; + EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()) + << "Data layout should have a defined endianness"; +} + +// Test that verifies data layout consistency across multiple generator instances +TEST_F(TestTargetDataLayout, VerifyDataLayoutConsistency) { + // Create two LLVM generators with the same configuration + ASSERT_OK_AND_ASSIGN(auto generator1, LLVMGenerator::Make(TestConfiguration(), false)); + ASSERT_OK_AND_ASSIGN(auto generator2, LLVMGenerator::Make(TestConfiguration(), false)); + + // Get the data layout from both modules + const llvm::DataLayout& data_layout1 = generator1->module()->getDataLayout(); + const llvm::DataLayout& data_layout2 = generator2->module()->getDataLayout(); + + std::string data_layout_str1 = data_layout1.getStringRepresentation(); + std::string data_layout_str2 = data_layout2.getStringRepresentation(); + + // Verify that both generators produce the same data layout + EXPECT_EQ(data_layout_str1, data_layout_str2) + << "Data layout should be consistent across generator instances"; + + // Verify that pointer sizes match + EXPECT_EQ(data_layout1.getPointerSize(), data_layout2.getPointerSize()) + << "Pointer sizes should match"; + + // Verify that endianness matches + EXPECT_EQ(data_layout1.isLittleEndian(), data_layout2.isLittleEndian()) + << "Endianness should match"; +} + +// Test that verifies specific data layout properties for common types +TEST_F(TestTargetDataLayout, VerifyTypeAlignments) { + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); + + llvm::Module* module = generator->module(); + const llvm::DataLayout& data_layout = module->getDataLayout(); + + // Get the LLVM context and types + llvm::LLVMContext* context = generator->context(); + LLVMTypes* types = generator->types(); + + // Verify alignment for common integer types + EXPECT_GT(data_layout.getABITypeAlign(types->i8_type()).value(), 0u) + << "i8 type should have non-zero alignment"; + EXPECT_GT(data_layout.getABITypeAlign(types->i32_type()).value(), 0u) + << "i32 type should have non-zero alignment"; + EXPECT_GT(data_layout.getABITypeAlign(types->i64_type()).value(), 0u) + << "i64 type should have non-zero alignment"; + + // Verify alignment for pointer types + EXPECT_GT(data_layout.getABITypeAlign(types->i8_ptr_type()).value(), 0u) + << "i8* type should have non-zero alignment"; + + // Verify type sizes + EXPECT_EQ(data_layout.getTypeAllocSize(types->i8_type()), 1u) + << "i8 type should be 1 byte"; + EXPECT_EQ(data_layout.getTypeAllocSize(types->i32_type()), 4u) + << "i32 type should be 4 bytes"; + EXPECT_EQ(data_layout.getTypeAllocSize(types->i64_type()), 8u) + << "i64 type should be 8 bytes"; + + // Log the data layout string for reference + std::cout << "Data Layout: " << data_layout.getStringRepresentation() << std::endl; +} + +// Test that verifies data layout for different CPU configurations +TEST_F(TestTargetDataLayout, VerifyDataLayoutForHostCPU) { + // Create a configuration that targets the host CPU + auto config = TestConfiguration(); + + ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); + + llvm::Module* module = generator->module(); + const llvm::DataLayout& data_layout = module->getDataLayout(); + std::string data_layout_str = data_layout.getStringRepresentation(); + + // Get host information + std::string host_cpu = llvm::sys::getHostCPUName().str(); + std::string triple = llvm::sys::getDefaultTargetTriple(); + + std::cout << "Testing with Host CPU: " << host_cpu << std::endl; + std::cout << "Target Triple: " << triple << std::endl; + std::cout << "Data Layout: " << data_layout_str << std::endl; + + // Verify that the data layout is appropriate for the host EXPECT_FALSE(data_layout_str.empty()); + // Check architecture-specific properties + if (triple.find("x86_64") != std::string::npos) { + // x86_64 should be little-endian + EXPECT_EQ(data_layout_str[0], 'e') << "x86_64 should be little-endian"; + // x86_64 should have 64-bit pointers + EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) + << "x86_64 should have 64-bit pointers"; + } else if (triple.find("aarch64") != std::string::npos || + triple.find("arm64") != std::string::npos) { + // ARM64 should be little-endian (most common configuration) + EXPECT_EQ(data_layout_str[0], 'e') << "ARM64 should be little-endian"; + // ARM64 should have 64-bit pointers + EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) + << "ARM64 should have 64-bit pointers"; } + + // Verify that the data layout is valid by checking basic properties + EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()); + EXPECT_GT(data_layout.getPointerSize(), 0u); +} + } // namespace gandiva From 4fea9b09059d806946ecfce781f7706ec136b246 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 12 Dec 2025 18:32:44 -0800 Subject: [PATCH 07/10] Revert "unit test" This reverts commit bdb46f5b639bf0dae097bd8425b3962584d4e3d9. --- cpp/src/gandiva/CMakeLists.txt | 1 - cpp/src/gandiva/target_datalayout_test.cc | 205 ---------------------- 2 files changed, 206 deletions(-) delete mode 100644 cpp/src/gandiva/target_datalayout_test.cc diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 44ebfb2c413..a7f5f9dacff 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -276,7 +276,6 @@ add_gandiva_test(internals-test hash_utils_test.cc gdv_function_stubs_test.cc interval_holder_test.cc - target_datalayout_test.cc tests/test_util.cc EXTRA_LINK_LIBS re2::re2 diff --git a/cpp/src/gandiva/target_datalayout_test.cc b/cpp/src/gandiva/target_datalayout_test.cc deleted file mode 100644 index b5421ca25ea..00000000000 --- a/cpp/src/gandiva/target_datalayout_test.cc +++ /dev/null @@ -1,205 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include -#include -#include - -#include "gandiva/llvm_generator.h" -#include "gandiva/tests/test_util.h" - -namespace gandiva { - -class TestTargetDataLayout : public ::testing::Test { - protected: - void SetUp() override {} -}; - -// Test that verifies the target data layout string representation -// is consistent with the CPU architecture. This test is portable -// across different architectures. -TEST_F(TestTargetDataLayout, VerifyDataLayoutForArchitecture) { - // Create an LLVM generator with default configuration - ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); - - // Get the module from the generator - llvm::Module* module = generator->module(); - ASSERT_NE(module, nullptr); - - // Get the data layout from the module - const llvm::DataLayout& data_layout = module->getDataLayout(); - std::string data_layout_str = data_layout.getStringRepresentation(); - - // Verify that the data layout string is not empty - EXPECT_FALSE(data_layout_str.empty()); - - // Get the host CPU architecture information - std::string host_cpu = llvm::sys::getHostCPUName().str(); - std::string triple = llvm::sys::getDefaultTargetTriple(); - - // Log the information for debugging - std::cout << "Host CPU: " << host_cpu << std::endl; - std::cout << "Target Triple: " << triple << std::endl; - std::cout << "Data Layout: " << data_layout_str << std::endl; - - // Verify basic data layout properties based on architecture - // The data layout string should contain endianness information - // 'e' for little-endian, 'E' for big-endian - EXPECT_TRUE(data_layout_str[0] == 'e' || data_layout_str[0] == 'E') - << "Data layout should start with endianness specifier"; - - // Check for pointer size information in the data layout - // Most modern architectures use 64-bit pointers - if (triple.find("x86_64") != std::string::npos || - triple.find("aarch64") != std::string::npos || - triple.find("arm64") != std::string::npos) { - // For 64-bit architectures, expect pointer size to be 64 bits - EXPECT_NE(data_layout_str.find("p:64:64"), std::string::npos) - << "Expected 64-bit pointer size for 64-bit architecture"; - } else if (triple.find("i386") != std::string::npos || - triple.find("i686") != std::string::npos || - triple.find("arm") != std::string::npos) { - // For 32-bit architectures, expect pointer size to be 32 bits - EXPECT_NE(data_layout_str.find("p:32:32"), std::string::npos) - << "Expected 32-bit pointer size for 32-bit architecture"; - } - - // Verify that integer alignments are present - EXPECT_NE(data_layout_str.find("i"), std::string::npos) - << "Data layout should contain integer type information"; - - // Verify endianness consistency - if (data_layout_str[0] == 'e') { - std::cout << "Architecture is little-endian" << std::endl; - } else if (data_layout_str[0] == 'E') { - std::cout << "Architecture is big-endian" << std::endl; - } - - // Additional verification: Check that the data layout is valid - // by querying specific properties - EXPECT_GT(data_layout.getPointerSize(), 0u) - << "Pointer size should be greater than 0"; - EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()) - << "Data layout should have a defined endianness"; -} - -// Test that verifies data layout consistency across multiple generator instances -TEST_F(TestTargetDataLayout, VerifyDataLayoutConsistency) { - // Create two LLVM generators with the same configuration - ASSERT_OK_AND_ASSIGN(auto generator1, LLVMGenerator::Make(TestConfiguration(), false)); - ASSERT_OK_AND_ASSIGN(auto generator2, LLVMGenerator::Make(TestConfiguration(), false)); - - // Get the data layout from both modules - const llvm::DataLayout& data_layout1 = generator1->module()->getDataLayout(); - const llvm::DataLayout& data_layout2 = generator2->module()->getDataLayout(); - - std::string data_layout_str1 = data_layout1.getStringRepresentation(); - std::string data_layout_str2 = data_layout2.getStringRepresentation(); - - // Verify that both generators produce the same data layout - EXPECT_EQ(data_layout_str1, data_layout_str2) - << "Data layout should be consistent across generator instances"; - - // Verify that pointer sizes match - EXPECT_EQ(data_layout1.getPointerSize(), data_layout2.getPointerSize()) - << "Pointer sizes should match"; - - // Verify that endianness matches - EXPECT_EQ(data_layout1.isLittleEndian(), data_layout2.isLittleEndian()) - << "Endianness should match"; -} - -// Test that verifies specific data layout properties for common types -TEST_F(TestTargetDataLayout, VerifyTypeAlignments) { - ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(TestConfiguration(), false)); - - llvm::Module* module = generator->module(); - const llvm::DataLayout& data_layout = module->getDataLayout(); - - // Get the LLVM context and types - llvm::LLVMContext* context = generator->context(); - LLVMTypes* types = generator->types(); - - // Verify alignment for common integer types - EXPECT_GT(data_layout.getABITypeAlign(types->i8_type()).value(), 0u) - << "i8 type should have non-zero alignment"; - EXPECT_GT(data_layout.getABITypeAlign(types->i32_type()).value(), 0u) - << "i32 type should have non-zero alignment"; - EXPECT_GT(data_layout.getABITypeAlign(types->i64_type()).value(), 0u) - << "i64 type should have non-zero alignment"; - - // Verify alignment for pointer types - EXPECT_GT(data_layout.getABITypeAlign(types->i8_ptr_type()).value(), 0u) - << "i8* type should have non-zero alignment"; - - // Verify type sizes - EXPECT_EQ(data_layout.getTypeAllocSize(types->i8_type()), 1u) - << "i8 type should be 1 byte"; - EXPECT_EQ(data_layout.getTypeAllocSize(types->i32_type()), 4u) - << "i32 type should be 4 bytes"; - EXPECT_EQ(data_layout.getTypeAllocSize(types->i64_type()), 8u) - << "i64 type should be 8 bytes"; - - // Log the data layout string for reference - std::cout << "Data Layout: " << data_layout.getStringRepresentation() << std::endl; -} - -// Test that verifies data layout for different CPU configurations -TEST_F(TestTargetDataLayout, VerifyDataLayoutForHostCPU) { - // Create a configuration that targets the host CPU - auto config = TestConfiguration(); - - ASSERT_OK_AND_ASSIGN(auto generator, LLVMGenerator::Make(config, false)); - - llvm::Module* module = generator->module(); - const llvm::DataLayout& data_layout = module->getDataLayout(); - std::string data_layout_str = data_layout.getStringRepresentation(); - - // Get host information - std::string host_cpu = llvm::sys::getHostCPUName().str(); - std::string triple = llvm::sys::getDefaultTargetTriple(); - - std::cout << "Testing with Host CPU: " << host_cpu << std::endl; - std::cout << "Target Triple: " << triple << std::endl; - std::cout << "Data Layout: " << data_layout_str << std::endl; - - // Verify that the data layout is appropriate for the host - EXPECT_FALSE(data_layout_str.empty()); - - // Check architecture-specific properties - if (triple.find("x86_64") != std::string::npos) { - // x86_64 should be little-endian - EXPECT_EQ(data_layout_str[0], 'e') << "x86_64 should be little-endian"; - // x86_64 should have 64-bit pointers - EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) - << "x86_64 should have 64-bit pointers"; - } else if (triple.find("aarch64") != std::string::npos || - triple.find("arm64") != std::string::npos) { - // ARM64 should be little-endian (most common configuration) - EXPECT_EQ(data_layout_str[0], 'e') << "ARM64 should be little-endian"; - // ARM64 should have 64-bit pointers - EXPECT_EQ(data_layout.getPointerSizeInBits(), 64u) - << "ARM64 should have 64-bit pointers"; - } - - // Verify that the data layout is valid by checking basic properties - EXPECT_TRUE(data_layout.isLittleEndian() || data_layout.isBigEndian()); - EXPECT_GT(data_layout.getPointerSize(), 0u); -} - -} // namespace gandiva - From efb1f9ba4b49baee88be6a22961808051c260114 Mon Sep 17 00:00:00 2001 From: Logan Riggs Date: Fri, 12 Dec 2025 18:32:54 -0800 Subject: [PATCH 08/10] Revert "Add cpu attrs to llvm code generation." This reverts commit 4003cba24f57794b9b0342a623fab099c5589329. --- cpp/src/gandiva/engine.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index 5806413919d..c91afd7c4b7 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -305,7 +305,6 @@ Engine::Engine(const std::shared_ptr& conf, // LLVM 10 doesn't like the expr function name to be the same as the module name auto module_id = "gdv_module_" + std::to_string(reinterpret_cast(this)); module_ = std::make_unique(module_id, *context_); - module_->setDataLayout(target_machine_->createDataLayout()); } Engine::~Engine() {} From 0abac3c37f583e84ebf12e9c2d2249e992ec2698 Mon Sep 17 00:00:00 2001 From: Tim Hurski Date: Thu, 18 Dec 2025 03:09:05 +0000 Subject: [PATCH 09/10] Fix Jar build --- cpp/src/gandiva/gdv_function_stubs.cc | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 070a1463a0e..55f6dd43cd1 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -1364,8 +1364,7 @@ arrow::Status ExportedStubFunctions::AddMappings(Engine* engine) const { }; engine->AddGlobalMappingForFunc( - "gdv_fn_cast_intervalday_utf8_int32", - types->i64_type() /*return_type*/, args, + "gdv_fn_cast_intervalday_utf8_int32", types->i64_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalday_utf8_int32)); // gdv_fn_cast_intervalyear_utf8 @@ -1382,21 +1381,6 @@ arrow::Status ExportedStubFunctions::AddMappings(Engine* engine) const { types->i32_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalyear_utf8)); -#define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION( \ - LLVM_TYPE, DATA_TYPE) \ - args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ - types->i64_type(), types->LLVM_TYPE##_ptr_type(), \ - types->i32_type(), types->i32_ptr_type()}; \ - engine->AddGlobalMappingForFunc( \ - "gdv_fn_populate_list_" #DATA_TYPE "_vector", \ - types->i32_type() /*return_type*/, args, \ - reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); - - ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i32, int32_t) - ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i64, int64_t) - ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(float, float) - ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(double, double) - // gdv_fn_cast_intervalyear_utf8_int32 args = { types->i64_type(), // context From 5f07fba716529a763944ee6d5c4f754b8cdde5dc Mon Sep 17 00:00:00 2001 From: DenisTarasyuk <131180287+DenisTarasyuk@users.noreply.github.com> Date: Fri, 13 Jun 2025 05:17:02 +0300 Subject: [PATCH 10/10] GH-46708: [C++][Gandiva] Added zero return values for castDECIMAL_utf8 (#46709) ### Rationale for this change castDECIMAL_utf8 has undefined behavior if input value can not be parsed, which causes SIGSEGV for some expressions in Projector. ### What changes are included in this PR? Setting 0 to out_high, out_low in function castDECIMAL_utf8 to have those values initialised if input value can not be parsed. Added corresponding test that reproduces SIGSEGV in projector. ### Are these changes tested? Yes ### Are there any user-facing changes? No * GitHub Issue: #46708 Authored-by: DenisTarasyuk Signed-off-by: Sutou Kouhei --- .../gandiva/precompiled/decimal_wrapper.cc | 2 + cpp/src/gandiva/tests/decimal_test.cc | 41 +++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/cpp/src/gandiva/precompiled/decimal_wrapper.cc b/cpp/src/gandiva/precompiled/decimal_wrapper.cc index 082d5832d14..cffb7ae9781 100644 --- a/cpp/src/gandiva/precompiled/decimal_wrapper.cc +++ b/cpp/src/gandiva/precompiled/decimal_wrapper.cc @@ -406,6 +406,8 @@ void castDECIMAL_utf8(int64_t context, const char* in, int32_t in_length, gdv_fn_dec_from_string(context, in, in_length, &precision_from_str, &scale_from_str, &dec_high_from_str, &dec_low_from_str); if (status != 0) { + *out_high = 0; + *out_low = 0; return; } diff --git a/cpp/src/gandiva/tests/decimal_test.cc b/cpp/src/gandiva/tests/decimal_test.cc index 89ad020a619..6b29f8d9675 100644 --- a/cpp/src/gandiva/tests/decimal_test.cc +++ b/cpp/src/gandiva/tests/decimal_test.cc @@ -17,6 +17,7 @@ #include +#include #include #include "arrow/memory_pool.h" #include "arrow/status.h" @@ -1236,4 +1237,44 @@ TEST_F(TestDecimal, TestSha) { EXPECT_NE(value_at_position, response->GetScalar(i - 1).ValueOrDie()->ToString()); } } + +TEST_F(TestDecimal, TestCastDecimalVarCharInvalidInputInvalidOutput) { + auto decimal_type_10_0 = std::make_shared(10, 0); + auto decimal_type_38_30 = std::make_shared(38, 30); + auto decimal_type_38_27 = std::make_shared(38, 27); + + auto field_str = field("in_str", utf8()); + auto schema = arrow::schema({field_str}); + auto res_bool = field("res_bool", arrow::boolean()); + + // This is minimal possible expression to reproduce SIGSEGV + // equal(multiply(castDecimal(10), castDecimal(100)), castDECIMAL("foo")) + auto int_literal = TreeExprBuilder::MakeLiteral(static_cast(100)); + auto int_literal_multiply = TreeExprBuilder::MakeLiteral(static_cast(10)); + auto string_literal = TreeExprBuilder::MakeStringLiteral("foo"); + auto cast_multiply_literal = TreeExprBuilder::MakeFunction( + "castDECIMAL", {int_literal_multiply}, decimal_type_10_0); + auto cast_int_literal = + TreeExprBuilder::MakeFunction("castDECIMAL", {int_literal}, decimal_type_38_30); + auto cast_string_func = + TreeExprBuilder::MakeFunction("castDECIMAL", {string_literal}, decimal_type_38_30); + auto multiply_func = TreeExprBuilder::MakeFunction( + "multiply", {cast_multiply_literal, cast_int_literal}, decimal_type_38_27); + auto equal_func = TreeExprBuilder::MakeFunction( + "equal", {multiply_func, cast_string_func}, arrow::boolean()); + auto expr = TreeExprBuilder::MakeExpression(equal_func, res_bool); + + std::shared_ptr projector; + + ASSERT_OK(Projector::Make(schema, {expr}, TestConfiguration(), &projector)); + + int num_records = 1; + auto invalid_in = MakeArrowArrayUtf8({"1.345"}, {true}); + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {invalid_in}); + + arrow::ArrayVector outputs; + auto status = projector->Evaluate(*in_batch, pool_, &outputs); + ASSERT_NOT_OK(status); + ASSERT_THAT(status.message(), ::testing::HasSubstr("not a valid decimal128 number")); +} } // namespace gandiva