diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index d6176fe48b6..ceb4fc21774 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -17,6 +17,7 @@ #include "gandiva/expression_registry.h" +#include "arrow/extension/uuid.h" #include "gandiva/function_registry.h" #include "gandiva/llvm_types.h" @@ -166,6 +167,10 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector case arrow::Type::type::INTERVAL_DAY_TIME: vector.push_back(arrow::day_time_interval()); break; + case arrow::Type::type::EXTENSION: + // Add UUID extension type + vector.push_back(arrow::extension::uuid()); + break; default: // Unsupported types. test ensures that // when one of these are added build breaks. diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2bc6936d77b..d9d05342174 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -17,6 +17,7 @@ #include "gandiva/function_registry_string.h" +#include "arrow/extension/uuid.h" #include "gandiva/function_registry_common.h" namespace gandiva { @@ -201,6 +202,10 @@ std::vector GetStringFunctionRegistry() { utf8(), kResultNullIfNull, "castVARCHAR_decimal128_int64", NativeFunction::kNeedsContext), + NativeFunction("castUUID", {}, DataTypeVector{utf8()}, arrow::extension::uuid(), + kResultNullIfNull, "castUUID_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("crc32", {}, DataTypeVector{utf8()}, int64(), kResultNullIfNull, "gdv_fn_crc_32_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/gdv_function_stubs.h b/cpp/src/gandiva/gdv_function_stubs.h index 4113f261ad7..3469394fcc2 100644 --- a/cpp/src/gandiva/gdv_function_stubs.h +++ b/cpp/src/gandiva/gdv_function_stubs.h @@ -341,6 +341,10 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in const char* from, int32_t from_len, const char* to, int32_t to_len, int32_t* out_len); +GANDIVA_EXPORT +const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len, + int32_t* out_len); + GANDIVA_EXPORT gdv_timestamp to_utc_timezone_timestamp(int64_t context, gdv_timestamp time_milliseconds, const char* timezone, int32_t length); diff --git a/cpp/src/gandiva/gdv_string_function_stubs.cc b/cpp/src/gandiva/gdv_string_function_stubs.cc index 17eefbe22e3..a03a74c348a 100644 --- a/cpp/src/gandiva/gdv_string_function_stubs.cc +++ b/cpp/src/gandiva/gdv_string_function_stubs.cc @@ -757,6 +757,84 @@ const char* translate_utf8_utf8_utf8(int64_t context, const char* in, int32_t in } } +// Helper function to convert a hex character to its numeric value +static inline int hex_char_to_int(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'a' && c <= 'f') { + return c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + return c - 'A' + 10; + } + return -1; // Invalid hex character +} + +GANDIVA_EXPORT +const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len, + int32_t* out_len) { + *out_len = 16; // UUID is always 16 bytes + + // Allocate output buffer + char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, 16)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for UUID"); + *out_len = 0; + return ""; + } + + // Parse UUID string + // Expected format: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (36 chars) + // or without hyphens: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" (32 chars) + + if (data_len == 36) { + // Format with hyphens: validate hyphen positions + if (data[8] != '-' || data[13] != '-' || data[18] != '-' || data[23] != '-') { + gdv_fn_context_set_error_msg(context, "Invalid UUID format: hyphens at wrong positions"); + *out_len = 0; + return ""; + } + + // Parse hex digits, skipping hyphens + int byte_idx = 0; + for (int i = 0; i < 36 && byte_idx < 16; i++) { + if (data[i] == '-') continue; + + // Parse two hex digits + int high = hex_char_to_int(data[i]); + int low = hex_char_to_int(data[i + 1]); + + if (high < 0 || low < 0) { + gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID"); + *out_len = 0; + return ""; + } + + ret[byte_idx++] = static_cast((high << 4) | low); + i++; // Skip the second hex digit + } + } else if (data_len == 32) { + // Format without hyphens + for (int i = 0; i < 16; i++) { + int high = hex_char_to_int(data[i * 2]); + int low = hex_char_to_int(data[i * 2 + 1]); + + if (high < 0 || low < 0) { + gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID"); + *out_len = 0; + return ""; + } + + ret[i] = static_cast((high << 4) | low); + } + } else { + gdv_fn_context_set_error_msg(context, "Invalid UUID string length: expected 32 or 36 characters"); + *out_len = 0; + return ""; + } + + return ret; +} + namespace gandiva { arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const { @@ -986,6 +1064,18 @@ arrow::Status ExportedStringFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("translate_utf8_utf8_utf8", types->i8_ptr_type() /*return_type*/, args, reinterpret_cast(translate_utf8_utf8_utf8)); + + // castUUID_utf8 + args = { + types->i64_type(), // context + types->i8_ptr_type(), // const char* data + types->i32_type(), // data_len + types->i32_ptr_type() // out_len + }; + + engine->AddGlobalMappingForFunc("castUUID_utf8", + types->i8_ptr_type() /*return_type*/, args, + reinterpret_cast(castUUID_utf8)); return arrow::Status::OK(); } } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index de322a8c0fc..6b93d762c80 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -17,6 +17,8 @@ #include "gandiva/llvm_types.h" +#include "arrow/extension/uuid.h" + namespace gandiva { // LLVM doesn't distinguish between signed and unsigned types. @@ -42,7 +44,8 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::BINARY, i8_ptr_type()}, {arrow::Type::type::DECIMAL, i128_type()}, {arrow::Type::type::INTERVAL_MONTHS, i32_type()}, - {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}}; + {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}, + {arrow::Type::type::EXTENSION, i8_ptr_type()}}; } } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index 3849cf7bdf9..a6d12522bd3 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -3034,4 +3034,87 @@ int32_t instr_utf8(const char* string, int32_t string_len, const char* substring } return 0; } + + +// Helper function to convert a hex character to its numeric value +FORCE_INLINE +int hex_char_to_int(char c) { + if (c >= '0' && c <= '9') { + return c - '0'; + } else if (c >= 'a' && c <= 'f') { + return c - 'a' + 10; + } else if (c >= 'A' && c <= 'F') { + return c - 'A' + 10; + } + return -1; // Invalid hex character +} + +// Cast VARCHAR to UUID (FixedSizeBinary(16)) +// Expected input format: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (36 characters with hyphens) +// or "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" (32 characters without hyphens) +FORCE_INLINE +const char* castUUID_utf8(gdv_int64 context, const char* data, gdv_int32 data_len, + gdv_int32* out_len) { + *out_len = 16; // UUID is always 16 bytes + + // Allocate output buffer + char* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, 16)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for UUID"); + *out_len = 0; + return ""; + } + + // Parse UUID string + // Expected format: "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" (36 chars) + // or without hyphens: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" (32 chars) + + if (data_len == 36) { + // Format with hyphens: validate hyphen positions + if (data[8] != '-' || data[13] != '-' || data[18] != '-' || data[23] != '-') { + gdv_fn_context_set_error_msg(context, "Invalid UUID format: hyphens at wrong positions"); + *out_len = 0; + return ""; + } + + // Parse hex digits, skipping hyphens + int byte_idx = 0; + for (int i = 0; i < 36 && byte_idx < 16; i++) { + if (data[i] == '-') continue; + + // Parse two hex digits + int high = hex_char_to_int(data[i]); + int low = hex_char_to_int(data[i + 1]); + + if (high < 0 || low < 0) { + gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID"); + *out_len = 0; + return ""; + } + + ret[byte_idx++] = static_cast((high << 4) | low); + i++; // Skip the second hex digit + } + } else if (data_len == 32) { + // Format without hyphens + for (int i = 0; i < 16; i++) { + int high = hex_char_to_int(data[i * 2]); + int low = hex_char_to_int(data[i * 2 + 1]); + + if (high < 0 || low < 0) { + gdv_fn_context_set_error_msg(context, "Invalid hex digit in UUID"); + *out_len = 0; + return ""; + } + + ret[i] = static_cast((high << 4) | low); + } + } else { + gdv_fn_context_set_error_msg(context, "Invalid UUID string length: expected 32 or 36 characters"); + *out_len = 0; + return ""; + } + + return ret; +} } // extern "C" diff --git a/cpp/src/gandiva/precompiled/string_ops_test.cc b/cpp/src/gandiva/precompiled/string_ops_test.cc index aaa25db0a9f..37366ddaf1d 100644 --- a/cpp/src/gandiva/precompiled/string_ops_test.cc +++ b/cpp/src/gandiva/precompiled/string_ops_test.cc @@ -2704,4 +2704,120 @@ TEST(TestStringOps, TestInstr) { result = instr_utf8(s1.c_str(), s1_len, s2.c_str(), s2_len); EXPECT_EQ(result, 8); } + + +TEST(TestStringOps, TestCastUUID) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + gdv_int32 out_len = 0; + + // Test valid UUID with hyphens (36 characters) + const char* uuid_with_hyphens = "550e8400-e29b-41d4-a716-446655440000"; + const char* result = castUUID_utf8(ctx_ptr, uuid_with_hyphens, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + + // Expected bytes for the UUID above + unsigned char expected[] = {0x55, 0x0e, 0x84, 0x00, 0xe2, 0x9b, 0x41, 0xd4, + 0xa7, 0x16, 0x44, 0x66, 0x55, 0x44, 0x00, 0x00}; + EXPECT_EQ(memcmp(result, expected, 16), 0); + + // Test valid UUID without hyphens (32 characters) + const char* uuid_without_hyphens = "550e8400e29b41d4a716446655440000"; + result = castUUID_utf8(ctx_ptr, uuid_without_hyphens, 32, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(memcmp(result, expected, 16), 0); + + // Test UUID with uppercase hex digits + const char* uuid_uppercase = "550E8400-E29B-41D4-A716-446655440000"; + result = castUUID_utf8(ctx_ptr, uuid_uppercase, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(memcmp(result, expected, 16), 0); + + // Test UUID with mixed case + const char* uuid_mixed = "550e8400-E29B-41d4-A716-446655440000"; + result = castUUID_utf8(ctx_ptr, uuid_mixed, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(memcmp(result, expected, 16), 0); + + // Test all zeros UUID + const char* uuid_zeros = "00000000-0000-0000-0000-000000000000"; + result = castUUID_utf8(ctx_ptr, uuid_zeros, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + unsigned char zeros[16] = {0}; + EXPECT_EQ(memcmp(result, zeros, 16), 0); + + // Test all Fs UUID + const char* uuid_fs = "ffffffff-ffff-ffff-ffff-ffffffffffff"; + result = castUUID_utf8(ctx_ptr, uuid_fs, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + unsigned char fs[16]; + memset(fs, 0xff, 16); + EXPECT_EQ(memcmp(result, fs, 16), 0); + + // Test invalid length (too short) + const char* uuid_short = "550e8400-e29b-41d4-a716"; + result = castUUID_utf8(ctx_ptr, uuid_short, 23, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.has_error()); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Invalid UUID string length")); + ctx.Reset(); + + // Test invalid length (too long) + const char* uuid_long = "550e8400-e29b-41d4-a716-446655440000-extra"; + result = castUUID_utf8(ctx_ptr, uuid_long, 42, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.has_error()); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Invalid UUID string length")); + ctx.Reset(); + + // Test invalid hex character + const char* uuid_invalid_hex = "550e8400-e29b-41d4-a716-44665544000g"; + result = castUUID_utf8(ctx_ptr, uuid_invalid_hex, 36, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.has_error()); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Invalid hex digit in UUID")); + ctx.Reset(); + + // Test hyphens at wrong positions + const char* uuid_wrong_hyphens = "550e8400e-29b-41d4-a716-446655440000"; + result = castUUID_utf8(ctx_ptr, uuid_wrong_hyphens, 36, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.has_error()); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Invalid UUID format: hyphens at wrong positions")); + ctx.Reset(); + + // Test empty string + result = castUUID_utf8(ctx_ptr, "", 0, &out_len); + EXPECT_EQ(out_len, 0); + EXPECT_TRUE(ctx.has_error()); + EXPECT_THAT(ctx.get_error(), + ::testing::HasSubstr("Invalid UUID string length")); + ctx.Reset(); + + // Test another valid UUID (different pattern) + const char* uuid2 = "123e4567-e89b-12d3-a456-426614174000"; + result = castUUID_utf8(ctx_ptr, uuid2, 36, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + unsigned char expected2[] = {0x12, 0x3e, 0x45, 0x67, 0xe8, 0x9b, 0x12, 0xd3, + 0xa4, 0x56, 0x42, 0x66, 0x14, 0x17, 0x40, 0x00}; + EXPECT_EQ(memcmp(result, expected2, 16), 0); + + // Test UUID without hyphens (different pattern) + const char* uuid2_no_hyphens = "123e4567e89b12d3a456426614174000"; + result = castUUID_utf8(ctx_ptr, uuid2_no_hyphens, 32, &out_len); + EXPECT_EQ(out_len, 16); + EXPECT_FALSE(ctx.has_error()); + EXPECT_EQ(memcmp(result, expected2, 16), 0); +} } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index c93b694fc77..f7fb16cc62a 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -841,4 +841,7 @@ const char* elt_int32_utf8_utf8_utf8_utf8_utf8( int32_t instr_utf8(const char* string, int32_t string_len, const char* substring, int32_t substring_len); +const char* castUUID_utf8(int64_t context, const char* data, int32_t data_len, + int32_t* out_len); + } // extern "C"