diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index fc6ea354e060..c4c53cdededc 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -1391,29 +1391,40 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS compareResultsAgainstVanillaSpark(sql, true, { _ => }) } - test("arabic_indic digit date") { + test("local digit date") { withSQLConf( SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> (ConstantFolding.ruleName + "," + NullPropagation.ruleName), ("spark.sql.legacy.timeParserPolicy", "LEGACY")) { - sql("create table tb_arabic_date(d string) using parquet") + sql("create table tb_local_date(i bigint, d string) using parquet") sql(""" - |insert into tb_arabic_date values - |'2aLZoNmi2aQt2aDZpi3ZoNmh', - |'2aLZoNmi2aQt2aHZoi3Zo9mh', - |'2aLZoNmi2aQt2aHZoi3Zo9mh' + |insert into tb_local_date values + |(1, '2aLZoNmi2aQt2aDZpi3ZoNmh'), + |(2, '2aLZoNmi2aQt2aHZoi3Zo9mh'), + |(3, '2aLZoNmi2aQt2aHZoi3Zo9mh'), + |(5, '27LbsNuy27Ut27HbsS3bsduz'), + |(6, ''), + |(7, '4KWo4KWm4KWo4KWrLeClp+Clpy3gpafgpak='), + |(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='), + |(9, null), + |(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='), + |(11, 'MjAyNS0xMS0xMg==') |""".stripMargin) var query_sql = """ |select - |from_unixtime(unix_timestamp(cast(unbase64(d) as string), 'yyyy-MM-dd')) - |from tb_arabic_date + |from_unixtime(unix_timestamp(cast(unbase64(d) as string), 'yyyy-MM-dd')), + |cast(unbase64(d) as string) from ( + |select d, i + |from tb_local_date + |order by i) |""".stripMargin compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) query_sql = """ |select from_unixtime( | unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string), - | 
'yyyy-MM-dd')) + | 'yyyy-MM-dd')), + | cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string) |""".stripMargin compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) @@ -1422,7 +1433,8 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS |""".stripMargin compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) - sql("drop table tb_arabic_date") + sql("drop table tb_local_date") } } + } diff --git a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp similarity index 69% rename from cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp rename to cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp index 94332c3a5fa0..7f19975c0a96 100644 --- a/cpp-ch/local-engine/Functions/ArabicIndicToAsciiDigitForDate.cpp +++ b/cpp-ch/local-engine/Functions/LocalDigitsToAsciiDigitForDate.cpp @@ -16,17 +16,19 @@ */ -#include -#include #include +#include #include +#include #include #include #include #include #include +#include #include #include +#include "base/types.h" namespace DB { @@ -40,12 +42,12 @@ namespace local_engine { // Since spark 3.3, unix_timestamp support arabic number input, e.g., "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠". - // We implement a function to translate arabic indic digits to ascii digits here. + // We implement a function to translate local-script decimal digits (Arabic-Indic, Extended Arabic-Indic, Devanagari, Bengali, Thai, Khmer) to ascii digits here. 
-class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction +class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction { public: - static constexpr auto name = "arabic_indic_to_ascii_digit_for_date"; + static constexpr auto name = "local_digit_to_ascii_digit_for_date"; - static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } + static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared(); } String getName() const override { return name; } @@ -56,7 +58,11 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction { auto nested_type = DB::removeNullable(arguments[0]); if (!DB::WhichDataType(nested_type).isString()) - throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), arguments[0]->getName()); + throw DB::Exception( + DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function {} must be String, but got {}", + getName(), + arguments[0]->getName()); return arguments[0]; } @@ -85,9 +91,13 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction col_str = DB::checkAndGetColumn(data_col.get()); } if (!col_str) - throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName()); + throw DB::Exception( + DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function {} must be String, but got {}", + getName(), + data_col->getName()); auto date_str = col_str->getDataAt(0); - auto new_str = convertArabicIndicDigit(date_str); + auto new_str = convertLocalDigit(date_str); auto new_data_col = data_col->cloneEmpty(); new_data_col->insertData(new_str.c_str(), new_str.size()); return DB::ColumnConst::create(std::move(new_data_col), input_rows_count); @@ -104,10 +114,14 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction col_str = DB::checkAndGetColumn(data_col.get()); } if (!col_str) - throw 
DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName()); + throw DB::Exception( + DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Argument for function {} must be String, but got {}", + getName(), + data_col->getName()); auto nested_data_col = DB::removeNullable(arguments[0].column); - bool has_arabic_indic_digit = false; + bool has_local_digit = false; size_t row_index = 0; for (row_index = 0; row_index < input_rows_count; ++row_index) { @@ -116,16 +130,16 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction continue; } auto str = col_str->getDataAt(row_index); - if (hasArabicIndicDigit(str)) + if (hasLocalDigit(str)) { - has_arabic_indic_digit = true; + has_local_digit = true; break; } } - if (!has_arabic_indic_digit) + if (!has_local_digit) { - // No Arabic indic digits found, return the original column + // No local language digits found, return the original column return arguments[0].column; } @@ -141,15 +155,17 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction res_col->insertDefault(); continue; } - auto str = convertArabicIndicDigit(col_str->getDataAt(row_index)); + auto str = convertLocalDigit(col_str->getDataAt(row_index)); res_col->insertData(str.c_str(), str.size()); } return res_col; } private: - bool hasArabicIndicDigit(StringRef str) const + bool hasLocalDigit(StringRef str) const { + if (!str.size) + return false; // In most cases, the first byte is a digit. char c = reinterpret_cast(str.data[0]); if ('0' <= c && c <= '9') @@ -159,11 +175,28 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction return true; } + // Maps a decimal digit code point from a supported script (Arabic-Indic, Extended Arabic-Indic, Devanagari, Thai, Khmer, Bengali) to its ASCII digit. + // Returns 0 when the code point is not one of these digits, so callers can use the result as a truth value. + char toAsciiDigit(char32_t c) const { + // In Thai and Persian, dates typically do not use the Gregorian calendar. 
+ // This may cause failures in unix_timestamp parsing. + if (c >= 0x0660 && c <= 0x0669) + return static_cast(c - 0x0660 + '0'); + else if (c >= 0x06F0 && c <= 0x06F9) + return static_cast(c - 0x06F0 + '0'); + else if (c >= 0x0966 && c <= 0x096F) + return static_cast(c - 0x0966 + '0'); + else if (c >= 0x0E50 && c <= 0x0E59) + return static_cast(c - 0x0E50 + '0'); + else if (c >= 0x17E0 && c <= 0x17E9) + return static_cast(c - 0x17E0 + '0'); + else if (c >= 0x09E6 && c <= 0x09EF) + return static_cast(c - 0x09E6 + '0'); + else + return 0; + } - bool isArabicIndicDigit(char32_t c) const { return c >= 0x0660 && c <= 0x0669; } - char toAsciiDigit(char32_t c) const { return static_cast(c - 0x0660 + '0'); } - - String convertArabicIndicDigit(const StringRef & str) const + String convertLocalDigit(const StringRef & str) const { std::string result; result.reserve(str.size); @@ -191,8 +224,9 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | ((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F); i += 4; } - if (isArabicIndicDigit(cp)) - result.push_back(toAsciiDigit(cp)); + auto local_digit = toAsciiDigit(cp); + if (local_digit) + result.push_back(local_digit); else result.push_back(cp); } @@ -201,8 +235,8 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction }; using namespace DB; -REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate) +REGISTER_FUNCTION(LocalDigitToAsciiDigitForDate) { - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h index 53e6a0e6e6d7..f5c3ac0d713f 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/getTimestamp.h @@ -61,7 +61,7 @@ class FunctionParserGetTimestamp : public FunctionParser auto parsed_args = 
parseFunctionArguments(substrait_func, actions_dag); if (parsed_args.size() != 2) throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName()); - const auto * expr_arg = convertArabicIndicDigit(actions_dag, parsed_args[0]); + const auto * expr_arg = convertLocalDigit(actions_dag, parsed_args[0]); const auto * fmt_arg = parsed_args[1]; const auto & args = substrait_func.arguments(); @@ -130,9 +130,9 @@ class FunctionParserGetTimestamp : public FunctionParser } } - const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const + const DB::ActionsDAG::Node * convertLocalDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const { - const auto * func_node = toFunctionNode(actions_dag, "arabic_indic_to_ascii_digit_for_date", {node}); + const auto * func_node = toFunctionNode(actions_dag, "local_digit_to_ascii_digit_for_date", {node}); return func_node; } };