Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1391,29 +1391,40 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
compareResultsAgainstVanillaSpark(sql, true, { _ => })
}

test("arabic_indic digit date") {
test("local digit date") {
withSQLConf(
SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
(ConstantFolding.ruleName + "," + NullPropagation.ruleName),
("spark.sql.legacy.timeParserPolicy", "LEGACY")) {
sql("create table tb_arabic_date(d string) using parquet")
sql("create table tb_local_date(i bigint, d string) using parquet")
sql("""
|insert into tb_arabic_date values
|'2aLZoNmi2aQt2aDZpi3ZoNmh',
|'2aLZoNmi2aQt2aHZoi3Zo9mh',
|'2aLZoNmi2aQt2aHZoi3Zo9mh'
|insert into tb_local_date values
|(1, '2aLZoNmi2aQt2aDZpi3ZoNmh'),
|(2, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
|(3, '2aLZoNmi2aQt2aHZoi3Zo9mh'),
|(5, '27LbsNuy27Ut27HbsS3bsduz'),
|(6, ''),
|(7, '4KWo4KWm4KWo4KWrLeClp+Clpy3gpafgpak='),
|(8, '4Z+i4Z+g4Z+i4Z+lLeGfoeGfoS3hn6Hhn6M='),
|(9, null),
|(10, '4Keo4Kem4Keo4KerLeCnp+Cnpy3gp6fgp6k='),
|(11, 'MjAyNS0xMS0xMg==')
|""".stripMargin)
var query_sql = """
|select
|from_unixtime(unix_timestamp(cast(unbase64(d) as string), 'yyyy-MM-dd'))
|from tb_arabic_date
|from_unixtime(unix_timestamp(cast(unbase64(d) as string), 'yyyy-MM-dd')),
|cast(unbase64(d) as string) from (
|select d, i
|from tb_local_date
|order by i)
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })

query_sql = """
|select from_unixtime(
| unix_timestamp(cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string),
| 'yyyy-MM-dd'))
| 'yyyy-MM-dd')),
| cast(unbase64('2aLZoNmi2aQt2aDZpi3ZoNmh') as string)
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })

Expand All @@ -1422,7 +1433,8 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS
|""".stripMargin
compareResultsAgainstVanillaSpark(query_sql, true, { _ => })

sql("drop table tb_arabic_date")
sql("drop table tb_local_date")
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,19 @@
*/


#include <Columns/IColumn.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnString.h>
#include <Columns/IColumn.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/IDataType.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <boost/iostreams/detail/select.hpp>
#include <Common/Exception.h>
#include <Common/logger_useful.h>
#include "base/types.h"

namespace DB
{
Expand All @@ -40,12 +42,12 @@ namespace local_engine
{
// Since Spark 3.3, unix_timestamp supports dates written with non-ASCII decimal digits, e.g., the Arabic-Indic "٢٠٢١-٠٧-٠١ ١٢:٠٠:٠٠".
// We implement a function here that translates such local-script digits (see toAsciiDigit for the supported scripts) to ASCII digits.
class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
class LocalDigitsToAsciiDigitForDateFunction : public DB::IFunction
{
public:
static constexpr auto name = "arabic_indic_to_ascii_digit_for_date";
static constexpr auto name = "local_digit_to_ascii_digit_for_date";

static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<ArabicIndicToAsciiDigitForDateFunction>(); }
static DB::FunctionPtr create(DB::ContextPtr) { return std::make_shared<LocalDigitsToAsciiDigitForDateFunction>(); }

String getName() const override { return name; }

Expand All @@ -56,7 +58,11 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
{
auto nested_type = DB::removeNullable(arguments[0]);
if (!DB::WhichDataType(nested_type).isString())
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), arguments[0]->getName());
throw DB::Exception(
DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}",
getName(),
arguments[0]->getName());
return arguments[0];
}

Expand Down Expand Up @@ -85,9 +91,13 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
}
if (!col_str)
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
throw DB::Exception(
DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}",
getName(),
data_col->getName());
auto date_str = col_str->getDataAt(0);
auto new_str = convertArabicIndicDigit(date_str);
auto new_str = convertLocalDigit(date_str);
auto new_data_col = data_col->cloneEmpty();
new_data_col->insertData(new_str.c_str(), new_str.size());
return DB::ColumnConst::create(std::move(new_data_col), input_rows_count);
Expand All @@ -104,10 +114,14 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
col_str = DB::checkAndGetColumn<DB::ColumnString>(data_col.get());
}
if (!col_str)
throw DB::Exception(DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Argument for function {} must be String, but got {}", getName(), data_col->getName());
throw DB::Exception(
DB::ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Argument for function {} must be String, but got {}",
getName(),
data_col->getName());

auto nested_data_col = DB::removeNullable(arguments[0].column);
bool has_arabic_indic_digit = false;
bool has_local_digit = false;
size_t row_index = 0;
for (row_index = 0; row_index < input_rows_count; ++row_index)
{
Expand All @@ -116,16 +130,16 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
continue;
}
auto str = col_str->getDataAt(row_index);
if (hasArabicIndicDigit(str))
if (hasLocalDigit(str))
{
has_arabic_indic_digit = true;
has_local_digit = true;
break;
}
}

if (!has_arabic_indic_digit)
if (!has_local_digit)
{
// No Arabic indic digits found, return the original column
// No local language digits found, return the original column
return arguments[0].column;
}

Expand All @@ -141,15 +155,18 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
res_col->insertDefault();
continue;
}
auto str = convertArabicIndicDigit(col_str->getDataAt(row_index));
auto str = convertLocalDigit(col_str->getDataAt(row_index));
LOG_ERROR(getLogger("LocalDigitsToAsciiDigitForDateFunction"), "Converted local digit string {} to ascii digit string: {}", col_str->getDataAt(row_index).toString(), str);
res_col->insertData(str.c_str(), str.size());
}
return res_col;
}

private:
bool hasArabicIndicDigit(StringRef str) const
bool hasLocalDigit(StringRef str) const
{
if (!str.size)
return false;
// In most cases, the first byte is a digit.
char c = reinterpret_cast<char>(str.data[0]);
if ('0' <= c && c <= '9')
Expand All @@ -159,11 +176,26 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
return true;
}

/// Translates a Unicode code point to the ASCII digit it represents.
///
/// @param c  A decoded Unicode code point.
/// @return   '0'..'9' when `c` is a decimal digit of one of the supported scripts
///           (Arabic-Indic, Extended Arabic-Indic, Devanagari, Bengali, Thai, Khmer);
///           0 (NUL) otherwise. ASCII digits themselves are NOT matched and also yield 0,
///           so callers can use a non-zero result as "this code point must be replaced".
///
/// The function does not depend on any instance state, hence it is static and constexpr.
static constexpr char toAsciiDigit(char32_t c) noexcept
{
    // Code point of digit ZERO in each supported script. In every one of these Unicode
    // blocks the ten decimal digits are laid out consecutively starting from zero.
    //
    // In Thai and Persian, dates typically do not use the Gregorian calendar.
    // This may cause failures in unix_timestamp parsing even after the digits are translated.
    constexpr char32_t zero_code_points[] = {
        0x0660, // Arabic-Indic
        0x06F0, // Extended Arabic-Indic (Persian, Urdu)
        0x0966, // Devanagari
        0x09E6, // Bengali
        0x0E50, // Thai
        0x17E0, // Khmer
    };

    for (const char32_t zero : zero_code_points)
    {
        // `c >= zero` guards the unsigned subtraction against wrap-around.
        if (c >= zero && c - zero <= 9)
            return static_cast<char>('0' + (c - zero));
    }
    return 0;
}

bool isArabicIndicDigit(char32_t c) const { return c >= 0x0660 && c <= 0x0669; }
char toAsciiDigit(char32_t c) const { return static_cast<char>(c - 0x0660 + '0'); }

String convertArabicIndicDigit(const StringRef & str) const
String convertLocalDigit(const StringRef & str) const
{
std::string result;
result.reserve(str.size);
Expand Down Expand Up @@ -191,8 +223,9 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
cp = ((c & 0x07) << 18) | ((str.data[i + 1] & 0x3F) << 12) | ((str.data[i + 2] & 0x3F) << 6) | (str.data[i + 3] & 0x3F);
i += 4;
}
if (isArabicIndicDigit(cp))
result.push_back(toAsciiDigit(cp));
auto local_digit = toAsciiDigit(cp);
if (local_digit)
result.push_back(local_digit);
else
result.push_back(cp);
}
Expand All @@ -201,8 +234,8 @@ class ArabicIndicToAsciiDigitForDateFunction : public DB::IFunction
};

using namespace DB;
REGISTER_FUNCTION(ArabicIndicToAsciiDigitForDate)
REGISTER_FUNCTION(LocalDigitToAsciiDigitForDate)
{
factory.registerFunction<ArabicIndicToAsciiDigitForDateFunction>();
factory.registerFunction<LocalDigitsToAsciiDigitForDateFunction>();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ class FunctionParserGetTimestamp : public FunctionParser
auto parsed_args = parseFunctionArguments(substrait_func, actions_dag);
if (parsed_args.size() != 2)
throw DB::Exception(DB::ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Function {} requires exactly two arguments", getName());
const auto * expr_arg = convertArabicIndicDigit(actions_dag, parsed_args[0]);
const auto * expr_arg = convertLocalDigit(actions_dag, parsed_args[0]);
const auto * fmt_arg = parsed_args[1];

const auto & args = substrait_func.arguments();
Expand Down Expand Up @@ -130,9 +130,9 @@ class FunctionParserGetTimestamp : public FunctionParser
}
}

const DB::ActionsDAG::Node * convertArabicIndicDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const
const DB::ActionsDAG::Node * convertLocalDigit(DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node) const
{
const auto * func_node = toFunctionNode(actions_dag, "arabic_indic_to_ascii_digit_for_date", {node});
const auto * func_node = toFunctionNode(actions_dag, "local_digit_to_ascii_digit_for_date", {node});
return func_node;
}
};
Expand Down