From e1eb98cb867ee511fd2f3f4fb9a3b0d07f386200 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 26 Jul 2023 15:15:29 -0700 Subject: [PATCH 01/13] error-messages --- .../utils/src/main/resources/error/error-classes.json | 8 ++++---- .../catalyst/plans/logical/FunctionBuilderBase.scala | 11 +++++++---- .../spark/sql/errors/QueryCompilationErrors.scala | 10 ++++++---- .../analysis/NamedParameterFunctionSuite.scala | 8 ++++---- .../analyzer-results/named-function-arguments.sql.out | 4 +++- .../results/named-function-arguments.sql.out | 4 +++- 6 files changed, 27 insertions(+), 18 deletions(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 753004041943f..6edcf9e2145a6 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -804,12 +804,12 @@ "subClass" : { "BOTH_POSITIONAL_AND_NAMED" : { "message" : [ - "A positional argument and named argument both referred to the same parameter." + "A positional argument and named argument both referred to the same parameter. Please remove the named argument referring to this parameter." ] }, "DOUBLE_NAMED_ARGUMENT_REFERENCE" : { "message" : [ - "More than one named argument referred to the same parameter." + "More than one named argument referred to the same parameter. Please assign a value only once." ] } }, @@ -2398,7 +2398,7 @@ }, "REQUIRED_PARAMETER_NOT_FOUND" : { "message" : [ - "Cannot invoke function because the parameter named is required, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again." + "Cannot invoke function because the parameter named is required at index , but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again." ], "sqlState" : "4274K" }, @@ -2599,7 +2599,7 @@ }, "UNEXPECTED_POSITIONAL_ARGUMENT" : { "message" : [ - "Cannot invoke function because it contains positional argument(s) following named argument(s); please rearrange them so the positional arguments come first and then retry the query again." + "Cannot invoke function because it contains positional argument(s) following the named argument assigned to ; please rearrange them so the positional arguments come first and then retry the query again." ], "sqlState" : "4274K" }, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala index 4a2b9eae98100..322ebc62c789c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala @@ -104,7 +104,8 @@ object NamedParametersSupport { val positionalParametersSet = allParameterNames.take(positionalArgs.size).toSet val namedParametersSet = collection.mutable.Set[String]() - for (arg <- namedArgs) { + namedArgs.zipWithIndex.foreach { tuple => + val (arg, index) = tuple arg match { case namedArg: NamedArgumentExpression => val parameterName = namedArg.key @@ -122,7 +123,8 @@ object NamedParametersSupport { } namedParametersSet.add(namedArg.key) case _ => - throw QueryCompilationErrors.unexpectedPositionalArgument(functionName) + throw QueryCompilationErrors.unexpectedPositionalArgument( + functionName, namedArgs(index - 1).asInstanceOf[NamedArgumentExpression].key) } } @@ -141,11 +143,12 @@ object NamedParametersSupport { }.toMap // We rearrange named arguments to match their positional order. - val rearrangedNamedArgs: Seq[Expression] = namedParameters.map { param => + val rearrangedNamedArgs: Seq[Expression] = namedParameters.zipWithIndex.map { tuple => + val (param, index) = tuple namedArgMap.getOrElse( param.name, if (param.default.isEmpty) { - throw QueryCompilationErrors.requiredParameterNotFound(functionName, param.name) + throw QueryCompilationErrors.requiredParameterNotFound(functionName, param.name, index) } else { param.default.get } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 03b0a72bba585..455d9fe0a5a89 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -90,12 +90,13 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { } def requiredParameterNotFound( - functionName: String, parameterName: String) : Throwable = { + functionName: String, parameterName: String, index: Int) : Throwable = { new AnalysisException( errorClass = "REQUIRED_PARAMETER_NOT_FOUND", messageParameters = Map( "functionName" -> toSQLId(functionName), - "parameterName" -> toSQLId(parameterName)) + "parameterName" -> toSQLId(parameterName), + "index" -> index.toString) ) } @@ -115,10 +116,11 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { ) } - def unexpectedPositionalArgument(functionName: String): Throwable = { + def unexpectedPositionalArgument(functionName: String, parameterName: String): Throwable = { new AnalysisException( errorClass = "UNEXPECTED_POSITIONAL_ARGUMENT", - messageParameters = Map("functionName" -> toSQLId(functionName)) + messageParameters = Map( + "functionName" -> toSQLId(functionName), "parameterName" -> toSQLId(parameterName)) ) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala index dd5cb5e7d03c8..ff13bb5102f9c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala @@ -53,7 +53,7 @@ object DummyExpressionBuilder extends ExpressionBuilder { DummyExpression(expressions(0), expressions(1), expressions(2), expressions(3)) } -class NamedArgumentFunctionSuite extends AnalysisTest { +class NamedParameterFunctionSuite extends AnalysisTest { final val k1Arg = Literal("v1") final val k2Arg = NamedArgumentExpression("k2", Literal("v2")) @@ -115,8 +115,8 @@ class NamedArgumentFunctionSuite extends AnalysisTest { checkError( exception = parseRearrangeException(signature, Seq(k1Arg, k2Arg, k3Arg), "foo"), errorClass = "REQUIRED_PARAMETER_NOT_FOUND", - parameters = Map("functionName" -> toSQLId("foo"), "parameterName" -> toSQLId("k4")) - ) + parameters = Map( + "functionName" -> toSQLId("foo"), "parameterName" -> toSQLId("k4"), "index" -> "2")) } test("UNRECOGNIZED_PARAMETER_NAME") { @@ -134,7 +134,7 @@ class NamedArgumentFunctionSuite extends AnalysisTest { exception = parseRearrangeException(signature, Seq(k2Arg, k3Arg, k1Arg, k4Arg), "foo"), errorClass = "UNEXPECTED_POSITIONAL_ARGUMENT", - parameters = Map("functionName" -> toSQLId("foo")) + parameters = Map("functionName" -> toSQLId("foo"), "parameterName" -> toSQLId("k3")) ) } diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out index e01e0ca5ee011..510f6dc1e2606 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/named-function-arguments.sql.out @@ -229,7 +229,8 @@ org.apache.spark.sql.AnalysisException "errorClass" : "UNEXPECTED_POSITIONAL_ARGUMENT", "sqlState" : "4274K", "messageParameters" : { - "functionName" : "`mask`" + "functionName" : "`mask`", + "parameterName" : "`lowerChar`" }, "queryContext" : [ { "objectType" : "", @@ -292,6 +293,7 @@ org.apache.spark.sql.AnalysisException "sqlState" : "4274K", "messageParameters" : { "functionName" : "`mask`", + "index" : "0", "parameterName" : "`str`" }, "queryContext" : [ { diff --git a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out index 3b223cc0e1529..5c593979bb456 100644 --- a/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/named-function-arguments.sql.out @@ -214,7 +214,8 @@ org.apache.spark.sql.AnalysisException "errorClass" : "UNEXPECTED_POSITIONAL_ARGUMENT", "sqlState" : "4274K", "messageParameters" : { - "functionName" : "`mask`" + "functionName" : "`mask`", + "parameterName" : "`lowerChar`" }, "queryContext" : [ { "objectType" : "", @@ -283,6 +284,7 @@ org.apache.spark.sql.AnalysisException "sqlState" : "4274K", "messageParameters" : { "functionName" : "`mask`", + "index" : "0", "parameterName" : "`str`" }, "queryContext" : [ { From 312480690953c0330db962ff9caf0c0e758a3396 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Thu, 27 Jul 2023 15:08:03 -0700 Subject: [PATCH 02/13] Add lines --- dev/error_message_refiner.py | 265 ++++++++++++++++++ .../NamedParameterFunctionSuite.scala | 2 + 2 files changed, 267 insertions(+) create mode 100644 dev/error_message_refiner.py diff --git a/dev/error_message_refiner.py b/dev/error_message_refiner.py new file mode 100644 index 0000000000000..0afebe9c356bb --- /dev/null +++ b/dev/error_message_refiner.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Utility for refining error messages based on LLM. + +Usage: + python error_message_refiner.py [--model_name=] + +Arguments: + Required. + The name of the error class to refine the messages for. + The list of error classes is located in + `common/utils/src/main/resources/error/error-classes.json`. + +Options: + --model_name= Optional. + The version of Chat GPT to use for refining the error messages. + If not provided, the default version("gpt-3.5-turbo") will be used. + +Example usage: + python error_message_refiner.py CANNOT_DECODE_URL --model_name=gpt-4 + +Description: + This script refines error messages using the LLM based approach. + It takes the name of the error class as a required argument and, optionally, + allows specifying the version of Chat GPT to use for refining the messages. + + Options: + --model_name: Specifies the version of Chat GPT. + If not provided, the default version("gpt-3.5-turbo") will be used. + + Note: + - Ensure that the necessary dependencies are installed before running the script. + - Ensure that the valid API key is entered in the `api-key.txt`. + - The refined error messages will be displayed in the console output. + - To use the gpt-4 model, you need to join the waitlist. Please refer to + https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4 for more details. +""" + +import argparse +import json +import openai +import re +import subprocess +import random +import os +from typing import Tuple, Optional + +from sparktestsupport import SPARK_HOME + +PATH_TO_ERROR_CLASS = f"{SPARK_HOME}/common/utils/src/main/resources/error/error-classes.json" +# Register your own open API key for the environment variable. +# The API key can be obtained from https://platform.openai.com/account/api-keys. +OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") + +openai.api_key = OPENAI_API_KEY + + +def _git_grep_files(search_string: str, exclude: str = None) -> str: + """ + Executes 'git grep' command to search for files containing the given search string. + Returns the file path where the search string is found. + """ + result = subprocess.run( + ["git", "grep", "-l", search_string, "--", f"{SPARK_HOME}/*.scala"], + capture_output=True, + text=True, + ) + output = result.stdout.strip() + + files = output.split("\n") + files = [file for file in files if "Suite" not in file] + if exclude is not None: + files = [file for file in files if exclude not in file] + file = random.choice(files) + return file + + +def _find_function(file_name: str, search_string: str) -> Optional[str]: + """ + Searches for a function in the given file containing the specified search string. + Returns the name of the function if found, otherwise None. + """ + with open(file_name, "r") as file: + content = file.read() + functions = re.findall(r"def\s+(\w+)\s*\(", content) + + for function in functions: + function_content = re.search( + rf"def\s+{re.escape(function)}(?:(?!def).)*?{re.escape(search_string)}", + content, + re.DOTALL, + ) + if function_content and search_string in function_content.group(0): + return function + + return None + + +def _find_func_body(file_name: str, search_string: str) -> Optional[str]: + """ + Searches for a function body in the given file containing the specified search string. + Returns the function body if found, otherwise None. + """ + with open(file_name, "r") as file: + content = file.read() + functions = re.findall(r"def\s+(\w+)\s*\(", content) + + for function in functions: + function_content = re.search( + rf"def\s+{re.escape(function)}(?:(?!def\s).)*?{re.escape(search_string)}", + content, + re.DOTALL, + ) + if function_content and search_string in function_content.group(0): + return function_content.group(0) + + return None + + +def _get_error_function(error_class: str) -> str: + """ + Retrieves the name of the error function that triggers the given error class. + """ + search_string = error_class + matched_file = _git_grep_files(search_string) + err_func = _find_function(matched_file, search_string) + return err_func + + +def _get_source_code(error_class: str) -> str: + """ + Retrieves the source code of a function where the given error class is being invoked. + """ + search_string = error_class + matched_file = _git_grep_files(search_string) + err_func = _find_function(matched_file, search_string) + source_file = _git_grep_files(err_func, exclude=matched_file) + source_code = _find_func_body(source_file, err_func) + return source_code + + +def _get_error_message(error_class: str) -> str: + """ + Returns the error message from the provided error class + listed in core/src/main/resources/error/error-classes.json. + """ + with open(PATH_TO_ERROR_CLASS) as f: + error_classes = json.load(f) + + error_message = error_classes.get(error_class) + if error_message: + return error_message["message"] + else: + return f"Error message not found for class: {error_class}" + + +def ask_chat_gpt(error_class: str, model_name: str) -> Tuple[str, str]: + """ + Requests error message improvement from GPT API + Returns a tuple containing the old error message and the refined error message. + """ + old_error_message = " ".join(_get_error_message(error_class)) + error_function = _get_error_function(error_class) + source_code = _get_source_code(error_class) + prompt = f"""I would like to improve the error messages in Apache Spark. +Apache Spark provides error message guidelines, so error messages generated by Apache Spark should follow the following rules: + +1. Include What, Why, and How +Error messages should explicitly address What, Why, and How. +For example, the error message "Unable to generate an encoder for inner class <> without access to the scope that this class was defined in. Try moving this class out of its parent class." follows the guidelines as it includes What, Why, and How. +On the other hand, the error message "Unsupported function name <>" only includes What and does not provide Why and How, so it needs improvement to comply with the guidelines. + +2. Use clear language +Error messages in should adhere to the Diction guide and Wording guide provided in the two tables below in Markdown format. + +- Diction guide: + +| Phrases | When to use | Example | +| ------------------------------------------------------------ | ------------------------------------------------------------ | ----------------------------------------------- | +| Unsupported | The user may reasonably assume that the operation is supported, but it is not. This error may go away in the future if developers add support for the operation. | `Data type <> is unsupported.` | +| Invalid / Not allowed / Unexpected | The user made a mistake when specifying an operation. The message should inform the user of how to resolve the error. | `Array has size <>, index <> is invalid.` | +| `Found <> generators for the clause <>. Only one generator is allowed.` | | | +| `Found an unexpected state format version <>. Expected versions 1 or 2.` | | | +| Failed to | The system encountered an unexpected error that cannot be reasonably attributed to user error. | `Failed to compile <>.` | +| Cannot | Any time, preferably only if one of the above alternatives does not apply. | `Cannot generate code for unsupported type <>.` | + +- Wording guide: + +| Best practice | Before | After | +| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | +| Use active voice | `DataType <> is not supported by <>.` | `<> does not support datatype <>.` | +| Avoid time-based statements, such as promises of future support | `Pandas UDF aggregate expressions are currently not supported in pivot.` | `Pivot does not support Pandas UDF aggregate expressions.` | +| `Parquet type not yet supported: <>.` | `<> does not support Parquet type.` | | +| Use the present tense to describe the error and provide suggestions | `Couldn't find the reference column for <> at <>.` | `Cannot find the reference column for <> at <>.` | +| `Join strategy hint parameter should be an identifier or string but was <>.` | `Cannot use join strategy hint parameter <>. Use a table name or identifier to specify the parameter.` | | +| Provide concrete examples if the resolution is unclear | `<> Hint expects a partition number as a parameter.` | `<> Hint expects a partition number as a parameter. For example, specify 3 partitions with <>(3).` | +| Avoid sounding accusatory, judgmental, or insulting | `You must specify an amount for <>.` | `<> cannot be empty. Specify an amount for <>.` | +| Be direct | `LEGACY store assignment policy is disallowed in Spark data source V2. Please set the configuration spark.sql.storeAssignmentPolicy to other values.` | `Spark data source V2 does not allow the LEGACY store assignment policy. Set the configuration spark.sql.storeAssignment to ANSI or STRICT.` | +| Do not use programming jargon in user-facing errors | `RENAME TABLE source and destination databases do not match: '<>' != '<>'.` | `RENAME TABLE source and destination databases do not match. The source database is <>, but the destination database is <>.` | + +To adhere to the above guidelines, please improve the following error message: "{old_error_message}" +Please note that the angle brackets in the error message represent the placeholder for the error message parameter, so they should not be modified or removed. +For more detail, the error message is triggered through a error function named "{error_function}", and the source code that actually calls the error function is as follows: + +{source_code} + +When improving the error, please also refer to the source code provided above to provide more detailed context to users.""" + try: + response = openai.ChatCompletion.create( + model=model_name, + messages=[ + { + "role": "system", + "content": "You are a engineer who want to improve the error messages " + "for Apache Spark users.", + }, + {"role": "user", "content": prompt}, + ], + ) + except openai.error.AuthenticationError: + raise openai.error.AuthenticationError( + "Please verify if the API key is set correctly in `OPENAI_API_KEY`." + ) + except openai.error.InvalidRequestError as e: + if "gpt-4" in str(e): + raise openai.error.AuthenticationError( + "To use the gpt-4 model, you need to join the waitlist. " + "Please refer to " + "https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4 " + "for more details." + ) + raise e + result = "" + for choice in response.choices: + result += choice.message.content + return old_error_message, result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("error_class", type=str) + parser.add_argument("--model_name", type=str, default="gpt-3.5-turbo") + + args = parser.parse_args() + + old_error_message, new_error_message = ask_chat_gpt(args.error_class, args.model_name) + print(f"[{args.error_class}]\nBefore: {old_error_message}\nAfter: {new_error_message}") diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala index ff13bb5102f9c..70106fa89b13a 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala @@ -40,6 +40,7 @@ case class DummyExpression( } object DummyExpressionBuilder extends ExpressionBuilder { + def defaultFunctionSignature: FunctionSignature = { FunctionSignature(Seq(InputParameter("k1"), InputParameter("k2"), @@ -49,6 +50,7 @@ object DummyExpressionBuilder extends ExpressionBuilder { override def functionSignature: Option[FunctionSignature] = Some(defaultFunctionSignature) + override def build(funcName: String, expressions: Seq[Expression]): Expression = DummyExpression(expressions(0), expressions(1), expressions(2), expressions(3)) } From 1e1c1964db8d4f4a9febf3297d1990dafe8f09dd Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Thu, 27 Jul 2023 15:08:18 -0700 Subject: [PATCH 03/13] Remove file --- dev/error_message_refiner.py | 265 ----------------------------------- 1 file changed, 265 deletions(-) delete mode 100644 dev/error_message_refiner.py diff --git a/dev/error_message_refiner.py b/dev/error_message_refiner.py deleted file mode 100644 index 0afebe9c356bb..0000000000000 --- a/dev/error_message_refiner.py +++ /dev/null @@ -1,265 +0,0 @@ -#!/usr/bin/env python3 - -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -""" -Utility for refining error messages based on LLM. - -Usage: - python error_message_refiner.py [--model_name=] - -Arguments: - Required. - The name of the error class to refine the messages for. - The list of error classes is located in - `common/utils/src/main/resources/error/error-classes.json`. - -Options: - --model_name= Optional. - The version of Chat GPT to use for refining the error messages. - If not provided, the default version("gpt-3.5-turbo") will be used. - -Example usage: - python error_message_refiner.py CANNOT_DECODE_URL --model_name=gpt-4 - -Description: - This script refines error messages using the LLM based approach. - It takes the name of the error class as a required argument and, optionally, - allows specifying the version of Chat GPT to use for refining the messages. - - Options: - --model_name: Specifies the version of Chat GPT. - If not provided, the default version("gpt-3.5-turbo") will be used. - - Note: - - Ensure that the necessary dependencies are installed before running the script. - - Ensure that the valid API key is entered in the `api-key.txt`. - - The refined error messages will be displayed in the console output. - - To use the gpt-4 model, you need to join the waitlist. Please refer to - https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4 for more details. -""" - -import argparse -import json -import openai -import re -import subprocess -import random -import os -from typing import Tuple, Optional - -from sparktestsupport import SPARK_HOME - -PATH_TO_ERROR_CLASS = f"{SPARK_HOME}/common/utils/src/main/resources/error/error-classes.json" -# Register your own open API key for the environment variable. -# The API key can be obtained from https://platform.openai.com/account/api-keys. -OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") - -openai.api_key = OPENAI_API_KEY - - -def _git_grep_files(search_string: str, exclude: str = None) -> str: - """ - Executes 'git grep' command to search for files containing the given search string. - Returns the file path where the search string is found. - """ - result = subprocess.run( - ["git", "grep", "-l", search_string, "--", f"{SPARK_HOME}/*.scala"], - capture_output=True, - text=True, - ) - output = result.stdout.strip() - - files = output.split("\n") - files = [file for file in files if "Suite" not in file] - if exclude is not None: - files = [file for file in files if exclude not in file] - file = random.choice(files) - return file - - -def _find_function(file_name: str, search_string: str) -> Optional[str]: - """ - Searches for a function in the given file containing the specified search string. - Returns the name of the function if found, otherwise None. - """ - with open(file_name, "r") as file: - content = file.read() - functions = re.findall(r"def\s+(\w+)\s*\(", content) - - for function in functions: - function_content = re.search( - rf"def\s+{re.escape(function)}(?:(?!def).)*?{re.escape(search_string)}", - content, - re.DOTALL, - ) - if function_content and search_string in function_content.group(0): - return function - - return None - - -def _find_func_body(file_name: str, search_string: str) -> Optional[str]: - """ - Searches for a function body in the given file containing the specified search string. - Returns the function body if found, otherwise None. - """ - with open(file_name, "r") as file: - content = file.read() - functions = re.findall(r"def\s+(\w+)\s*\(", content) - - for function in functions: - function_content = re.search( - rf"def\s+{re.escape(function)}(?:(?!def\s).)*?{re.escape(search_string)}", - content, - re.DOTALL, - ) - if function_content and search_string in function_content.group(0): - return function_content.group(0) - - return None - - -def _get_error_function(error_class: str) -> str: - """ - Retrieves the name of the error function that triggers the given error class. - """ - search_string = error_class - matched_file = _git_grep_files(search_string) - err_func = _find_function(matched_file, search_string) - return err_func - - -def _get_source_code(error_class: str) -> str: - """ - Retrieves the source code of a function where the given error class is being invoked. - """ - search_string = error_class - matched_file = _git_grep_files(search_string) - err_func = _find_function(matched_file, search_string) - source_file = _git_grep_files(err_func, exclude=matched_file) - source_code = _find_func_body(source_file, err_func) - return source_code - - -def _get_error_message(error_class: str) -> str: - """ - Returns the error message from the provided error class - listed in core/src/main/resources/error/error-classes.json. - """ - with open(PATH_TO_ERROR_CLASS) as f: - error_classes = json.load(f) - - error_message = error_classes.get(error_class) - if error_message: - return error_message["message"] - else: - return f"Error message not found for class: {error_class}" - - -def ask_chat_gpt(error_class: str, model_name: str) -> Tuple[str, str]: - """ - Requests error message improvement from GPT API - Returns a tuple containing the old error message and the refined error message. - """ - old_error_message = " ".join(_get_error_message(error_class)) - error_function = _get_error_function(error_class) - source_code = _get_source_code(error_class) - prompt = f"""I would like to improve the error messages in Apache Spark. -Apache Spark provides error message guidelines, so error messages generated by Apache Spark should follow the following rules: - -1. Include What, Why, and How -Error messages should explicitly address What, Why, and How. -For example, the error message "Unable to generate an encoder for inner class <> without access to the scope that this class was defined in. Try moving this class out of its parent class." follows the guidelines as it includes What, Why, and How. -On the other hand, the error message "Unsupported function name <>" only includes What and does not provide Why and How, so it needs improvement to comply with the guidelines. - -2. Use clear language -Error messages in should adhere to the Diction guide and Wording guide provided in the two tables below in Markdown format. - -- Diction guide: - -| Phrases | When to use | Example | -| ------------------------------------------------------------ | ------------------------------------------------------------ | ----------------------------------------------- | -| Unsupported | The user may reasonably assume that the operation is supported, but it is not. This error may go away in the future if developers add support for the operation. | `Data type <> is unsupported.` | -| Invalid / Not allowed / Unexpected | The user made a mistake when specifying an operation. The message should inform the user of how to resolve the error. | `Array has size <>, index <> is invalid.` | -| `Found <> generators for the clause <>. Only one generator is allowed.` | | | -| `Found an unexpected state format version <>. Expected versions 1 or 2.` | | | -| Failed to | The system encountered an unexpected error that cannot be reasonably attributed to user error. | `Failed to compile <>.` | -| Cannot | Any time, preferably only if one of the above alternatives does not apply. | `Cannot generate code for unsupported type <>.` | - -- Wording guide: - -| Best practice | Before | After | -| ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | -| Use active voice | `DataType <> is not supported by <>.` | `<> does not support datatype <>.` | -| Avoid time-based statements, such as promises of future support | `Pandas UDF aggregate expressions are currently not supported in pivot.` | `Pivot does not support Pandas UDF aggregate expressions.` | -| `Parquet type not yet supported: <>.` | `<> does not support Parquet type.` | | -| Use the present tense to describe the error and provide suggestions | `Couldn't find the reference column for <> at <>.` | `Cannot find the reference column for <> at <>.` | -| `Join strategy hint parameter should be an identifier or string but was <>.` | `Cannot use join strategy hint parameter <>. Use a table name or identifier to specify the parameter.` | | -| Provide concrete examples if the resolution is unclear | `<> Hint expects a partition number as a parameter.` | `<> Hint expects a partition number as a parameter. For example, specify 3 partitions with <>(3).` | -| Avoid sounding accusatory, judgmental, or insulting | `You must specify an amount for <>.` | `<> cannot be empty. Specify an amount for <>.` | -| Be direct | `LEGACY store assignment policy is disallowed in Spark data source V2. Please set the configuration spark.sql.storeAssignmentPolicy to other values.` | `Spark data source V2 does not allow the LEGACY store assignment policy. Set the configuration spark.sql.storeAssignment to ANSI or STRICT.` | -| Do not use programming jargon in user-facing errors | `RENAME TABLE source and destination databases do not match: '<>' != '<>'.` | `RENAME TABLE source and destination databases do not match. The source database is <>, but the destination database is <>.` | - -To adhere to the above guidelines, please improve the following error message: "{old_error_message}" -Please note that the angle brackets in the error message represent the placeholder for the error message parameter, so they should not be modified or removed. -For more detail, the error message is triggered through a error function named "{error_function}", and the source code that actually calls the error function is as follows: - -{source_code} - -When improving the error, please also refer to the source code provided above to provide more detailed context to users.""" - try: - response = openai.ChatCompletion.create( - model=model_name, - messages=[ - { - "role": "system", - "content": "You are a engineer who want to improve the error messages " - "for Apache Spark users.", - }, - {"role": "user", "content": prompt}, - ], - ) - except openai.error.AuthenticationError: - raise openai.error.AuthenticationError( - "Please verify if the API key is set correctly in `OPENAI_API_KEY`." - ) - except openai.error.InvalidRequestError as e: - if "gpt-4" in str(e): - raise openai.error.AuthenticationError( - "To use the gpt-4 model, you need to join the waitlist. " - "Please refer to " - "https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4 " - "for more details." - ) - raise e - result = "" - for choice in response.choices: - result += choice.message.content - return old_error_message, result - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("error_class", type=str) - parser.add_argument("--model_name", type=str, default="gpt-3.5-turbo") - - args = parser.parse_args() - - old_error_message, new_error_message = ask_chat_gpt(args.error_class, args.model_name) - print(f"[{args.error_class}]\nBefore: {old_error_message}\nAfter: {new_error_message}") From e6ba64da2141654497ba16cbaadefdf4993c7134 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Fri, 28 Jul 2023 14:02:21 -0700 Subject: [PATCH 04/13] Regenerate documentation --- ...ions-duplicate-routine-parameter-assignment-error-class.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md b/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md index d9f14b5a55ef8..eb5ca2a0169d1 100644 --- a/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md +++ b/docs/sql-error-conditions-duplicate-routine-parameter-assignment-error-class.md @@ -27,10 +27,10 @@ This error class has the following derived error classes: ## BOTH_POSITIONAL_AND_NAMED -A positional argument and named argument both referred to the same parameter. +A positional argument and named argument both referred to the same parameter. Please remove the named argument referring to this parameter. ## DOUBLE_NAMED_ARGUMENT_REFERENCE -More than one named argument referred to the same parameter. +More than one named argument referred to the same parameter. Please assign a value only once. From e3aa231ef4a1a47efc7f2c8ea861f081fae7074b Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Fri, 28 Jul 2023 14:04:16 -0700 Subject: [PATCH 05/13] Redo stuff --- docs/sql-error-conditions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index 06485193b9ae1..6fd223aeac8f8 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -1555,7 +1555,7 @@ The `` clause may be used at most once per `` operation. [SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) -Cannot invoke function `` because the parameter named `` is required, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again. +Cannot invoke function `` because the parameter named `` is required at index ``, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again. ### REQUIRES_SINGLE_PART_NAMESPACE @@ -1770,7 +1770,7 @@ Parameter `` of function `` requires the `` because it contains positional argument(s) following named argument(s); please rearrange them so the positional arguments come first and then retry the query again. +Cannot invoke function `` because it contains positional argument(s) following the named argument assigned to ``; please rearrange them so the positional arguments come first and then retry the query again. ### UNKNOWN_PROTOBUF_MESSAGE_TYPE From e5a616f4c36d74a2996324b84904effe2c3b675c Mon Sep 17 00:00:00 2001 From: Richard Yu <134337791+learningchess2003@users.noreply.github.com> Date: Tue, 1 Aug 2023 10:40:02 -0700 Subject: [PATCH 06/13] Update common/utils/src/main/resources/error/error-classes.json Co-authored-by: Xinyi --- common/utils/src/main/resources/error/error-classes.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/utils/src/main/resources/error/error-classes.json b/common/utils/src/main/resources/error/error-classes.json index 6edcf9e2145a6..44f123002a4eb 100644 --- a/common/utils/src/main/resources/error/error-classes.json +++ b/common/utils/src/main/resources/error/error-classes.json @@ -2398,7 +2398,7 @@ }, "REQUIRED_PARAMETER_NOT_FOUND" : { "message" : [ - "Cannot invoke function because the parameter named is required at index , but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again." + "Cannot invoke function because the parameter named is required, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally at index or by name) and retry the query again." ], "sqlState" : "4274K" }, From dd0934c842a17bc2cf401ad5aaa91a844a95a4d0 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Tue, 1 Aug 2023 10:44:48 -0700 Subject: [PATCH 07/13] Adding changes --- python/pyspark/worker.py | 2 +- .../plans/logical/FunctionBuilderBase.scala | 23 +++++++++---------- .../sql/errors/QueryCompilationErrors.scala | 7 ++++-- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 9592475724221..30637d2be59a8 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -632,7 +632,7 @@ def verify_result(result): ) return result - return lambda *a: map(lambda res: (res, arrow_return_type), map(verify_result, f(*a))) + return lambda *a: map(lambda res: (res, arrow_return_type), map(verify_result, f(*a, **kw))) eval = wrap_arrow_udtf(getattr(udtf, "eval"), return_type) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala index 322ebc62c789c..1088655f60cd4 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/FunctionBuilderBase.scala @@ -104,8 +104,7 @@ object NamedParametersSupport { val positionalParametersSet = allParameterNames.take(positionalArgs.size).toSet val namedParametersSet = collection.mutable.Set[String]() - namedArgs.zipWithIndex.foreach { tuple => - val (arg, index) = tuple + namedArgs.zipWithIndex.foreach { case (arg, index) => arg match { case namedArg: NamedArgumentExpression => val parameterName = namedArg.key @@ -143,16 +142,16 @@ object NamedParametersSupport { }.toMap // We rearrange named arguments to match their positional order. - val rearrangedNamedArgs: Seq[Expression] = namedParameters.zipWithIndex.map { tuple => - val (param, index) = tuple - namedArgMap.getOrElse( - param.name, - if (param.default.isEmpty) { - throw QueryCompilationErrors.requiredParameterNotFound(functionName, param.name, index) - } else { - param.default.get - } - ) + val rearrangedNamedArgs: Seq[Expression] = namedParameters.zipWithIndex.map { + case (param, index) => + namedArgMap.getOrElse( + param.name, + if (param.default.isEmpty) { + throw QueryCompilationErrors.requiredParameterNotFound(functionName, param.name, index) + } else { + param.default.get + } + ) } val rearrangedArgs = positionalArgs ++ rearrangedNamedArgs assert(rearrangedArgs.size == parameters.size) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 455d9fe0a5a89..2bd47e88dc3dd 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -116,11 +116,14 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase { ) } - def unexpectedPositionalArgument(functionName: String, parameterName: String): Throwable = { + def unexpectedPositionalArgument( + functionName: String, + precedingNamedArgument: String): Throwable = { new AnalysisException( errorClass = "UNEXPECTED_POSITIONAL_ARGUMENT", messageParameters = Map( - "functionName" -> toSQLId(functionName), "parameterName" -> toSQLId(parameterName)) + "functionName" -> toSQLId(functionName), + "parameterName" -> toSQLId(precedingNamedArgument)) ) } From 35f50f3d41ea826d9234600bcfb38bd244724f6a Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Tue, 1 Aug 2023 10:56:34 -0700 Subject: [PATCH 08/13] doc generation --- docs/sql-error-conditions.md | 2 +- python/pyspark/worker.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/sql-error-conditions.md b/docs/sql-error-conditions.md index 6fd223aeac8f8..7d5c5df26d30f 100644 --- a/docs/sql-error-conditions.md +++ b/docs/sql-error-conditions.md @@ -1555,7 +1555,7 @@ The `` clause may be used at most once per `` operation. [SQLSTATE: 4274K](sql-error-conditions-sqlstates.html#class-42-syntax-error-or-access-rule-violation) -Cannot invoke function `` because the parameter named `` is required at index ``, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally or by name) and retry the query again. +Cannot invoke function `` because the parameter named `` is required, but the function call did not supply a value. Please update the function call to supply an argument value (either positionally at index `` or by name) and retry the query again. ### REQUIRES_SINGLE_PART_NAMESPACE diff --git a/python/pyspark/worker.py b/python/pyspark/worker.py index 30637d2be59a8..9592475724221 100644 --- a/python/pyspark/worker.py +++ b/python/pyspark/worker.py @@ -632,7 +632,7 @@ def verify_result(result): ) return result - return lambda *a: map(lambda res: (res, arrow_return_type), map(verify_result, f(*a, **kw))) + return lambda *a: map(lambda res: (res, arrow_return_type), map(verify_result, f(*a))) eval = wrap_arrow_udtf(getattr(udtf, "eval"), return_type) From e74b3055f4687c95f862297775d7d18ce39f56b1 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 2 Aug 2023 09:10:51 -0700 Subject: [PATCH 09/13] Line --- .../sql/catalyst/analysis/NamedParameterFunctionSuite.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala index 70106fa89b13a..99fed4d2ee5d9 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/NamedParameterFunctionSuite.scala @@ -63,6 +63,7 @@ class NamedParameterFunctionSuite extends AnalysisTest { final val k4Arg = NamedArgumentExpression("k4", Literal("v4")) final val namedK1Arg = NamedArgumentExpression("k1", Literal("v1-2")) final val args = Seq(k1Arg, k4Arg, k2Arg, k3Arg) + final val expectedSeq = Seq(Literal("v1"), Literal("v2"), Literal("v3"), Literal("v4")) final val signature = DummyExpressionBuilder.defaultFunctionSignature final val illegalSignature = FunctionSignature(Seq( From efcf120d39e719bd1475b658cd3b0ac71ed6f6c1 Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 2 Aug 2023 09:33:19 -0700 Subject: [PATCH 10/13] Temporarily change mods --- .github/workflows/build_and_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 02b3814a018b6..ce42bc4f07111 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -49,6 +49,9 @@ on: required: false type: string default: '' + +permissions: read-all|write-all + jobs: precondition: name: Check changes From b04547ff73c60ad001e59f4d8a1586c0b0a040fa Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 2 Aug 2023 09:35:01 -0700 Subject: [PATCH 11/13] Attempt --- .github/workflows/build_and_test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index ce42bc4f07111..02b3814a018b6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -49,9 +49,6 @@ on: required: false type: string default: '' - -permissions: read-all|write-all - jobs: precondition: name: Check changes From 6dccb53c3de8f32e826c0c771b911356d9a2673f Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 2 Aug 2023 09:53:09 -0700 Subject: [PATCH 12/13] Try --- .github/workflows/build_and_test.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 02b3814a018b6..c8a4dbc7b76de 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -19,6 +19,9 @@ name: Build and test +permissions: + contents: read + packages: write on: workflow_call: inputs: From bd1bc817b62007f01c999eb7edf673aa5525459a Mon Sep 17 00:00:00 2001 From: Richard Yu Date: Wed, 2 Aug 2023 09:54:56 -0700 Subject: [PATCH 13/13] whatever --- .github/workflows/build_and_test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c8a4dbc7b76de..02b3814a018b6 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -19,9 +19,6 @@ name: Build and test -permissions: - contents: read - packages: write on: workflow_call: inputs: