From 7df768529bcff7bd69f5454272ee00fa495ffa3a Mon Sep 17 00:00:00 2001 From: totoleon Date: Fri, 3 Apr 2026 19:37:23 +0000 Subject: [PATCH 1/5] wip --- evalbench/scorers/llmrater_v2.py | 278 +++++++++++++++++++++++++++++++ evalbench/scorers/score.py | 4 + 2 files changed, 282 insertions(+) create mode 100644 evalbench/scorers/llmrater_v2.py diff --git a/evalbench/scorers/llmrater_v2.py b/evalbench/scorers/llmrater_v2.py new file mode 100644 index 00000000..fa0fa784 --- /dev/null +++ b/evalbench/scorers/llmrater_v2.py @@ -0,0 +1,278 @@ +""" +LLMRater +In this comparison strategy, an LLM compares the golden execution results with the generated sql execution results. +It returns a score between 0 and 100, with a score of 100 for concrete positive cases, +where either there is a mismatch of columns names, extra relevant columns, or harmless unrequested sorting/limits in Generated SQL. + +Evaluation rules given to LLM: + 1. Assume OUTPUT #1 is the gold standard and is ALWAYS correct. + 2. The order of columns in OUTPUT #2 does not matter. + 3. The order of rows in OUTPUT #2 does not matter UNLESS explicitly requested in the prompt. + 4. Allow slight variations due to differences in rounding or precision, for calculated values. + 5. Allow acceptable divergences based on relaxed criteria (ambiguous counts, null handling, relative dates, IDs vs Names, ambiguous limits, etc.). + 6. The mapped column names might differ, do not make any assumptions based on them. + +Run Configuration Options: + 1. model_config: Required + - File that defines the configuration settings for the LLM model to be used in evaluation. +""" + +from typing import Tuple +from generators.models import get_generator +from scorers import setmatcher +import logging + +from scorers import comparator +from .util import make_hashable, with_cache_execute +from databases.util import get_cache_client + +ERROR_CATEGORIZATION_PROMPT = """ +You are an expert SQL evaluator. 
Your task is to analyze a "Generated SQL" query against a "Golden SQL" (ground truth) query and their respective execution results. + +### Input Data +**NL Prompt:** {nl_prompt} +**Golden SQL:** {golden_sql} +**Golden Result:** {golden_execution_result} +**Generated SQL:** {generated_sql} +**Generated Result:** {generated_execution_result} + +### Task +Compare the queries and results to identify specific errors in the Generated SQL. If the Generated SQL is functionally equivalent to the Golden SQL (even if syntax differs), mark it as correct. + +### Error Taxonomy +If errors exist, categorize them using ONLY the following tags: + +1. [EntityError] - Wrong table or entity was used. +2. [ValueLinkingError] - Wrong literal value (string/number) was used. +3. [ColumnLinkingError] - Wrong column was selected or used in a condition. +4. [OrderingError] - Sorting order (ASC/DESC) or column is incorrect (only flag if prompt explicitly requested sorting). +5. [InstructionError] - Failed to follow specific constraints in the prompt (e.g., "return top 5"). +6. [IntentError] - Misinterpreted the user's fundamental request. +7. [DataTypesError] - Incorrect handling of data types (e.g., casting, dates). +8. [CountingError] - Aggregation or counting logic is flawed. +9. [FilterError] - Correct columns used, but wrong logical operator or filter condition. +10. [LogicError] - Fundamental logic flaw not covered by other categories (e.g., wrong join type). +11. [OtherError] - Any other error not covered by the above categories. + +### Output Format +Provide your response in the following format: + +**Reasoning:** + + +**Tags & Explanations:** +: +: +""" + + +class LLMRater(comparator.Comparator): + """ + LLMRater class implements the Comparator base class. + + Attributes: + 1. name: Name of the comparator. Set to "llmrater" + 2. model_config: File that defines the configuration settings for the LLM model used in evaluation. 
+ """ + + def __init__(self, config: dict, global_models): + self.name = "llmrater" + self.set_match_checker = setmatcher.SetMatcher({}) + self.cache_client = get_cache_client(config) + self.model_config = config.get("model_config") or "" + if not self.model_config: + raise ValueError("model_config is required for LLM Rater") + self.model = get_generator(global_models, self.model_config) + + def _is_exact_match( + self, + nl_prompt: str, + golden_query: str, + query_type: str, + golden_execution_result: list, + golden_eval_result: str, + golden_error: str, + generated_query: str, + generated_execution_result: list, + generated_eval_result: str, + generated_error: str, + ): + score, _ = self.set_match_checker.compare( + nl_prompt, + golden_query, + query_type, + golden_execution_result, + golden_eval_result, + golden_error, + generated_query, + generated_execution_result, + generated_eval_result, + generated_error, + ) + return score == 100 + + def _inference_without_caching(self, prompt): + if self.model is None: + raise RuntimeError("Model not initialized") + return self.model.generate(prompt) + + @staticmethod + def take_n_uniques(output_list: list, n: int) -> list: + """Takes n number of unique (non duplicate) values from the output list. + + Args: + output_list: The execution output result set + n: Max number of unique values needed. + + Returns: + The execution output result set without duplicates in a size of n values or less. 
+ """ + seen_dicts = set() + new_list = [] + for d in output_list: + # Convert the dictionary to a hashable frozenset for efficient lookup + t = frozenset((k, make_hashable(v)) for k, v in d.items()) + if t not in seen_dicts: + seen_dicts.add(t) + new_list.append(d) + if len(new_list) == n: + break + return new_list + + def compare( + self, + nl_prompt: str, + golden_query: str, + query_type: str, + golden_execution_result: list, + golden_eval_result: str, + golden_error: str, + generated_query: str, + generated_execution_result: list, + generated_eval_result: str, + generated_error: str, + ) -> Tuple[float, str]: + if self._is_exact_match( + nl_prompt, + golden_query, + query_type, + golden_execution_result, + golden_eval_result, + golden_error, + generated_query, + generated_execution_result, + generated_eval_result, + generated_error, + ): + return 100, "Skipped. Exact Match was found." + + if golden_error: + return 0, "Golden query failed to execute." + if generated_error: + return 0, "Generated query failed to execute." + + only_first_n = 50 + + golden_execution_result = self.take_n_uniques( + golden_execution_result, only_first_n + ) + generated_execution_result = self.take_n_uniques( + generated_execution_result, only_first_n + ) + + prompt = f""" + We are trying to answer this question by querying a database: + + QUESTION: {nl_prompt} + + The correct answer to this question is: + + OUTPUT #1 (Gold Standard): + + {golden_execution_result} + + + We get the following answer from a generated query: + + OUTPUT #2 (Generated Result): + + {generated_execution_result} + + + Thinking step by step, compare the two outputs and look for differences in data and presentation. + Here are steps to follow: + + 1. Analyze the QUESTION: Does it explicitly ask for a specific sorting order (e.g., "ordered by date", "top 5")? Does it explicitly ask for a limit? + 2. 
Column Mapping: Ensure that every column in OUTPUT #1 has a corresponding column in OUTPUT #2 that represents the same information. OUTPUT #2 is allowed to have additional descriptive columns. + 3. Data Comparison: Compare the data within each mapped column pair. + 4. Row Order: Ignore differences in row order UNLESS the QUESTION explicitly requested a specific sorting. Treat the data as unordered sets if no order is specified. + 5. Extra Rows: If OUTPUT #2 has extra rows but contains all of OUTPUT #1, evaluate if the extra rows violate the prompt's constraints. If the prompt was ambiguous about limits (e.g. "Identify the MSA with the highest growth" and the model returns a ranked list instead of a single row), treating it as EXTRA_INFORMATION is acceptable and correct. + + RULES & RELAXED EVALUATION CRITERIA - These MUST be strictly followed: + + 1. Assume OUTPUT #1 is the gold standard and its core data values are ALWAYS mathematically/logically correct. + 2. The mapped column names might differ, do not make any assumptions based on them. + 3. Do NOT penalize OUTPUT #2 if it differs from OUTPUT #1 for ANY of the following reasons: + - Column/Row Order: Differences in column names, column order, or row order when no requirements are specified in the QUESTION. + - Rounding: Differences in integer/decimal rounding or precision when the QUESTION lacks specific guidelines. + - Ambiguous "Top X": The QUESTION requests "first" or "top" X entries but doesn't specify an ordering field, yielding different subsets. + - Null/NA Handling: Differences in including vs. excluding 'null' or 'NA' values when the QUESTION does not specify. + - Ambiguous Limit: The QUESTION asks for "top/highest" or "bottom/lowest" entries but doesn't specify a concrete limit, leading to different numbers of entries. + - Entity Representation: The QUESTION asks for a list of items but doesn't specify IDs or names, leading one output to return IDs and the other names. 
+ - Extra Columns: OUTPUT #2 has a small number of extra columns that are not explicitly excluded and don't render the overall result incorrect. + - Truncation/Subsets: Truncation for display, or differing subsets of data when ordering is not specified. + - Fewer than X: Returning fewer than X entries for a "top X" request because fewer entries meet the underlying criteria. + - Relative Time/Date: Differences arising because queries were evaluated at different assumed current times/dates (e.g., "last two years"). + + FINAL QUESTION: Does OUTPUT #2 provide the same information as OUTPUT #1? + FINAL ANSWER: Choose ONLY ONE + - INFORMATION_MATCHES -- OUTPUT #1 and OUTPUT #2 provide the same core information (or differences fall under the acceptable relaxed criteria). + - MISSING_INFORMATION -- Something important requested by the QUESTION is missing from OUTPUT #2 (e.g. data points dropped, missing expected columns). + - EXTRA_INFORMATION -- OUTPUT #2 includes the correct answer but added non-harmful extra relevant columns, or harmless extra rows due to an ambiguous limit/sorting constraint in the QUESTION. + - INCORRECT_INFORMATION -- OUTPUT #2 contains mathematically or logically incorrect data, wrong aggregations, bad joins, missing expected rows, or violates explicit constraints in the QUESTION. + """ + + logging.debug("\n --------- prompt: --------- \n %s ", prompt) + + if self.cache_client: + response = with_cache_execute( + prompt, + self.model_config, + self._inference_without_caching, + self.cache_client, + ) + else: + response = self._inference_without_caching(prompt) + + logging.debug( + "\n --------- llm_rater_output: --------- \n %s ", response) + + # Scoring Logic: Both INFORMATION_MATCHES and EXTRA_INFORMATION are rewarded as correct. 
+ score = ( + 100 + if ("INFORMATION_MATCHES" in response or "EXTRA_INFORMATION" in response) + else 0 + ) + + if score == 0: + prompt = ERROR_CATEGORIZATION_PROMPT.format( + nl_prompt=nl_prompt, + golden_sql=golden_query, + golden_execution_result=golden_execution_result, + generated_sql=generated_query, + generated_execution_result=generated_execution_result, + ) + if self.cache_client: + error_categorization_response = with_cache_execute( + prompt, + self.model_config, + self._inference_without_caching, + self.cache_client, + ) + else: + error_categorization_response = self._inference_without_caching( + prompt) + + response += "\nError analysis:\n\n" + error_categorization_response + + return score, response diff --git a/evalbench/scorers/score.py b/evalbench/scorers/score.py index ca50de18..c31a2b86 100644 --- a/evalbench/scorers/score.py +++ b/evalbench/scorers/score.py @@ -6,6 +6,7 @@ from scorers import recallmatcher from scorers import setmatcher from scorers import llmrater +from scorers import llmrater_v2 from scorers import returnedsql from scorers import executablesql from scorers import trajectorymatcher @@ -44,6 +45,9 @@ def compare( if "llmrater" in scorers: comparators.append(llmrater.LLMRater( scorers["llmrater"], global_models)) + if "llmrater_v2" in scorers: + comparators.append(llmrater_v2.LLMRaterV2( + scorers["llmrater_v2"], global_models)) if "regexp_matcher" in scorers: comparators.append( generatedqueryregexpmatcher.GeneratedQueryRegexpMatcher( From b3ec721c5e4be84d3509c7139c323f8e8c725ea4 Mon Sep 17 00:00:00 2001 From: totoleon Date: Fri, 3 Apr 2026 20:45:29 +0000 Subject: [PATCH 2/5] Fix llmrater_v2 class name reference in score.py --- evalbench/scorers/score.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalbench/scorers/score.py b/evalbench/scorers/score.py index c31a2b86..50191761 100644 --- a/evalbench/scorers/score.py +++ b/evalbench/scorers/score.py @@ -46,7 +46,7 @@ def compare( 
comparators.append(llmrater.LLMRater( scorers["llmrater"], global_models)) if "llmrater_v2" in scorers: - comparators.append(llmrater_v2.LLMRaterV2( + comparators.append(llmrater_v2.LLMRater( scorers["llmrater_v2"], global_models)) if "regexp_matcher" in scorers: comparators.append( From c08134cee78089ccded0bc23f56b18e93f37a555 Mon Sep 17 00:00:00 2001 From: totoleon Date: Fri, 3 Apr 2026 21:30:51 +0000 Subject: [PATCH 3/5] Fix llmrater_v2 scorer name to match config key --- evalbench/scorers/llmrater_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalbench/scorers/llmrater_v2.py b/evalbench/scorers/llmrater_v2.py index fa0fa784..4ebd28a1 100644 --- a/evalbench/scorers/llmrater_v2.py +++ b/evalbench/scorers/llmrater_v2.py @@ -76,7 +76,7 @@ class LLMRater(comparator.Comparator): """ def __init__(self, config: dict, global_models): - self.name = "llmrater" + self.name = "llmrater_v2" self.set_match_checker = setmatcher.SetMatcher({}) self.cache_client = get_cache_client(config) self.model_config = config.get("model_config") or "" From 50934bf222e1d69c44eaa8853d183ba3d92b428c Mon Sep 17 00:00:00 2001 From: totoleon Date: Fri, 3 Apr 2026 22:00:06 +0000 Subject: [PATCH 4/5] Retry on Gemini 5xx ServerError (e.g. 502) via rate limiter --- evalbench/generators/models/gemini.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/evalbench/generators/models/gemini.py b/evalbench/generators/models/gemini.py index 6290959e..4b090e10 100644 --- a/evalbench/generators/models/gemini.py +++ b/evalbench/generators/models/gemini.py @@ -1,5 +1,6 @@ from google import genai from google.genai.types import GenerateContentResponse +from google.genai.errors import ServerError from util.rate_limit import ResourceExhaustedError from util.gcp import get_gcp_project, get_gcp_region from google.api_core.exceptions import ResourceExhausted @@ -36,7 +37,7 @@ def generate_internal(self, prompt): if isinstance(response, GenerateContentResponse): r = 
sanitize_sql(response.text) return r - except ResourceExhausted as e: + except (ResourceExhausted, ServerError) as e: raise ResourceExhaustedError(e) except Exception as e: logger.exception("Unhandled exception during generate_content") From b4010f9f6481ca76885a82c12f4bea6473e46707 Mon Sep 17 00:00:00 2001 From: totoleon Date: Fri, 3 Apr 2026 22:59:14 +0000 Subject: [PATCH 5/5] wip --- evalbench/scorers/llmrater.py | 69 +++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 32 deletions(-) diff --git a/evalbench/scorers/llmrater.py b/evalbench/scorers/llmrater.py index 2b758c00..fa0fa784 100644 --- a/evalbench/scorers/llmrater.py +++ b/evalbench/scorers/llmrater.py @@ -2,15 +2,15 @@ LLMRater In this comparison strategy, an LLM compares the golden execution results with the generated sql execution results. It returns a score between 0 and 100, with a score of 100 for concrete positive cases, -where either there is a mismatch of columns names or extra relevant columns in Generated SQL exists. +where either there is a mismatch of columns names, extra relevant columns, or harmless unrequested sorting/limits in Generated SQL. Evaluation rules given to LLM: 1. Assume OUTPUT #1 is the gold standard and is ALWAYS correct. 2. The order of columns in OUTPUT #2 does not matter. - 3. Allow slight variations due to differences in rounding or precision, for calculated values. - Otherwise ensure exact matches for numbers, dates, timestamps, measurements, and metrics - between the two outputs. - 4. The mapped column names might differ, do not make any assumptions based on them. + 3. The order of rows in OUTPUT #2 does not matter UNLESS explicitly requested in the prompt. + 4. Allow slight variations due to differences in rounding or precision, for calculated values. + 5. Allow acceptable divergences based on relaxed criteria (ambiguous counts, null handling, relative dates, IDs vs Names, ambiguous limits, etc.). + 6. 
The mapped column names might differ, do not make any assumptions based on them. Run Configuration Options: 1. model_config: Required @@ -45,7 +45,7 @@ 1. [EntityError] - Wrong table or entity was used. 2. [ValueLinkingError] - Wrong literal value (string/number) was used. 3. [ColumnLinkingError] - Wrong column was selected or used in a condition. -4. [OrderingError] - Sorting order (ASC/DESC) or column is incorrect. +4. [OrderingError] - Sorting order (ASC/DESC) or column is incorrect (only flag if prompt explicitly requested sorting). 5. [InstructionError] - Failed to follow specific constraints in the prompt (e.g., "return top 5"). 6. [IntentError] - Misinterpreted the user's fundamental request. 7. [DataTypesError] - Incorrect handling of data types (e.g., casting, dates). @@ -187,46 +187,49 @@ def compare( The correct answer to this question is: - OUTPUT #1: + OUTPUT #1 (Gold Standard): {golden_execution_result} - We get the following answer from a second query. + We get the following answer from a generated query: - OUTPUT #2 + OUTPUT #2 (Generated Result): {generated_execution_result} - Thinking step by step, compare the two outputs and look for differences in data presentation. + Thinking step by step, compare the two outputs and look for differences in data and presentation. Here are steps to follow: - 1. Ensure that every column in OUTPUT #1 has a corresponding column in OUTPUT #2 that represents - the same information, even if the column names are different. - 2. All columns present in OUTPUT #1 must also be present in OUTPUT #2. OUTPUT #2 is allowed to - have additional columns relevant to the query. - 3. Compare the data within each mapped column pair between OUTPUT #1 and OUTPUT #2. - Ensure that OUTPUT #2 contains all the data from OUTPUT #1, with no missing or extra rows. - 4. Minor string transformations are allowed (e.g., concatenating first and last name), but the - underlying information must be preserved. 
- - RULES - These MUST be strictly followed, to answer the FINAL QUESTION: - - 1. Assume OUTPUT #1 is the gold standard and is ALWAYS correct. - 2. The order of columns in OUTPUT #2 does not matter. - 3. Allow slight variations due to differences in rounding or precision, for calculated values. - Otherwise ensure exact matches for numbers, dates, timestamps, measurements, and metrics - between the two outputs. - 4. The mapped column names might differ, do not make any assumptions based on them. + 1. Analyze the QUESTION: Does it explicitly ask for a specific sorting order (e.g., "ordered by date", "top 5")? Does it explicitly ask for a limit? + 2. Column Mapping: Ensure that every column in OUTPUT #1 has a corresponding column in OUTPUT #2 that represents the same information. OUTPUT #2 is allowed to have additional descriptive columns. + 3. Data Comparison: Compare the data within each mapped column pair. + 4. Row Order: Ignore differences in row order UNLESS the QUESTION explicitly requested a specific sorting. Treat the data as unordered sets if no order is specified. + 5. Extra Rows: If OUTPUT #2 has extra rows but contains all of OUTPUT #1, evaluate if the extra rows violate the prompt's constraints. If the prompt was ambiguous about limits (e.g. "Identify the MSA with the highest growth" and the model returns a ranked list instead of a single row), treating it as EXTRA_INFORMATION is acceptable and correct. + + RULES & RELAXED EVALUATION CRITERIA - These MUST be strictly followed: + + 1. Assume OUTPUT #1 is the gold standard and its core data values are ALWAYS mathematically/logically correct. + 2. The mapped column names might differ, do not make any assumptions based on them. + 3. Do NOT penalize OUTPUT #2 if it differs from OUTPUT #1 for ANY of the following reasons: + - Column/Row Order: Differences in column names, column order, or row order when no requirements are specified in the QUESTION. 
+ - Rounding: Differences in integer/decimal rounding or precision when the QUESTION lacks specific guidelines. + - Ambiguous "Top X": The QUESTION requests "first" or "top" X entries but doesn't specify an ordering field, yielding different subsets. + - Null/NA Handling: Differences in including vs. excluding 'null' or 'NA' values when the QUESTION does not specify. + - Ambiguous Limit: The QUESTION asks for "top/highest" or "bottom/lowest" entries but doesn't specify a concrete limit, leading to different numbers of entries. + - Entity Representation: The QUESTION asks for a list of items but doesn't specify IDs or names, leading one output to return IDs and the other names. + - Extra Columns: OUTPUT #2 has a small number of extra columns that are not explicitly excluded and don't render the overall result incorrect. + - Truncation/Subsets: Truncation for display, or differing subsets of data when ordering is not specified. + - Fewer than X: Returning fewer than X entries for a "top X" request because fewer entries meet the underlying criteria. + - Relative Time/Date: Differences arising because queries were evaluated at different assumed current times/dates (e.g., "last two years"). FINAL QUESTION: Does OUTPUT #2 provide the same information as OUTPUT #1? FINAL ANSWER: Choose ONLY ONE - - INFORMATION_MATCHES -- OUTPUT #1 and OUTPUT #2 provide the same information. - - MISSING_INFORMATION -- Something important is missing from OUTPUT #2. - - EXTRA_INFORMATION -- Some non-harmful extra relevant columns were added to OUTPUT #2. - - INCORRECT_INFORMATION -- Some incorrect information was added to OUTPUT #2, likely due to - an incorrect filter or incorrect aggregation. + - INFORMATION_MATCHES -- OUTPUT #1 and OUTPUT #2 provide the same core information (or differences fall under the acceptable relaxed criteria). + - MISSING_INFORMATION -- Something important requested by the QUESTION is missing from OUTPUT #2 (e.g. data points dropped, missing expected columns). 
+ - EXTRA_INFORMATION -- OUTPUT #2 includes the correct answer but added non-harmful extra relevant columns, or harmless extra rows due to an ambiguous limit/sorting constraint in the QUESTION. + - INCORRECT_INFORMATION -- OUTPUT #2 contains mathematically or logically incorrect data, wrong aggregations, bad joins, missing expected rows, or violates explicit constraints in the QUESTION. """ logging.debug("\n --------- prompt: --------- \n %s ", prompt) @@ -243,6 +246,8 @@ def compare( logging.debug( "\n --------- llm_rater_output: --------- \n %s ", response) + + # Scoring Logic: Both INFORMATION_MATCHES and EXTRA_INFORMATION are rewarded as correct. score = ( 100 if ("INFORMATION_MATCHES" in response or "EXTRA_INFORMATION" in response)