From 872f980c44c741317a8552ba65bcb0fa32ec29ac Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Mon, 16 Mar 2026 19:09:41 +0000 Subject: [PATCH 1/6] fix: handle empty queries safely, ensure golden execution, and parse config robustly --- evalbench/evaluator/dataagentevaluator.py | 3 +- evalbench/evaluator/evaluator.py | 1 + evalbench/evaluator/oneshotorchestrator.py | 3 +- evalbench/evaluator/progress_reporter.py | 2 + evalbench/test/mongodb_test.py | 5 +- evalbench/test/robustness_test.py | 38 +++++++++++++ evalbench/util/config.py | 62 ++++++++-------------- evalbench/work/sqlexecwork.py | 44 +++++++++------ evalbench/work/sqlgenquerydatawork.py | 3 +- requirements.txt | 2 + 10 files changed, 98 insertions(+), 65 deletions(-) create mode 100644 evalbench/test/robustness_test.py diff --git a/evalbench/evaluator/dataagentevaluator.py b/evalbench/evaluator/dataagentevaluator.py index 86ccda43..cc6418d1 100644 --- a/evalbench/evaluator/dataagentevaluator.py +++ b/evalbench/evaluator/dataagentevaluator.py @@ -1,3 +1,4 @@ +import traceback from typing import Any, List import datetime from work import promptgenwork @@ -96,8 +97,6 @@ def evaluate( try: result = future.result() except Exception as exc: - import traceback - print(traceback.format_exc()) print(f"A task generated an exception: {exc}") diff --git a/evalbench/evaluator/evaluator.py b/evalbench/evaluator/evaluator.py index a077f5db..0b4ce6a8 100644 --- a/evalbench/evaluator/evaluator.py +++ b/evalbench/evaluator/evaluator.py @@ -1,3 +1,4 @@ +import logging from typing import Any, List import datetime from util import truncateExecutionOutputs diff --git a/evalbench/evaluator/oneshotorchestrator.py b/evalbench/evaluator/oneshotorchestrator.py index 48556810..087c0eae 100644 --- a/evalbench/evaluator/oneshotorchestrator.py +++ b/evalbench/evaluator/oneshotorchestrator.py @@ -1,7 +1,8 @@ +import logging import concurrent.futures import datetime import json -import logging + import tempfile import threading import uuid diff --git a/evalbench/evaluator/progress_reporter.py b/evalbench/evaluator/progress_reporter.py index 705f7c0d..c4d6bf70 100644 --- a/evalbench/evaluator/progress_reporter.py +++ b/evalbench/evaluator/progress_reporter.py @@ -1,4 +1,6 @@ import logging +import os + from multiprocessing.managers import SyncManager import sys import threading diff --git a/evalbench/test/mongodb_test.py b/evalbench/test/mongodb_test.py index 4696b1a4..d7834c00 100644 --- a/evalbench/test/mongodb_test.py +++ b/evalbench/test/mongodb_test.py @@ -5,6 +5,7 @@ import json import sys import os +from evalbench.databases import mongodb sys.path.append(os.path.abspath( os.path.join(os.path.dirname(__file__), "../.."))) @@ -25,7 +26,6 @@ def client(): # Directly use mongomock.MongoClient instead of patching # This avoids issues with where MongoClient is imported - from databases import mongodb # Create a mock client mock_client = mongomock.MongoClient("mongodb://mock-host:27017") @@ -71,7 +71,8 @@ def test_aggregate(self, client): """Tests aggregation query.""" # Data already inserted in previous test (session scope fixture, but we might want to clean up) # For safety, let's insert again or assume persistence. - # mongomock is in-memory, so it persists for the session if not cleared. + # mongomock is in-memory, so it persists for the session if not + # cleared. query = json.dumps( { diff --git a/evalbench/test/robustness_test.py b/evalbench/test/robustness_test.py new file mode 100644 index 00000000..6204a09e --- /dev/null +++ b/evalbench/test/robustness_test.py @@ -0,0 +1,38 @@ +import time +from queue import Queue +from unittest.mock import MagicMock +from work.sqlexecwork import SQLExecWork +import unittest + + +class TestExecutionBugs(unittest.TestCase): + + def test_sqlexecwork_handles_empty_query_safely(self): + db = MagicMock() + db_queue = Queue() + eval_result = { + "sql_generator_error": None, + "generated_sql": " ", + "query_type": "dql", + "eval_query": [], + "golden_sql": "", + "preprocess_sql": [] + } + config = { + "prompt_generator": "NOOPGenerator", + "dialect": "sqlite" + } + + work = SQLExecWork(db, config, eval_result, db_queue) + + # Should not raise "list index out of range" + result = work.run() + + self.assertIsNone(result.get("generated_result")) + self.assertEqual( + result.get("generated_error"), + "list index out of range (empty query)") + + +if __name__ == '__main__': + unittest.main() diff --git a/evalbench/util/config.py b/evalbench/util/config.py index 5b86c37f..2eaa0a7a 100644 --- a/evalbench/util/config.py +++ b/evalbench/util/config.py @@ -1,3 +1,4 @@ +import json import datetime import logging import os @@ -31,7 +32,10 @@ def load_db_data_from_csvs(data_directory: str): current_directory = os.getcwd() if not os.path.isdir(os.path.join(current_directory, data_directory)): return tables - for filename in os.listdir(os.path.join(current_directory, data_directory)): + for filename in os.listdir( + os.path.join( + current_directory, + data_directory)): if filename.endswith(".csv"): table_name = filename[:-4] with open( @@ -68,10 +72,10 @@ def load_setup_scripts(setup_scripts_directory_path: str): current_directory, setup_scripts_directory_path, "post_setup.json" ) if os.path.exists(post_setup_json_path): - import json with open(post_setup_json_path, "r") as f: - # Load as list of dicts, then convert back to strings for batch_execute + # Load as list of dicts, then convert back to strings for + # batch_execute try: data = json.load(f) if isinstance(data, list): @@ -83,8 +87,9 @@ def load_setup_scripts(setup_scripts_directory_path: str): else: post_setup = _load_setup_sql( os.path.join( - current_directory, setup_scripts_directory_path, "post_setup.sql" - ), + current_directory, + setup_scripts_directory_path, + "post_setup.sql"), ) return (pre_setup, setup, post_setup) @@ -125,40 +130,9 @@ def config_to_df( } ) df = pd.DataFrame.from_dict(configs) - df[["job_id", "config", "value"]] = df[["job_id", "config", "value"]].astype( - "string" - ) - return df - - -def df_to_config(df: pd.DataFrame) -> dict: - import ast - - original_dict = {} - - for _, row in df.iterrows(): - key_path = row["config"] - value_str = row["value"] - - try: - if pd.isna(value_str): - value = None - else: - value = ast.literal_eval(value_str) - except (ValueError, SyntaxError, TypeError): - value = value_str - - keys = key_path.split(".") - - current_level = original_dict - for key in keys[:-1]: - if key not in current_level: - current_level[key] = {} - current_level = current_level[key] - - current_level[keys[-1]] = value - - return original_dict + df[["job_id", "config", "value"]] = df[[ + "job_id", "config", "value"]].astype("string") + return config def update_google3_relative_paths( @@ -171,7 +145,8 @@ def update_google3_relative_paths( elif isinstance(value, list): values = [] for sub_value in value: - if isinstance(sub_value, str) and sub_value.startswith("google3/"): + if isinstance(sub_value, + str) and sub_value.startswith("google3/"): values.append(get_google3_relative_path( sub_value, session_id)) elif isinstance(sub_value, str) and sub_value in resource_map: @@ -208,7 +183,12 @@ def get_google3_relative_path(value, session_id): def set_session_configs(session, experiment_config: dict): session["config"] = experiment_config if "dataset_config" in experiment_config and experiment_config["dataset_config"]: - session["dataset_config"] = experiment_config["dataset_config"] + # Handle both flat string paths and nested dicts (e.g. BIRD configs) + dc = experiment_config["dataset_config"] + if isinstance(dc, dict) and "prompts_file" in dc: + session["dataset_config"] = dc["prompts_file"] + else: + session["dataset_config"] = dc if ( "database_configs" in experiment_config and experiment_config["database_configs"] diff --git a/evalbench/work/sqlexecwork.py b/evalbench/work/sqlexecwork.py index faee8eb0..69ad2d96 100644 --- a/evalbench/work/sqlexecwork.py +++ b/evalbench/work/sqlexecwork.py @@ -40,16 +40,27 @@ def run(self, work_config: Any = None) -> dict: golden_eval_result = None golden_error = None + query_type = self.eval_result["query_type"] + eval_query = self._get_eval_query() + preprocess_sql = self._get_preprocess_sql_query() + golden_sql = self._get_golden_sql() + + if golden_sql: + golden_result, golden_eval_result, golden_error = ( + self._evaluate_execution_results( + golden_sql, + preprocess_sql, + eval_query, + query_type, + is_golden=True, + ) + ) + if ( self.eval_result["sql_generator_error"] is None - and self.eval_result["generated_sql"] + and self.eval_result.get("generated_sql") ): - query_type = self.eval_result["query_type"] - eval_query = self._get_eval_query() sanitized_generated_sql = self._sanitize_sql() - preprocess_sql = self._get_preprocess_sql_query() - golden_sql = self._get_golden_sql() - if sanitized_generated_sql: generated_result, generated_eval_result, generated_error = ( self._evaluate_execution_results( @@ -60,15 +71,6 @@ def run(self, work_config: Any = None) -> dict: is_golden=False, ) ) - golden_result, golden_eval_result, golden_error = ( - self._evaluate_execution_results( - golden_sql, - preprocess_sql, - eval_query, - query_type, - is_golden=True, - ) - ) self.eval_result["generated_result"] = generated_result self.eval_result["eval_results"] = generated_eval_result @@ -91,10 +93,17 @@ def _evaluate_execution_results( self.db.execute(preprocess_sql) except Exception as preprocess_error: traceback.print_exc() + + if not query or not query.strip(): + return None, None, "list index out of range (empty query)" + if query_type == "dql": try: + stmts = sqlparse.split(query) + if not stmts: + return None, None, "list index out of range (empty query)" result, _, error = self.db.execute( - sqlparse.split(query)[0], use_cache=True, rollback=True + stmts[0], use_cache=True, rollback=True ) except Exception as e: error = str(e) @@ -143,7 +152,8 @@ def _get_golden_sql(self): return golden_sql def _get_eval_query(self): - if self.eval_result["eval_query"] and len(self.eval_result["eval_query"]) > 0: + if self.eval_result["eval_query"] and len( + self.eval_result["eval_query"]) > 0: return self.eval_result["eval_query"][0] else: return None diff --git a/evalbench/work/sqlgenquerydatawork.py b/evalbench/work/sqlgenquerydatawork.py index 7b055191..e4c6801a 100644 --- a/evalbench/work/sqlgenquerydatawork.py +++ b/evalbench/work/sqlgenquerydatawork.py @@ -1,3 +1,4 @@ +import traceback """Work is the base class for all work items.""" from typing import Any @@ -30,8 +31,6 @@ def run(self, work_config: str = None) -> dict: self.eval_result["generated_sql"] = None self.eval_result["sql_generator_error"] = "No result generated" except Exception as e: - import traceback - traceback.print_exc() sql_generator_error = str(e) diff --git a/requirements.txt b/requirements.txt index 1bda7a12..6f759e07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -33,3 +33,5 @@ mongomock rich google-adk mcp +pytest +sqlparse From 8a7a11c9f9ed9b4204247447720d54aeedf28b2c Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Wed, 18 Mar 2026 05:44:03 +0000 Subject: [PATCH 2/6] Fix mongodb_test by removing evalbench. prefix to avoid duplicate module loading --- evalbench/test/mongodb_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalbench/test/mongodb_test.py b/evalbench/test/mongodb_test.py index b0da112c..3e1c8ae0 100644 --- a/evalbench/test/mongodb_test.py +++ b/evalbench/test/mongodb_test.py @@ -5,7 +5,7 @@ import json import sys import os -from evalbench.databases import mongodb +from databases import mongodb # Mocking the MongoClient to use mongomock From 65361881a2b0ab610ad15d75893ad973285da3f4 Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Wed, 18 Mar 2026 05:44:18 +0000 Subject: [PATCH 3/6] Cleanup duplicate sqlparse in requirements.txt --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index bd03ce05..1186ed46 100644 --- a/requirements.txt +++ b/requirements.txt @@ -34,5 +34,4 @@ rich google-adk mcp pytest -sqlparse mock From 74429525f13242dba3e87fe62f88709e5891d3ba Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Wed, 1 Apr 2026 20:22:52 +0000 Subject: [PATCH 4/6] style: remove extra blank lines in mongodb_test.py to satisfy pycodestyle Removed several unnecessary blank lines within the client pytest fixture and above it in evalbench/test/mongodb_test.py to resolve an E303 too many blank lines (2) linter failure reported by pycodestyle. --- evalbench/test/mongodb_test.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/evalbench/test/mongodb_test.py b/evalbench/test/mongodb_test.py index 54e048f4..fd954c87 100644 --- a/evalbench/test/mongodb_test.py +++ b/evalbench/test/mongodb_test.py @@ -7,7 +7,6 @@ import os from databases import mongodb - # --------------------------------------------------------------------------- # Shared fixture # --------------------------------------------------------------------------- @@ -22,15 +21,11 @@ def client(): "max_executions_per_minute": 100, "connection_string": "mongodb://mock-host:27017", } - # Directly use mongomock.MongoClient instead of patching # This avoids issues with where MongoClient is imported - - mock_client = mongomock.MongoClient("mongodb://mock-host:27017") original_client = mongodb.MongoClient mongodb.MongoClient = lambda *args, **kwargs: mock_client - try: db = get_database(db_config, "unit_test_db") From 490992cf15c12140daa1b5f76300f9ab4abcd388 Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Wed, 1 Apr 2026 20:33:33 +0000 Subject: [PATCH 5/6] style: fix E302 expected 2 blank lines in mongodb_test.py Adjusted spacing before the client fixture and its comment block to satisfy PEP 8 / pycodestyle requirements. --- evalbench/test/mongodb_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalbench/test/mongodb_test.py b/evalbench/test/mongodb_test.py index fd954c87..8ad48677 100644 --- a/evalbench/test/mongodb_test.py +++ b/evalbench/test/mongodb_test.py @@ -7,10 +7,10 @@ import os from databases import mongodb + # --------------------------------------------------------------------------- # Shared fixture # --------------------------------------------------------------------------- - @pytest.fixture(scope="module") def client(): """MongoDB client backed by mongomock, seeded with e-commerce documents.""" From 4e74c4d8b4612cfece6ab54936299139fb719ef6 Mon Sep 17 00:00:00 2001 From: Adam Seering Date: Thu, 2 Apr 2026 01:34:13 +0000 Subject: [PATCH 6/6] Return the generated dataframe, not the raw dict --- evalbench/util/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalbench/util/config.py b/evalbench/util/config.py index 2eaa0a7a..832e85c7 100644 --- a/evalbench/util/config.py +++ b/evalbench/util/config.py @@ -132,7 +132,7 @@ def config_to_df( df = pd.DataFrame.from_dict(configs) df[["job_id", "config", "value"]] = df[[ "job_id", "config", "value"]].astype("string") - return config + return df def update_google3_relative_paths(