Skip to content

Commit 1914590

Browse files
committed
feat: align eval model selection with aliases
1 parent bf224e1 commit 1914590

File tree

2 files changed

+192
-87
lines changed

2 files changed

+192
-87
lines changed

eval/evaluate_browser_agent.py

Lines changed: 106 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import atexit
2525
import logging
2626
import datetime
27-
from urllib.parse import urlparse
2827

2928
logger = logging.getLogger(__name__)
3029

@@ -88,12 +87,11 @@ class TestResult:
8887

8988
@dataclass
9089
class LLMTarget:
91-
"""One explicit LLM target passed from the CLI."""
90+
"""One configured LLM alias passed from the CLI."""
9291

9392
name: str
94-
model: str
95-
base_url: str
96-
api_key: str
93+
alias: str
94+
model_name: str | None = None
9795

9896

9997
class OpenBrowserClient:
@@ -114,38 +112,41 @@ def health_check(self) -> bool:
114112
except requests.exceptions.RequestException:
115113
return False
116114

117-
def configure_llm(self, target: LLMTarget) -> bool:
118-
"""Configure the OpenBrowser server with the exact LLM triple for eval."""
115+
def get_llm_configs(self) -> List[Dict[str, Any]]:
116+
"""Fetch configured LLM entries from the server."""
119117
try:
120-
response = self.session.post(
121-
f"{self.base_url}/api/config/llm",
122-
json={
123-
"model": target.model,
124-
"base_url": target.base_url,
125-
"api_key": target.api_key,
126-
},
127-
timeout=5,
128-
)
129-
return response.status_code == 200
118+
response = self.session.get(f"{self.base_url}/api/config", timeout=5)
119+
if response.status_code != 200:
120+
return []
121+
data = response.json()
122+
config = data.get("config", {})
123+
llm_configs = config.get("llm_configs", [])
124+
return llm_configs if isinstance(llm_configs, list) else []
130125
except Exception as e:
131-
logger.error(f"Failed to configure LLM target {target.name}: {e}")
132-
return False
126+
logger.error(f"Failed to fetch LLM configs: {e}")
127+
return []
133128

134129
def create_conversation(
135-
self, model: Optional[str] = None, base_url: Optional[str] = None
130+
self,
131+
model: Optional[str] = None,
132+
base_url: Optional[str] = None,
133+
model_alias: Optional[str] = None,
136134
) -> Optional[str]:
137135
"""Create a new conversation and return its ID
138136
139137
Args:
140138
model: Optional model name (e.g., "dashscope/qwen3.5-plus")
141139
base_url: Optional base URL override
140+
model_alias: Optional configured model alias
142141
"""
143142
try:
144143
request_json = {}
145144
if model:
146145
request_json["model"] = model
147146
if base_url:
148147
request_json["base_url"] = base_url
148+
if model_alias:
149+
request_json["model_alias"] = model_alias
149150
if self.chrome_uuid:
150151
request_json["browser_id"] = self.chrome_uuid
151152

@@ -405,6 +406,40 @@ def __init__(self, chrome_uuid: Optional[str] = None):
405406
self.current_model: Optional[str] = None # Current model being tested
406407
self.current_target: Optional[LLMTarget] = None # Current CLI target
407408

409+
def resolve_targets(self, targets: List[LLMTarget]) -> List[LLMTarget]:
410+
"""Resolve configured aliases to raw model names."""
411+
llm_configs = self.openbrowser.get_llm_configs()
412+
alias_to_model = {
413+
config.get("alias"): config.get("model")
414+
for config in llm_configs
415+
if isinstance(config, dict) and config.get("alias") and config.get("model")
416+
}
417+
418+
resolved_targets: List[LLMTarget] = []
419+
missing_aliases: List[str] = []
420+
421+
for target in targets:
422+
model_name = alias_to_model.get(target.alias)
423+
if not isinstance(model_name, str) or not model_name:
424+
missing_aliases.append(target.alias)
425+
continue
426+
resolved_targets.append(
427+
LLMTarget(
428+
name=model_name,
429+
alias=target.alias,
430+
model_name=model_name,
431+
)
432+
)
433+
434+
if missing_aliases:
435+
raise ValueError(
436+
"Unknown model alias(es): "
437+
+ ", ".join(missing_aliases)
438+
+ ". Configure them first in the OpenBrowser frontend."
439+
)
440+
441+
return resolved_targets
442+
408443
def ensure_services(
409444
self, skip_services: bool = False, manual: bool = False
410445
) -> bool:
@@ -495,8 +530,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
495530

496531
# Create new conversation with current model
497532
conversation_id = self.openbrowser.create_conversation(
498-
model=self.current_target.model if self.current_target else None,
499-
base_url=self.current_target.base_url if self.current_target else None,
533+
model_alias=self.current_target.alias if self.current_target else None,
500534
)
501535
if conversation_id:
502536
logger.debug(f"Created conversation: {conversation_id}")
@@ -1302,7 +1336,7 @@ def run_all(
13021336
return False
13031337

13041338
if targets is None or len(targets) == 0:
1305-
logger.error("No LLM targets provided")
1339+
logger.error("No model aliases provided")
13061340
return False
13071341

13081342
# Create timestamped output directory
@@ -1323,15 +1357,13 @@ def run_all(
13231357

13241358
for target in targets:
13251359
logger.info(f"\n{'=' * 60}")
1326-
logger.info(f"Testing target: {target.name}")
1360+
logger.info(
1361+
f"Testing target alias: {target.alias} -> model: {target.model_name}"
1362+
)
13271363
logger.info(f"{'=' * 60}")
13281364

1329-
if not self.openbrowser.configure_llm(target):
1330-
logger.error(f"Failed to configure LLM target: {target.name}")
1331-
return False
1332-
13331365
self.current_target = target
1334-
self.current_model = target.name
1366+
self.current_model = target.model_name or target.name
13351367

13361368
# Clear results for this model
13371369
self.results = []
@@ -1353,7 +1385,7 @@ def run_all(
13531385

13541386
# Add model information to results and store for summary
13551387
for result in self.results:
1356-
result.model = target.name
1388+
result.model = target.model_name or target.name
13571389
all_results.extend(self.results)
13581390

13591391
# Generate cross-model summary report if we tested multiple models
@@ -1751,27 +1783,21 @@ def _generate_json_report(
17511783
return None
17521784

17531785

1754-
def _build_llm_targets(
1755-
llm_models: List[str], llm_base_urls: List[str], llm_api_keys: List[str]
1756-
) -> List[LLMTarget]:
1757-
"""Build explicit LLM targets from validated CLI lists."""
1786+
def _build_llm_targets(model_aliases: List[str]) -> List[LLMTarget]:
1787+
"""Build explicit LLM targets from validated alias list."""
17581788
targets: List[LLMTarget] = []
17591789
seen_labels: dict[str, int] = {}
17601790

1761-
for model, base_url, api_key in zip(llm_models, llm_base_urls, llm_api_keys):
1762-
parsed = urlparse(base_url)
1763-
host = parsed.netloc or base_url
1764-
base_label = f"{model} @ {host}"
1765-
count = seen_labels.get(base_label, 0) + 1
1766-
seen_labels[base_label] = count
1767-
label = base_label if count == 1 else f"{base_label} #{count}"
1791+
for alias in model_aliases:
1792+
normalized_alias = alias.strip()
1793+
count = seen_labels.get(normalized_alias, 0) + 1
1794+
seen_labels[normalized_alias] = count
1795+
label = normalized_alias if count == 1 else f"{normalized_alias} #{count}"
17681796

17691797
targets.append(
17701798
LLMTarget(
17711799
name=label,
1772-
model=model,
1773-
base_url=base_url,
1774-
api_key=api_key,
1800+
alias=normalized_alias,
17751801
)
17761802
)
17771803

@@ -1787,10 +1813,10 @@ def main():
17871813
" python eval/evaluate_browser_agent.py --list\n"
17881814
" python eval/evaluate_browser_agent.py --manual --test techforum\n"
17891815
" python eval/evaluate_browser_agent.py --test techforum --chrome-uuid YOUR_BROWSER_UUID \\\n"
1790-
" --llm-model dashscope/qwen3.5-plus --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key YOUR_KEY\n"
1816+
" --model-alias default\n"
17911817
" OPENBROWSER_CHROME_UUID=YOUR_BROWSER_UUID python eval/evaluate_browser_agent.py \\\n"
1792-
" --llm-model dashscope/qwen3.5-plus --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key PLUS_KEY \\\n"
1793-
" --llm-model dashscope/qwen3.5-flash --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key FLASH_KEY"
1818+
" --model-alias plus \\\n"
1819+
" --model-alias flash"
17941820
),
17951821
)
17961822
parser.add_argument("--test", help="Run specific test by ID")
@@ -1801,19 +1827,9 @@ def main():
18011827
)
18021828
parser.add_argument("--list", action="store_true", help="List available tests")
18031829
parser.add_argument(
1804-
"--llm-model",
1805-
action="append",
1806-
help="LLM model name. Must be passed together with matching --llm-base-url and --llm-api-key.",
1807-
)
1808-
parser.add_argument(
1809-
"--llm-base-url",
1810-
action="append",
1811-
help="LLM base URL. Must be passed together with matching --llm-model and --llm-api-key.",
1812-
)
1813-
parser.add_argument(
1814-
"--llm-api-key",
1830+
"--model-alias",
18151831
action="append",
1816-
help="LLM API key. Must be passed together with matching --llm-model and --llm-base-url.",
1832+
help="Configured LLM alias to evaluate. Can be passed multiple times.",
18171833
)
18181834
parser.add_argument(
18191835
"--no-services", action="store_true", help="Don't start services"
@@ -1841,31 +1857,20 @@ def main():
18411857
level=log_level, format="%(asctime)s - %(levelname)s - %(message)s"
18421858
)
18431859

1844-
llm_models = args.llm_model or []
1845-
llm_base_urls = args.llm_base_url or []
1846-
llm_api_keys = args.llm_api_key or []
1847-
llm_args_provided = any([llm_models, llm_base_urls, llm_api_keys])
1860+
model_aliases = args.model_alias or []
18481861
llm_targets: List[LLMTarget] = []
18491862

18501863
if not args.manual and not args.list:
1851-
if not llm_args_provided:
1852-
parser.error(
1853-
"Automated evaluation requires at least one full LLM triple: "
1854-
"--llm-model, --llm-base-url, and --llm-api-key"
1855-
)
1856-
1857-
if not (llm_models and llm_base_urls and llm_api_keys):
1858-
parser.error(
1859-
"--llm-model, --llm-base-url, and --llm-api-key must all be provided together"
1860-
)
1861-
1862-
if not (len(llm_models) == len(llm_base_urls) == len(llm_api_keys)):
1864+
if not model_aliases:
18631865
parser.error(
1864-
"--llm-model, --llm-base-url, and --llm-api-key must have the same number of values"
1866+
"Automated evaluation requires at least one configured model alias: "
1867+
"--model-alias"
18651868
)
18661869

1867-
llm_targets = _build_llm_targets(llm_models, llm_base_urls, llm_api_keys)
1868-
logger.info(f"LLM targets to test: {[target.name for target in llm_targets]}")
1870+
llm_targets = _build_llm_targets(model_aliases)
1871+
logger.info(
1872+
f"Model aliases to test: {[target.alias for target in llm_targets]}"
1873+
)
18691874

18701875
if not args.manual and not args.list and not args.chrome_uuid:
18711876
parser.error(
@@ -1901,6 +1906,13 @@ def main():
19011906
logger.error("Services unavailable")
19021907
return
19031908

1909+
if not args.manual:
1910+
try:
1911+
llm_targets = evaluator.resolve_targets(llm_targets)
1912+
except ValueError as e:
1913+
logger.error(str(e))
1914+
sys.exit(1)
1915+
19041916
# Create output directory for single test
19051917
timestamp = time.strftime("%Y%m%d_%H%M%S")
19061918
evaluator.output_dir = OUTPUT_BASE_DIR / timestamp
@@ -1945,24 +1957,25 @@ def main():
19451957
# Normal (automated) mode
19461958
else:
19471959
all_results = []
1948-
target_names = [target.name for target in llm_targets]
1960+
target_names = [target.model_name or target.name for target in llm_targets]
19491961
for target in llm_targets:
19501962
logger.info(f"\n{'=' * 60}")
1951-
logger.info(f"Testing target: {target.name}")
1963+
logger.info(
1964+
f"Testing target alias: {target.alias} -> model: {target.model_name}"
1965+
)
19521966
logger.info(f"{'=' * 60}")
19531967

1954-
if not evaluator.openbrowser.configure_llm(target):
1955-
logger.error(f"Failed to configure target: {target.name}")
1956-
sys.exit(1)
1957-
19581968
evaluator.current_target = target
1959-
evaluator.current_model = target.name
1969+
evaluator.current_model = target.model_name or target.name
19601970

19611971
result = evaluator.run_test(test_case)
1962-
result.model = target.name
1972+
result.model = target.model_name or target.name
19631973
all_results.append(result)
19641974

1965-
print(f"\nTest result for {test_case.name} (target: {target.name}):")
1975+
print(
1976+
f"\nTest result for {test_case.name} "
1977+
f"(alias: {target.alias}, model: {target.model_name}):"
1978+
)
19661979
print(f" Status: {'PASS' if result.passed else 'FAIL'}")
19671980
print(f" Task score: {result.score:.1f}/{result.max_score:.1f}")
19681981
print(f" Efficiency score: {result.efficiency_score or 0:.2f}/1.0")
@@ -2019,6 +2032,12 @@ def main():
20192032
sys.exit(1)
20202033
else:
20212034
# Normal automated mode
2035+
try:
2036+
llm_targets = evaluator.resolve_targets(llm_targets)
2037+
except ValueError as e:
2038+
logger.error(str(e))
2039+
sys.exit(1)
2040+
20222041
success = evaluator.run_all(
20232042
targets=llm_targets, skip_services=args.no_services, manual=False
20242043
)

0 commit comments

Comments
 (0)