2424import atexit
2525import logging
2626import datetime
27- from urllib .parse import urlparse
2827
2928logger = logging .getLogger (__name__ )
3029
@@ -88,12 +87,11 @@ class TestResult:
8887
@dataclass
class LLMTarget:
    """One configured LLM alias passed from the CLI."""

    # Display label for the target; callers fall back to it when
    # ``model_name`` is unset (``target.model_name or target.name``).
    name: str
    # Alias of an LLM entry configured on the server; it is sent as
    # ``model_alias`` when a conversation is created.
    alias: str
    # Raw model name the alias resolves to. Stays ``None`` until the alias
    # has been resolved against the server's LLM configs.
    # ``Optional[str]`` (rather than ``str | None``) matches the typing style
    # used elsewhere in this file and does not require Python 3.10+.
    model_name: Optional[str] = None
9795
9896
9997class OpenBrowserClient :
@@ -114,38 +112,41 @@ def health_check(self) -> bool:
114112 except requests .exceptions .RequestException :
115113 return False
116114
def get_llm_configs(self) -> List[Dict[str, Any]]:
    """Return the LLM entries configured on the server.

    Issues ``GET {base_url}/api/config`` with a 5 second timeout and reads
    the ``config.llm_configs`` field of the JSON response.

    Returns:
        The ``llm_configs`` list from the response. An empty list is
        returned when the status code is not 200, when ``llm_configs`` is
        not a list, or when the request or parsing raises.
    """
    try:
        reply = self.session.get(f"{self.base_url}/api/config", timeout=5)
        if reply.status_code == 200:
            payload = reply.json()
            entries = payload.get("config", {}).get("llm_configs", [])
            if isinstance(entries, list):
                return entries
        # Non-200 response or unexpected payload shape: report no configs.
        return []
    except Exception as e:
        # Best-effort lookup: any failure is logged and treated as "none".
        logger.error(f"Failed to fetch LLM configs: {e}")
        return []
133128
134129 def create_conversation (
135- self , model : Optional [str ] = None , base_url : Optional [str ] = None
130+ self ,
131+ model : Optional [str ] = None ,
132+ base_url : Optional [str ] = None ,
133+ model_alias : Optional [str ] = None ,
136134 ) -> Optional [str ]:
137135 """Create a new conversation and return its ID
138136
139137 Args:
140138 model: Optional model name (e.g., "dashscope/qwen3.5-plus")
141139 base_url: Optional base URL override
140+ model_alias: Optional configured model alias
142141 """
143142 try :
144143 request_json = {}
145144 if model :
146145 request_json ["model" ] = model
147146 if base_url :
148147 request_json ["base_url" ] = base_url
148+ if model_alias :
149+ request_json ["model_alias" ] = model_alias
149150 if self .chrome_uuid :
150151 request_json ["browser_id" ] = self .chrome_uuid
151152
@@ -405,6 +406,40 @@ def __init__(self, chrome_uuid: Optional[str] = None):
405406 self .current_model : Optional [str ] = None # Current model being tested
406407 self .current_target : Optional [LLMTarget ] = None # Current CLI target
407408
def resolve_targets(self, targets: List[LLMTarget]) -> List[LLMTarget]:
    """Map each target's alias to the raw model name configured on the server.

    Args:
        targets: Targets whose ``alias`` should be looked up.

    Returns:
        New ``LLMTarget`` objects, one per input target and in the same
        order, with both ``name`` and ``model_name`` set to the resolved
        model name.

    Raises:
        ValueError: If any alias has no non-empty string model in the
            server's LLM configs. The message lists every such alias.
    """
    # Build alias -> model from the server's entries, ignoring entries that
    # are not dicts or lack a truthy alias/model. Later entries win.
    model_by_alias = {}
    for entry in self.openbrowser.get_llm_configs():
        if not isinstance(entry, dict):
            continue
        if entry.get("alias") and entry.get("model"):
            model_by_alias[entry.get("alias")] = entry.get("model")

    resolved: List[LLMTarget] = []
    unknown: List[str] = []

    for requested in targets:
        resolved_model = model_by_alias.get(requested.alias)
        if isinstance(resolved_model, str) and resolved_model:
            resolved.append(
                LLMTarget(
                    name=resolved_model,
                    alias=requested.alias,
                    model_name=resolved_model,
                )
            )
        else:
            unknown.append(requested.alias)

    # Report all unresolved aliases at once instead of failing on the first.
    if unknown:
        raise ValueError(
            f"Unknown model alias(es): {', '.join(unknown)}"
            ". Configure them first in the OpenBrowser frontend."
        )

    return resolved
442+
408443 def ensure_services (
409444 self , skip_services : bool = False , manual : bool = False
410445 ) -> bool :
@@ -495,8 +530,7 @@ def run_test(self, test_case: TestCase) -> TestResult:
495530
496531 # Create new conversation with current model
497532 conversation_id = self .openbrowser .create_conversation (
498- model = self .current_target .model if self .current_target else None ,
499- base_url = self .current_target .base_url if self .current_target else None ,
533+ model_alias = self .current_target .alias if self .current_target else None ,
500534 )
501535 if conversation_id :
502536 logger .debug (f"Created conversation: { conversation_id } " )
@@ -1302,7 +1336,7 @@ def run_all(
13021336 return False
13031337
13041338 if targets is None or len (targets ) == 0 :
1305- logger .error ("No LLM targets provided" )
1339+ logger .error ("No model aliases provided" )
13061340 return False
13071341
13081342 # Create timestamped output directory
@@ -1323,15 +1357,13 @@ def run_all(
13231357
13241358 for target in targets :
13251359 logger .info (f"\n { '=' * 60 } " )
1326- logger .info (f"Testing target: { target .name } " )
1360+ logger .info (
1361+ f"Testing target alias: { target .alias } -> model: { target .model_name } "
1362+ )
13271363 logger .info (f"{ '=' * 60 } " )
13281364
1329- if not self .openbrowser .configure_llm (target ):
1330- logger .error (f"Failed to configure LLM target: { target .name } " )
1331- return False
1332-
13331365 self .current_target = target
1334- self .current_model = target .name
1366+ self .current_model = target .model_name or target . name
13351367
13361368 # Clear results for this model
13371369 self .results = []
@@ -1353,7 +1385,7 @@ def run_all(
13531385
13541386 # Add model information to results and store for summary
13551387 for result in self .results :
1356- result .model = target .name
1388+ result .model = target .model_name or target . name
13571389 all_results .extend (self .results )
13581390
13591391 # Generate cross-model summary report if we tested multiple models
@@ -1751,27 +1783,21 @@ def _generate_json_report(
17511783 return None
17521784
17531785
1754- def _build_llm_targets (
1755- llm_models : List [str ], llm_base_urls : List [str ], llm_api_keys : List [str ]
1756- ) -> List [LLMTarget ]:
1757- """Build explicit LLM targets from validated CLI lists."""
1786+ def _build_llm_targets (model_aliases : List [str ]) -> List [LLMTarget ]:
1787+ """Build explicit LLM targets from validated alias list."""
17581788 targets : List [LLMTarget ] = []
17591789 seen_labels : dict [str , int ] = {}
17601790
1761- for model , base_url , api_key in zip (llm_models , llm_base_urls , llm_api_keys ):
1762- parsed = urlparse (base_url )
1763- host = parsed .netloc or base_url
1764- base_label = f"{ model } @ { host } "
1765- count = seen_labels .get (base_label , 0 ) + 1
1766- seen_labels [base_label ] = count
1767- label = base_label if count == 1 else f"{ base_label } #{ count } "
1791+ for alias in model_aliases :
1792+ normalized_alias = alias .strip ()
1793+ count = seen_labels .get (normalized_alias , 0 ) + 1
1794+ seen_labels [normalized_alias ] = count
1795+ label = normalized_alias if count == 1 else f"{ normalized_alias } #{ count } "
17681796
17691797 targets .append (
17701798 LLMTarget (
17711799 name = label ,
1772- model = model ,
1773- base_url = base_url ,
1774- api_key = api_key ,
1800+ alias = normalized_alias ,
17751801 )
17761802 )
17771803
@@ -1787,10 +1813,10 @@ def main():
17871813 " python eval/evaluate_browser_agent.py --list\n "
17881814 " python eval/evaluate_browser_agent.py --manual --test techforum\n "
17891815 " python eval/evaluate_browser_agent.py --test techforum --chrome-uuid YOUR_BROWSER_UUID \\ \n "
1790- " --llm- model dashscope/qwen3.5-plus --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key YOUR_KEY \n "
1816+ " --model-alias default \n "
17911817 " OPENBROWSER_CHROME_UUID=YOUR_BROWSER_UUID python eval/evaluate_browser_agent.py \\ \n "
1792- " --llm- model dashscope/qwen3.5-plus --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key PLUS_KEY \\ \n "
1793- " --llm- model dashscope/qwen3.5-flash --llm-base-url https://dashscope.aliyuncs.com/compatible-mode/v1 --llm-api-key FLASH_KEY "
1818+ " --model-alias plus \\ \n "
1819+ " --model-alias flash "
17941820 ),
17951821 )
17961822 parser .add_argument ("--test" , help = "Run specific test by ID" )
@@ -1801,19 +1827,9 @@ def main():
18011827 )
18021828 parser .add_argument ("--list" , action = "store_true" , help = "List available tests" )
18031829 parser .add_argument (
1804- "--llm-model" ,
1805- action = "append" ,
1806- help = "LLM model name. Must be passed together with matching --llm-base-url and --llm-api-key." ,
1807- )
1808- parser .add_argument (
1809- "--llm-base-url" ,
1810- action = "append" ,
1811- help = "LLM base URL. Must be passed together with matching --llm-model and --llm-api-key." ,
1812- )
1813- parser .add_argument (
1814- "--llm-api-key" ,
1830+ "--model-alias" ,
18151831 action = "append" ,
1816- help = "LLM API key. Must be passed together with matching --llm-model and --llm-base-url ." ,
1832+ help = "Configured LLM alias to evaluate. Can be passed multiple times ." ,
18171833 )
18181834 parser .add_argument (
18191835 "--no-services" , action = "store_true" , help = "Don't start services"
@@ -1841,31 +1857,20 @@ def main():
18411857 level = log_level , format = "%(asctime)s - %(levelname)s - %(message)s"
18421858 )
18431859
1844- llm_models = args .llm_model or []
1845- llm_base_urls = args .llm_base_url or []
1846- llm_api_keys = args .llm_api_key or []
1847- llm_args_provided = any ([llm_models , llm_base_urls , llm_api_keys ])
1860+ model_aliases = args .model_alias or []
18481861 llm_targets : List [LLMTarget ] = []
18491862
18501863 if not args .manual and not args .list :
1851- if not llm_args_provided :
1852- parser .error (
1853- "Automated evaluation requires at least one full LLM triple: "
1854- "--llm-model, --llm-base-url, and --llm-api-key"
1855- )
1856-
1857- if not (llm_models and llm_base_urls and llm_api_keys ):
1858- parser .error (
1859- "--llm-model, --llm-base-url, and --llm-api-key must all be provided together"
1860- )
1861-
1862- if not (len (llm_models ) == len (llm_base_urls ) == len (llm_api_keys )):
1864+ if not model_aliases :
18631865 parser .error (
1864- "--llm-model, --llm-base-url, and --llm-api-key must have the same number of values"
1866+ "Automated evaluation requires at least one configured model alias: "
1867+ "--model-alias"
18651868 )
18661869
1867- llm_targets = _build_llm_targets (llm_models , llm_base_urls , llm_api_keys )
1868- logger .info (f"LLM targets to test: { [target .name for target in llm_targets ]} " )
1870+ llm_targets = _build_llm_targets (model_aliases )
1871+ logger .info (
1872+ f"Model aliases to test: { [target .alias for target in llm_targets ]} "
1873+ )
18691874
18701875 if not args .manual and not args .list and not args .chrome_uuid :
18711876 parser .error (
@@ -1901,6 +1906,13 @@ def main():
19011906 logger .error ("Services unavailable" )
19021907 return
19031908
1909+ if not args .manual :
1910+ try :
1911+ llm_targets = evaluator .resolve_targets (llm_targets )
1912+ except ValueError as e :
1913+ logger .error (str (e ))
1914+ sys .exit (1 )
1915+
19041916 # Create output directory for single test
19051917 timestamp = time .strftime ("%Y%m%d_%H%M%S" )
19061918 evaluator .output_dir = OUTPUT_BASE_DIR / timestamp
@@ -1945,24 +1957,25 @@ def main():
19451957 # Normal (automated) mode
19461958 else :
19471959 all_results = []
1948- target_names = [target .name for target in llm_targets ]
1960+ target_names = [target .model_name or target . name for target in llm_targets ]
19491961 for target in llm_targets :
19501962 logger .info (f"\n { '=' * 60 } " )
1951- logger .info (f"Testing target: { target .name } " )
1963+ logger .info (
1964+ f"Testing target alias: { target .alias } -> model: { target .model_name } "
1965+ )
19521966 logger .info (f"{ '=' * 60 } " )
19531967
1954- if not evaluator .openbrowser .configure_llm (target ):
1955- logger .error (f"Failed to configure target: { target .name } " )
1956- sys .exit (1 )
1957-
19581968 evaluator .current_target = target
1959- evaluator .current_model = target .name
1969+ evaluator .current_model = target .model_name or target . name
19601970
19611971 result = evaluator .run_test (test_case )
1962- result .model = target .name
1972+ result .model = target .model_name or target . name
19631973 all_results .append (result )
19641974
1965- print (f"\n Test result for { test_case .name } (target: { target .name } ):" )
1975+ print (
1976+ f"\n Test result for { test_case .name } "
1977+ f"(alias: { target .alias } , model: { target .model_name } ):"
1978+ )
19661979 print (f" Status: { 'PASS' if result .passed else 'FAIL' } " )
19671980 print (f" Task score: { result .score :.1f} /{ result .max_score :.1f} " )
19681981 print (f" Efficiency score: { result .efficiency_score or 0 :.2f} /1.0" )
@@ -2019,6 +2032,12 @@ def main():
20192032 sys .exit (1 )
20202033 else :
20212034 # Normal automated mode
2035+ try :
2036+ llm_targets = evaluator .resolve_targets (llm_targets )
2037+ except ValueError as e :
2038+ logger .error (str (e ))
2039+ sys .exit (1 )
2040+
20222041 success = evaluator .run_all (
20232042 targets = llm_targets , skip_services = args .no_services , manual = False
20242043 )
0 commit comments