38 changes: 38 additions & 0 deletions .github/run-eval/resolve_model_config.py

@@ -50,6 +50,44 @@
         "display_name": "Kimi K2 Thinking",
         "llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
     },
+    "claude-4.5-opus": {
+        "id": "claude-4.5-opus",
+        "display_name": "Claude 4.5 Opus",
+        "llm_config": {
+            "model": "litellm_proxy/anthropic/claude-opus-4-5-20251101",
+            "temperature": 0.0,
+        },
+    },
+    "gemini-3-pro": {
+        "id": "gemini-3-pro",
+        "display_name": "Gemini 3 Pro",
+        "llm_config": {"model": "litellm_proxy/gemini/gemini-3-pro-preview"},
+    },
+    "gemini-3-flash": {
+        "id": "gemini-3-flash",
+        "display_name": "Gemini 3 Flash",
+        "llm_config": {"model": "litellm_proxy/gemini/gemini-3-flash-preview"},
+    },
+    "gpt-5.2": {
+        "id": "gpt-5.2",
+        "display_name": "GPT-5.2",
+        "llm_config": {"model": "litellm_proxy/openai/gpt-5.2-2025-12-11"},
+    },
+    "minimax-m2": {
+        "id": "minimax-m2",
+        "display_name": "MiniMax M2",
+        "llm_config": {"model": "litellm_proxy/minimax/minimax-m2"},
+    },
+    "deepseek-v3.2-reasoner": {
+        "id": "deepseek-v3.2-reasoner",
+        "display_name": "DeepSeek V3.2 Reasoner",
+        "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v3.2"},
+    },
+    "qwen-3-coder": {
+        "id": "qwen-3-coder",
+        "display_name": "Qwen 3 Coder",
+        "llm_config": {"model": "litellm_proxy/qwen/qwen3-coder"},
+    },
 }


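The resolver function exercised by the test file below is not itself part of this diff. For orientation, here is a minimal sketch of find_models_by_id consistent with the behavior the tests pin down — input order preserved, full config dicts returned, and the process exiting on an unknown ID; the error message and exit code are assumptions, not taken from the PR:

import sys

def find_models_by_id(model_ids: list[str]) -> list[dict]:
    """Resolve each requested ID against the MODELS registry, preserving input order."""
    resolved = []
    for model_id in model_ids:
        if model_id not in MODELS:
            # The tests expect an unknown ID to terminate the script;
            # the message and exit code here are assumed for illustration.
            print(f"Unknown model id: {model_id}", file=sys.stderr)
            sys.exit(1)
        resolved.append(MODELS[model_id])
    return resolved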
73 changes: 64 additions & 9 deletions tests/github_workflows/test_resolve_model_config.py

@@ -4,11 +4,14 @@
 from pathlib import Path
 from unittest.mock import patch

+import pytest
+

 # Import the functions from resolve_model_config.py
 run_eval_path = Path(__file__).parent.parent.parent / ".github" / "run-eval"
 sys.path.append(str(run_eval_path))
 from resolve_model_config import (  # noqa: E402 # type: ignore[import-not-found]
+    MODELS,
     find_models_by_id,
 )

@@ -25,8 +28,8 @@ def test_find_models_by_id_single_model():
     result = find_models_by_id(model_ids)

     assert len(result) == 1
-    assert result[0]["id"] == "gpt-4"
-    assert result[0]["display_name"] == "GPT-4"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert result[0]["display_name"] == "Claude Sonnet 4.5"


 def test_find_models_by_id_multiple_models():
@@ -42,8 +45,8 @@ def test_find_models_by_id_multiple_models():
     result = find_models_by_id(model_ids)

     assert len(result) == 2
-    assert result[0]["id"] == "gpt-4"
-    assert result[1]["id"] == "claude-3"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert result[1]["id"] == "deepseek-chat"


 def test_find_models_by_id_preserves_order():
@@ -59,12 +62,11 @@ def test_find_models_by_id_preserves_order():
     result = find_models_by_id(model_ids)

     assert len(result) == 3
-    assert [m["id"] for m in result] == ["c", "a", "b"]
+    assert [m["id"] for m in result] == model_ids


 def test_find_models_by_id_missing_model_exits():
     """Test that missing model ID causes exit."""
-    import pytest

     mock_models = {
         "gpt-4": {"id": "gpt-4", "display_name": "GPT-4", "llm_config": {}},
@@ -111,6 +113,59 @@ def test_find_models_by_id_preserves_full_config():
     result = find_models_by_id(model_ids)

     assert len(result) == 1
-    assert result[0]["id"] == "custom-model"
-    assert result[0]["llm_config"]["model"] == "custom-model"
-    assert result[0]["extra_field"] == "should be preserved"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert (
+        result[0]["llm_config"]["model"] == "litellm_proxy/claude-sonnet-4-5-20250929"
+    )
+    assert result[0]["llm_config"]["temperature"] == 0.0
+
+
+# Tests for expected models from issue #1495
+# Note: claude-4.5-sonnet is implemented as claude-sonnet-4-5-20250929 (pinned version)
+EXPECTED_MODELS = [
+    "claude-4.5-opus",
+    "claude-sonnet-4-5-20250929",
+    "gemini-3-pro",
+    "gemini-3-flash",
+    "gpt-5.2",
+    "kimi-k2-thinking",
+    "minimax-m2",
+    "deepseek-v3.2-reasoner",
+    "qwen-3-coder",
+]
+
+
+def test_all_expected_models_present():
+    """Test that all expected models from issue #1495 are present."""
+    for model_id in EXPECTED_MODELS:
+        assert model_id in MODELS, f"Model '{model_id}' is missing from MODELS"
+
+
+def test_expected_models_have_required_fields():
+    """Test that all expected models have required fields."""
+    for model_id in EXPECTED_MODELS:
+        model = MODELS[model_id]
+        assert "id" in model, f"Model '{model_id}' missing 'id' field"
+        assert "display_name" in model, f"Model '{model_id}' missing 'display_name'"
+        assert "llm_config" in model, f"Model '{model_id}' missing 'llm_config'"
+        assert "model" in model["llm_config"], (
+            f"Model '{model_id}' missing 'model' in llm_config"
+        )
+
+
+def test_expected_models_id_matches_key():
+    """Test that model id field matches the dictionary key."""
+    for model_id in EXPECTED_MODELS:
+        model = MODELS[model_id]
+        assert model["id"] == model_id, (
+            f"Model key '{model_id}' doesn't match id field '{model['id']}'"
+        )
+
+
+def test_find_all_expected_models():
+    """Test that find_models_by_id works for all expected models."""
+    result = find_models_by_id(EXPECTED_MODELS)
+
+    assert len(result) == len(EXPECTED_MODELS)
+    for i, model_id in enumerate(EXPECTED_MODELS):
+        assert result[i]["id"] == model_id
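For context, a quick usage sketch of the registry as the tests exercise it — IDs in, resolved configs out. The model IDs and litellm strings below are taken from the diff above; the printing is illustrative only:

# Hypothetical usage, assuming find_models_by_id is importable as in the tests.
selected = find_models_by_id(["gpt-5.2", "qwen-3-coder"])
for model in selected:
    print(model["display_name"], "->", model["llm_config"]["model"])
# GPT-5.2 -> litellm_proxy/openai/gpt-5.2-2025-12-11
# Qwen 3 Coder -> litellm_proxy/qwen/qwen3-coder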