38 changes: 38 additions & 0 deletions .github/run-eval/resolve_model_config.py

@@ -50,6 +50,44 @@
         "display_name": "Kimi K2 Thinking",
         "llm_config": {"model": "litellm_proxy/moonshot/kimi-k2-thinking"},
     },
+    "claude-4.5-opus": {
+        "id": "claude-4.5-opus",
+        "display_name": "Claude 4.5 Opus",
+        "llm_config": {
+            "model": "litellm_proxy/anthropic/claude-opus-4-5-20251101",
+            "temperature": 0.0,
+        },
+    },
+    "gemini-3-pro": {
+        "id": "gemini-3-pro",
+        "display_name": "Gemini 3 Pro",
+        "llm_config": {"model": "litellm_proxy/gemini/gemini-3-pro-preview"},
+    },
+    "gemini-3-flash": {
+        "id": "gemini-3-flash",
+        "display_name": "Gemini 3 Flash",
+        "llm_config": {"model": "litellm_proxy/gemini/gemini-3-flash-preview"},
+    },
+    "gpt-5.2": {
+        "id": "gpt-5.2",
+        "display_name": "GPT-5.2",
+        "llm_config": {"model": "litellm_proxy/openai/gpt-5.2-2025-12-11"},
+    },
+    "minimax-m2": {
+        "id": "minimax-m2",
+        "display_name": "MiniMax M2",
+        "llm_config": {"model": "litellm_proxy/minimax/minimax-m2"},
+    },
+    "deepseek-v3.2-reasoner": {
+        "id": "deepseek-v3.2-reasoner",
+        "display_name": "DeepSeek V3.2 Reasoner",
+        "llm_config": {"model": "litellm_proxy/deepseek/deepseek-v3.2"},
+    },
+    "qwen-3-coder": {
+        "id": "qwen-3-coder",
+        "display_name": "Qwen 3 Coder",
+        "llm_config": {"model": "litellm_proxy/qwen/qwen3-coder"},
+    },
 }


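The resolver function exercised by the test file below is not itself part of this diff. For orientation, here is a minimal sketch of find_models_by_id consistent with the behavior the tests pin down — input order preserved, full config dicts returned, and the process exiting on an unknown ID; the error message and exit code are assumptions, not taken from the PR:

import sys

def find_models_by_id(model_ids: list[str]) -> list[dict]:
    """Resolve each requested ID against the MODELS registry, preserving input order."""
    resolved = []
    for model_id in model_ids:
        if model_id not in MODELS:
            # The tests expect an unknown ID to terminate the script;
            # the message and exit code here are assumed for illustration.
            print(f"Unknown model id: {model_id}", file=sys.stderr)
            sys.exit(1)
        resolved.append(MODELS[model_id])
    return resolved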
73 changes: 64 additions & 9 deletions tests/github_workflows/test_resolve_model_config.py

@@ -4,11 +4,14 @@
 from pathlib import Path
 from unittest.mock import patch

+import pytest
+

 # Import the functions from resolve_model_config.py
 run_eval_path = Path(__file__).parent.parent.parent / ".github" / "run-eval"
 sys.path.append(str(run_eval_path))
 from resolve_model_config import (  # noqa: E402 # type: ignore[import-not-found]
+    MODELS,
     find_models_by_id,
 )

@@ -25,8 +28,8 @@ def test_find_models_by_id_single_model():
     result = find_models_by_id(model_ids)

     assert len(result) == 1
-    assert result[0]["id"] == "gpt-4"
-    assert result[0]["display_name"] == "GPT-4"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert result[0]["display_name"] == "Claude Sonnet 4.5"


 def test_find_models_by_id_multiple_models():
@@ -42,8 +45,8 @@ def test_find_models_by_id_multiple_models():
     result = find_models_by_id(model_ids)

     assert len(result) == 2
-    assert result[0]["id"] == "gpt-4"
-    assert result[1]["id"] == "claude-3"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert result[1]["id"] == "deepseek-chat"


 def test_find_models_by_id_preserves_order():
@@ -59,12 +62,11 @@ def test_find_models_by_id_preserves_order():
     result = find_models_by_id(model_ids)

     assert len(result) == 3
-    assert [m["id"] for m in result] == ["c", "a", "b"]
+    assert [m["id"] for m in result] == model_ids


 def test_find_models_by_id_missing_model_exits():
     """Test that missing model ID causes exit."""
-    import pytest

     mock_models = {
         "gpt-4": {"id": "gpt-4", "display_name": "GPT-4", "llm_config": {}},
@@ -111,6 +113,59 @@ def test_find_models_by_id_preserves_full_config():
     result = find_models_by_id(model_ids)

     assert len(result) == 1
-    assert result[0]["id"] == "custom-model"
-    assert result[0]["llm_config"]["model"] == "custom-model"
-    assert result[0]["extra_field"] == "should be preserved"
+    assert result[0]["id"] == "claude-sonnet-4-5-20250929"
+    assert (
+        result[0]["llm_config"]["model"] == "litellm_proxy/claude-sonnet-4-5-20250929"
+    )
+    assert result[0]["llm_config"]["temperature"] == 0.0
+
+
+# Tests for expected models from issue #1495
+# Note: claude-4.5-sonnet is implemented as claude-sonnet-4-5-20250929 (pinned version)
+EXPECTED_MODELS = [
+    "claude-4.5-opus",
+    "claude-sonnet-4-5-20250929",
+    "gemini-3-pro",
+    "gemini-3-flash",
+    "gpt-5.2",
+    "kimi-k2-thinking",
+    "minimax-m2",
+    "deepseek-v3.2-reasoner",
+    "qwen-3-coder",
+]
+
+
+def test_all_expected_models_present():
+    """Test that all expected models from issue #1495 are present."""
+    for model_id in EXPECTED_MODELS:
+        assert model_id in MODELS, f"Model '{model_id}' is missing from MODELS"
+
+
+def test_expected_models_have_required_fields():
+    """Test that all expected models have required fields."""
+    for model_id in EXPECTED_MODELS:
+        model = MODELS[model_id]
+        assert "id" in model, f"Model '{model_id}' missing 'id' field"
+        assert "display_name" in model, f"Model '{model_id}' missing 'display_name'"
+        assert "llm_config" in model, f"Model '{model_id}' missing 'llm_config'"
+        assert "model" in model["llm_config"], (
+            f"Model '{model_id}' missing 'model' in llm_config"
+        )
+
+
+def test_expected_models_id_matches_key():
+    """Test that model id field matches the dictionary key."""
+    for model_id in EXPECTED_MODELS:
+        model = MODELS[model_id]
+        assert model["id"] == model_id, (
+            f"Model key '{model_id}' doesn't match id field '{model['id']}'"
+        )
+
+
+def test_find_all_expected_models():
+    """Test that find_models_by_id works for all expected models."""
+    result = find_models_by_id(EXPECTED_MODELS)
+
+    assert len(result) == len(EXPECTED_MODELS)
+    for i, model_id in enumerate(EXPECTED_MODELS):
+        assert result[i]["id"] == model_id
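For context, a quick usage sketch of the registry as the tests exercise it — IDs in, resolved configs out. The model IDs and litellm strings below are taken from the diff above; the printing is illustrative only:

# Hypothetical usage, assuming find_models_by_id is importable as in the tests.
selected = find_models_by_id(["gpt-5.2", "qwen-3-coder"])
for model in selected:
    print(model["display_name"], "->", model["llm_config"]["model"])
# GPT-5.2 -> litellm_proxy/openai/gpt-5.2-2025-12-11
# Qwen 3 Coder -> litellm_proxy/qwen/qwen3-coder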