code-yeongyu · code-yeongyu · Mar 11, 2026 · Mar 7, 2026 · Mar 7, 2026 · Mar 7, 2026
diff --git a/src/plugin-handlers/provider-config-handler.test.ts b/src/plugin-handlers/provider-config-handler.test.ts
@@ -0,0 +1,84 @@
+/// <reference types="bun-types" />
+
+import { afterEach, describe, expect, test } from "bun:test"
+import { applyProviderConfig } from "./provider-config-handler"
+import { createModelCacheState } from "../plugin-state"
+import { clearVisionCapableModelsCache, readVisionCapableModelsCache } from "../shared/vision-capable-models-cache"
+
+describe("applyProviderConfig", () => {
+  // applyProviderConfig points the module-level vision-capable-models cache at
+  // the Map owned by each test's modelCacheState. Reset that shared state after
+  // every test so it cannot leak into later tests or other test files. This has
+  // to be a hook: a bare top-level call executes while the file is being loaded,
+  // before the registered tests run, and therefore cleans up nothing.
+  afterEach(() => {
+    clearVisionCapableModelsCache()
+  })
+
+  test("caches vision-capable models from modalities and capabilities", () => {
+    // given
+    const modelCacheState = createModelCacheState()
+    const visionCapableModelsCache = modelCacheState.visionCapableModelsCache
+    if (!visionCapableModelsCache) {
+      throw new Error("visionCapableModelsCache should be initialized")
+    }
+    // Covers both declaration styles: `modalities.input` containing "image"
+    // and `capabilities.input.image === true`, plus one text-only model that
+    // must not be cached.
+    const config = {
+      provider: {
+        rundao: {
+          models: {
+            "public/qwen3.5-397b": {
+              modalities: {
+                input: ["text", "image"],
+              },
+            },
+            "public/text-only": {
+              modalities: {
+                input: ["text"],
+              },
+            },
+          },
+        },
+        google: {
+          models: {
+            "gemini-3-flash": {
+              capabilities: {
+                input: {
+                  image: true,
+                },
+              },
+            },
+          },
+        },
+      },
+    } satisfies Record<string, unknown>
+
+    // when
+    applyProviderConfig({ config, modelCacheState })
+
+    // then
+    // Both the per-state Map and the shared module-level cache must agree.
+    expect(Array.from(visionCapableModelsCache.keys())).toEqual([
+      "rundao/public/qwen3.5-397b",
+      "google/gemini-3-flash",
+    ])
+    expect(readVisionCapableModelsCache()).toEqual([
+      { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      { providerID: "google", modelID: "gemini-3-flash" },
+    ])
+  })
+
+  test("clears stale vision-capable models when provider config changes", () => {
+    // given
+    const modelCacheState = createModelCacheState()
+    const visionCapableModelsCache = modelCacheState.visionCapableModelsCache
+    if (!visionCapableModelsCache) {
+      throw new Error("visionCapableModelsCache should be initialized")
+    }
+    // Seed an entry that the new (empty) provider config no longer declares.
+    visionCapableModelsCache.set("stale/old-model", {
+      providerID: "stale",
+      modelID: "old-model",
+    })
+
+    // when
+    applyProviderConfig({
+      config: { provider: {} },
+      modelCacheState,
+    })
+
+    // then
+    expect(visionCapableModelsCache.size).toBe(0)
+    expect(readVisionCapableModelsCache()).toEqual([])
+  })
+})
diff --git a/src/plugin-handlers/provider-config-handler.ts b/src/plugin-handlers/provider-config-handler.ts
@@ -1,10 +1,31 @@
-import type { ModelCacheState } from "../plugin-state";
+import type { ModelCacheState, VisionCapableModel } from "../plugin-state";
+import { setVisionCapableModelsCache } from "../shared/vision-capable-models-cache"
 
+/**
+ * Shape of a single provider entry in the plugin config: optional request
+ * headers under `options.headers`, and an optional map of per-model settings
+ * keyed by model ID.
+ */
 type ProviderConfig = {
   options?: { headers?: Record<string, string> };
-  models?: Record<string, { limit?: { context?: number } }>;
+  models?: Record<string, ProviderModelConfig>;
 };
 
+/**
+ * Per-model settings read from a provider's `models` map.
+ *
+ * Image input support may be declared in either of two forms, both of which
+ * `supportsImageInput` checks: an `"image"` entry in `modalities.input`, or
+ * `capabilities.input.image` set to `true`.
+ */
+type ProviderModelConfig = {
+  // Read by applyProviderConfig as `modelConfig.limit.context`.
+  limit?: { context?: number };
+  // Input modality names, e.g. ["text", "image"].
+  modalities?: {
+    input?: string[];
+  };
+  // Flag-style capability declaration.
+  capabilities?: {
+    input?: {
+      image?: boolean;
+    };
+  };
+}
+
+/**
+ * Reports whether a provider model config declares image input support.
+ *
+ * A model counts as vision-capable when either
+ * - `modalities.input` is an array containing `"image"`, or
+ * - `capabilities.input.image` is exactly `true`.
+ *
+ * The surrounding config is typed `Record<string, unknown>` at the entry
+ * point, so the declared shape is not guaranteed at runtime. `modalities.input`
+ * is therefore checked with `Array.isArray` first: a string value would
+ * otherwise be substring-matched by `String.prototype.includes`, and a
+ * number or object would throw because it has no `includes` method.
+ *
+ * @param modelConfig - Model entry from the provider config; may be undefined.
+ * @returns `true` when image input is declared, otherwise `false`.
+ */
+function supportsImageInput(modelConfig: ProviderModelConfig | undefined): boolean {
+  const inputModalities: unknown = modelConfig?.modalities?.input
+  if (Array.isArray(inputModalities) && inputModalities.includes("image")) {
+    return true
+  }
+
+  return modelConfig?.capabilities?.input?.image === true
+}
+
 export function applyProviderConfig(params: {
   config: Record<string, unknown>;
   modelCacheState: ModelCacheState;
@@ -17,13 +38,26 @@ export function applyProviderConfig(params: {
   params.modelCacheState.anthropicContext1MEnabled =
     anthropicBeta?.includes("context-1m") ?? false;
 
+  const visionCapableModelsCache = params.modelCacheState.visionCapableModelsCache
+    ?? new Map<string, VisionCapableModel>()
+  params.modelCacheState.visionCapableModelsCache = visionCapableModelsCache
+  visionCapableModelsCache.clear()
+  setVisionCapableModelsCache(visionCapableModelsCache)
+
   if (!providers) return;
 
   for (const [providerID, providerConfig] of Object.entries(providers)) {
     const models = providerConfig?.models;
     if (!models) continue;
 
     for (const [modelID, modelConfig] of Object.entries(models)) {
+      if (supportsImageInput(modelConfig)) {
+        visionCapableModelsCache.set(
+          `${providerID}/${modelID}`,
+          { providerID, modelID },
+        )
+      }
+
       const contextLimit = modelConfig?.limit?.context;
       if (!contextLimit) continue;
 

diff --git a/src/plugin-state.ts b/src/plugin-state.ts
@@ -1,11 +1,18 @@
+/**
+ * Identifies a model by provider ID and model ID. applyProviderConfig stores
+ * one of these for each configured model that declares image input support.
+ */
+export type VisionCapableModel = {
+  providerID: string
+  modelID: string
+}
+
+/** Mutable model-related state that applyProviderConfig fills in from the provider config. */
 export interface ModelCacheState {
   modelContextLimitsCache: Map<string, number>;
+  // Vision-capable models keyed by `${providerID}/${modelID}`. Optional:
+  // applyProviderConfig creates the map when it is missing.
+  visionCapableModelsCache?: Map<string, VisionCapableModel>;
   anthropicContext1MEnabled: boolean;
 }
 
+/**
+ * Creates a fresh ModelCacheState: both caches start as empty Maps and
+ * `anthropicContext1MEnabled` starts as `false`.
+ */
 export function createModelCacheState(): ModelCacheState {
   return {
     modelContextLimitsCache: new Map<string, number>(),
+    visionCapableModelsCache: new Map<string, VisionCapableModel>(),
     anthropicContext1MEnabled: false,
   };
 }
diff --git a/src/shared/vision-capable-models-cache.ts b/src/shared/vision-capable-models-cache.ts
@@ -0,0 +1,17 @@
+import type { VisionCapableModel } from "../plugin-state"
+
+/**
+ * Module-level handle to the Map currently serving as the vision-capable
+ * models cache. Starts out as an empty Map.
+ */
+let visionCapableModelsCache = new Map<string, VisionCapableModel>()
+
+/**
+ * Makes `cache` the module-level cache.
+ *
+ * The Map is kept by reference (no copy is taken), so entries the caller adds
+ * to or removes from `cache` afterwards are visible through
+ * `readVisionCapableModelsCache`.
+ */
+export function setVisionCapableModelsCache(
+  cache: Map<string, VisionCapableModel>,
+): void {
+  visionCapableModelsCache = cache
+}
+
+/**
+ * Returns the cached models as a newly allocated array, in the Map's
+ * iteration order.
+ */
+export function readVisionCapableModelsCache(): VisionCapableModel[] {
+  return [...visionCapableModelsCache.values()]
+}
+
+/**
+ * Swaps in a brand-new empty Map. The previously installed Map is left
+ * untouched, so whoever passed it to `setVisionCapableModelsCache` keeps its
+ * contents.
+ */
+export function clearVisionCapableModelsCache(): void {
+  setVisionCapableModelsCache(new Map<string, VisionCapableModel>())
+}
diff --git a/src/tools/look-at/multimodal-agent-metadata.test.ts b/src/tools/look-at/multimodal-agent-metadata.test.ts
@@ -0,0 +1,115 @@
+/// <reference types="bun-types" />
+
+import { afterEach, beforeEach, describe, expect, mock, spyOn, test } from "bun:test"
+import type { PluginInput } from "@opencode-ai/plugin"
+import { resolveMultimodalLookerAgentMetadata } from "./multimodal-agent-metadata"
+import { setVisionCapableModelsCache, clearVisionCapableModelsCache } from "../../shared/vision-capable-models-cache"
+import * as connectedProvidersCache from "../../shared/connected-providers-cache"
+import * as modelAvailability from "../../shared/model-availability"
+
+/**
+ * Builds a minimal PluginInput stub for these tests.
+ *
+ * Only `client.app.agents` is functional: it is a mock that resolves to
+ * `{ data: agentData }`. The remaining fields are placeholders (empty objects
+ * cast to their types, "/project" paths, and a localhost server URL).
+ *
+ * @param agentData - Agent records the stubbed `client.app.agents()` resolves with.
+ */
+function createPluginInput(agentData: Array<Record<string, unknown>>): PluginInput {
+  const listAgents = mock(async () => ({ data: agentData }))
+
+  // Start from an empty object typed as the real client, then graft on the one
+  // endpoint the stub needs to provide.
+  const stubClient = {} as PluginInput["client"]
+  Object.assign(stubClient, { app: { agents: listAgents } })
+
+  const projectRoot = "/project"
+
+  return {
+    client: stubClient,
+    project: {} as PluginInput["project"],
+    directory: projectRoot,
+    worktree: projectRoot,
+    serverUrl: new URL("http://localhost"),
+    $: {} as PluginInput["$"],
+  }
+}
+
+describe("resolveMultimodalLookerAgentMetadata", () => {
+  // Every test starts from an empty shared vision-capable-models cache.
+  beforeEach(() => {
+    clearVisionCapableModelsCache()
+  })
+
+  afterEach(() => {
+    clearVisionCapableModelsCache()
+    // Restore every spyOn() installed by the test in a single call, instead of
+    // double-casting each spied export to reach an optional `mockRestore`.
+    mock.restore()
+  })
+
+  test("returns configured multimodal-looker model when it already matches a vision-capable override", async () => {
+    // given
+    setVisionCapableModelsCache(new Map([
+      [
+        "rundao/public/qwen3.5-397b",
+        { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      ],
+    ]))
+    spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
+      new Set(["rundao/public/qwen3.5-397b"]),
+    )
+    spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["rundao"])
+    const ctx = createPluginInput([
+      {
+        name: "multimodal-looker",
+        model: { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      },
+    ])
+
+    // when
+    const result = await resolveMultimodalLookerAgentMetadata(ctx)
+
+    // then
+    expect(result).toEqual({
+      agentModel: { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      agentVariant: undefined,
+    })
+  })
+
+  test("prefers connected vision-capable provider models before the hardcoded fallback chain", async () => {
+    // given
+    setVisionCapableModelsCache(new Map([
+      [
+        "rundao/public/qwen3.5-397b",
+        { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      ],
+    ]))
+    spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
+      new Set(["openai/gpt-5.4", "rundao/public/qwen3.5-397b"]),
+    )
+    spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai", "rundao"])
+    // The agent is configured with a model that is not in the vision cache.
+    const ctx = createPluginInput([
+      {
+        name: "multimodal-looker",
+        model: { providerID: "openai", modelID: "gpt-5.4" },
+        variant: "medium",
+      },
+    ])
+
+    // when
+    const result = await resolveMultimodalLookerAgentMetadata(ctx)
+
+    // then
+    // The cached vision-capable model is expected, and the configured variant
+    // is not carried over to it.
+    expect(result).toEqual({
+      agentModel: { providerID: "rundao", modelID: "public/qwen3.5-397b" },
+      agentVariant: undefined,
+    })
+  })
+
+  test("falls back to the hardcoded multimodal chain when no dynamic vision model exists", async () => {
+    // given
+    // The vision cache is left empty and no agents are registered.
+    spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
+      new Set(["google/gemini-3-flash"]),
+    )
+    spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["google"])
+    const ctx = createPluginInput([])
+
+    // when
+    const result = await resolveMultimodalLookerAgentMetadata(ctx)
+
+    // then
+    expect(result).toEqual({
+      agentModel: { providerID: "google", modelID: "gemini-3-flash" },
+      agentVariant: undefined,
+    })
+  })
+})