From 9e262521321b4fe8e1bbff7055a005d0a78236f5 Mon Sep 17 00:00:00 2001 From: Sol Date: Sun, 15 Mar 2026 13:44:40 +0000 Subject: [PATCH 01/30] feat: add real-time provisioning feedback to dashboard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New useSandboxStatusPoll hook: polls individual agent status every 5s during pending/provisioning, auto-stops on terminal states - New useSandboxListPoll hook: polls agent list every 10s while any sandbox is active, fires toast on running transition - Create dialog stays open during provisioning with 4-step progress stepper (created → database → container → running), elapsed timer, retry button on error, and Open Web UI button on completion - Status cell in table now shows pulse animation for active states, scale-in animation for running transitions, shake for errors - Added shake and scaleIn keyframe animations to globals.css --- app/globals.css | 15 + packages/lib/hooks/use-sandbox-status-poll.ts | 210 +++++++ .../create-milady-sandbox-dialog.tsx | 566 +++++++++++++----- .../containers/milady-sandboxes-table.tsx | 134 ++++- 4 files changed, 759 insertions(+), 166 deletions(-) create mode 100644 packages/lib/hooks/use-sandbox-status-poll.ts diff --git a/app/globals.css b/app/globals.css index 4fe9657e9..3fa50c8c1 100644 --- a/app/globals.css +++ b/app/globals.css @@ -1654,3 +1654,18 @@ button:not(:disabled), .animate-celebrate-bounce { animation: celebrateBounce 0.6s ease-out; } + +/* Provisioning UX — status transition animations */ +@keyframes shake { + 0%, 100% { transform: translateX(0); } + 20% { transform: translateX(-4px); } + 40% { transform: translateX(4px); } + 60% { transform: translateX(-3px); } + 80% { transform: translateX(2px); } +} + +@keyframes scaleIn { + 0% { transform: scale(0.85); opacity: 0.5; } + 60% { transform: scale(1.08); } + 100% { transform: scale(1); opacity: 1; } +} diff --git a/packages/lib/hooks/use-sandbox-status-poll.ts b/packages/lib/hooks/use-sandbox-status-poll.ts new file mode 100644 index 000000000..78ef32436 --- /dev/null +++ b/packages/lib/hooks/use-sandbox-status-poll.ts @@ -0,0 +1,210 @@ +"use client"; + +import { useCallback, useEffect, useRef, useState } from "react"; + +export type SandboxStatus = + | "pending" + | "provisioning" + | "running" + | "stopped" + | "disconnected" + | "error"; + +export interface SandboxStatusResult { + status: SandboxStatus; + lastHeartbeat: string | null; + error: string | null; + isLoading: boolean; +} + +const TERMINAL_STATES = new Set(["running", "stopped", "error"]); +const ACTIVE_STATES = new Set(["pending", "provisioning"]); + +/** + * Polls a single agent's status while it's in a non-terminal state. + * Stops automatically when the agent reaches "running", "stopped", or "error". + */ +export function useSandboxStatusPoll( + agentId: string | null, + options: { + intervalMs?: number; + enabled?: boolean; + } = {}, +) { + const { intervalMs = 5_000, enabled = true } = options; + const [result, setResult] = useState({ + status: "pending", + lastHeartbeat: null, + error: null, + isLoading: false, + }); + + const cancelledRef = useRef(false); + const intervalRef = useRef | null>(null); + const statusRef = useRef("pending"); + + const cleanup = useCallback(() => { + cancelledRef.current = true; + if (intervalRef.current) { + clearInterval(intervalRef.current); + intervalRef.current = null; + } + }, []); + + useEffect(() => { + if (!agentId || !enabled) { + cleanup(); + return; + } + + cancelledRef.current = false; + + const poll = async () => { + if (cancelledRef.current) return; + if (TERMINAL_STATES.has(statusRef.current)) { + cleanup(); + return; + } + + setResult((prev) => ({ ...prev, isLoading: true })); + + try { + const res = await fetch(`/api/v1/milady/agents/${agentId}`); + if (cancelledRef.current) return; + + if (!res.ok) { + setResult((prev) => ({ + ...prev, + isLoading: false, + error: `HTTP ${res.status}`, + })); + return; + } + + const json = await res.json(); + const data = json?.data; + if (!data) return; + + const newStatus = (data.status as SandboxStatus) ?? "pending"; + statusRef.current = newStatus; + + setResult({ + status: newStatus, + lastHeartbeat: data.lastHeartbeatAt ?? null, + error: data.errorMessage ?? null, + isLoading: false, + }); + + // Stop polling once we've reached a terminal state + if (TERMINAL_STATES.has(newStatus)) { + cleanup(); + } + } catch { + if (!cancelledRef.current) { + setResult((prev) => ({ ...prev, isLoading: false })); + } + } + }; + + // Initial poll + void poll(); + + // Set up interval + intervalRef.current = setInterval(() => void poll(), intervalMs); + + return cleanup; + }, [agentId, enabled, intervalMs, cleanup]); + + return result; +} + +/** + * Polls the agent list endpoint while any sandbox is in an active state. + * Returns true when any sandbox transitions to 'running'. + */ +export function useSandboxListPoll( + sandboxes: Array<{ id: string; status: string }>, + options: { + intervalMs?: number; + onTransitionToRunning?: (agentId: string, agentName?: string) => void; + } = {}, +) { + const { intervalMs = 10_000, onTransitionToRunning } = options; + const [isPolling, setIsPolling] = useState(false); + const previousStatusesRef = useRef>(new Map()); + const callbackRef = useRef(onTransitionToRunning); + const intervalRef = useRef | null>(null); + + useEffect(() => { + callbackRef.current = onTransitionToRunning; + }, [onTransitionToRunning]); + + // Sync current sandbox statuses + useEffect(() => { + const statusMap = new Map(); + for (const sb of sandboxes) { + statusMap.set(sb.id, sb.status); + } + previousStatusesRef.current = statusMap; + }, [sandboxes]); + + const hasActiveAgents = sandboxes.some((sb) => + ACTIVE_STATES.has(sb.status as SandboxStatus), + ); + + useEffect(() => { + if (!hasActiveAgents) { + setIsPolling(false); + if (intervalRef.current) { + clearInterval(intervalRef.current); + intervalRef.current = null; + } + return; + } + + setIsPolling(true); + let cancelled = false; + + const poll = async () => { + if (cancelled) return; + + try { + const res = await fetch("/api/v1/milady/agents"); + if (cancelled || !res.ok) return; + + const json = await res.json(); + const agents: Array<{ id: string; status: string; agentName?: string; agent_name?: string }> = + json?.data ?? []; + + for (const agent of agents) { + const prevStatus = previousStatusesRef.current.get(agent.id); + const newStatus = agent.status; + + if ( + prevStatus && + ACTIVE_STATES.has(prevStatus as SandboxStatus) && + newStatus === "running" + ) { + callbackRef.current?.(agent.id, agent.agentName ?? agent.agent_name); + } + + previousStatusesRef.current.set(agent.id, newStatus); + } + } catch { + // Silently retry on next interval + } + }; + + intervalRef.current = setInterval(() => void poll(), intervalMs); + + return () => { + cancelled = true; + if (intervalRef.current) { + clearInterval(intervalRef.current); + intervalRef.current = null; + } + }; + }, [hasActiveAgents, intervalMs]); + + return { isPolling }; +} diff --git a/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx b/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx index e259f0a8e..1ffde7232 100644 --- a/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx +++ b/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx @@ -17,18 +17,224 @@ import { SelectValue, Switch, } from "@elizaos/cloud-ui"; -import { Loader2, Plus } from "lucide-react"; +import { Check, ExternalLink, Loader2, Plus, RotateCcw, X } from "lucide-react"; import { useRouter } from "next/navigation"; -import { type ReactNode, useState } from "react"; +import { type ReactNode, useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { AGENT_FLAVORS, getDefaultFlavor, getFlavorById } from "@/lib/constants/agent-flavors"; +import { useSandboxStatusPoll, type SandboxStatus } from "@/lib/hooks/use-sandbox-status-poll"; +import { getClientSafeMiladyAgentWebUiUrl } from "@/lib/milady-web-ui"; +import { openWebUIWithPairing } from "@/lib/hooks/open-web-ui"; + +// ---------------------------------------------------------------- +// Provisioning Steps +// ---------------------------------------------------------------- + +interface StepConfig { + label: string; + matchStatuses: SandboxStatus[]; +} + +const PROVISIONING_STEPS: StepConfig[] = [ + { label: "Agent created", matchStatuses: [] }, + { label: "Provisioning database", matchStatuses: ["pending"] }, + { label: "Starting container", matchStatuses: ["provisioning"] }, + { label: "Agent running", matchStatuses: ["running"] }, +]; + +function getActiveStepIndex(status: SandboxStatus): number { + if (status === "running") return 3; + if (status === "provisioning") return 2; + if (status === "pending") return 1; + return 0; +} + +type StepState = "complete" | "active" | "pending" | "error"; + +function getStepState( + stepIndex: number, + activeIndex: number, + hasError: boolean, +): StepState { + if (hasError && stepIndex === activeIndex) return "error"; + if (stepIndex < activeIndex) return "complete"; + if (stepIndex === activeIndex) return "active"; + return "pending"; +} + +// ---------------------------------------------------------------- +// Step Indicator Component +// ---------------------------------------------------------------- + +function StepIndicator({ state }: { state: StepState }) { + switch (state) { + case "complete": + return ( +
+ +
+ ); + case "active": + return ( +
+ + +
+ ); + case "error": + return ( +
+ +
+ ); + case "pending": + default: + return ( +
+ +
+ ); + } +} + +// ---------------------------------------------------------------- +// Provisioning Progress View +// ---------------------------------------------------------------- + +function ProvisioningProgress({ + status, + error, + agentId, + elapsedSec, + onClose, + onRetry, +}: { + status: SandboxStatus; + error: string | null; + agentId: string; + elapsedSec: number; + onClose: () => void; + onRetry: () => void; +}) { + const activeIndex = getActiveStepIndex(status); + const hasError = status === "error"; + const isComplete = status === "running"; + + return ( +
+ {/* Header */} +
+

+ {isComplete + ? "Your agent is ready!" + : hasError + ? "Something went wrong" + : "Setting up your agent..."} +

+ {!isComplete && !hasError && ( + + {elapsedSec < 60 + ? `${elapsedSec}s` + : `${Math.floor(elapsedSec / 60)}m ${elapsedSec % 60}s`} + {" · usually ~90s"} + + )} +
+ + {/* Steps */} +
+ {PROVISIONING_STEPS.map((step, i) => { + const state = getStepState(i, activeIndex, hasError); + const isLast = i === PROVISIONING_STEPS.length - 1; + return ( +
+ {/* Vertical connector line */} + {!isLast && ( +
+
+
+ )} + +
+

+ {step.label} +

+
+
+ ); + })} +
+ + {/* Error message */} + {hasError && error && ( +
+

{error}

+ +
+ )} + + {/* Actions */} +
+ {isComplete ? ( + <> + openWebUIWithPairing(agentId)} + > + + Open Web UI + + + Done + + + ) : ( + + {hasError ? "Close" : "Close — provisioning continues in background"} + + )} +
+
+ ); +} + +// ---------------------------------------------------------------- +// Main Dialog Component +// ---------------------------------------------------------------- interface CreateMiladySandboxDialogProps { trigger?: ReactNode; onProvisionQueued?: (agentId: string, jobId: string) => void; } -type CreatePhase = "idle" | "creating" | "provisioning"; +type CreatePhase = "form" | "creating" | "provisioning"; export function CreateMiladySandboxDialog({ trigger, @@ -40,14 +246,62 @@ export function CreateMiladySandboxDialog({ const [flavorId, setFlavorId] = useState(getDefaultFlavor().id); const [customImage, setCustomImage] = useState(""); const [autoStart, setAutoStart] = useState(true); - const [phase, setPhase] = useState("idle"); + const [phase, setPhase] = useState("form"); const [error, setError] = useState(null); + const [createdAgentId, setCreatedAgentId] = useState(null); + const [provisionStartTime, setProvisionStartTime] = useState(null); + const [elapsedSec, setElapsedSec] = useState(0); - const busy = phase !== "idle"; + const busy = phase === "creating"; + const isProvisioningPhase = phase === "provisioning"; const selectedFlavor = getFlavorById(flavorId); const isCustom = flavorId === "custom"; const resolvedDockerImage = isCustom ? customImage.trim() : selectedFlavor?.dockerImage; + // Poll the agent status while in provisioning phase + const pollResult = useSandboxStatusPoll( + isProvisioningPhase ? createdAgentId : null, + { intervalMs: 5_000, enabled: isProvisioningPhase }, + ); + + // Elapsed time counter + useEffect(() => { + if (!provisionStartTime) { + setElapsedSec(0); + return; + } + const tick = () => setElapsedSec(Math.floor((Date.now() - provisionStartTime) / 1000)); + tick(); + const id = setInterval(tick, 1000); + return () => clearInterval(id); + }, [provisionStartTime]); + + // When provisioning completes, refresh the table data + useEffect(() => { + if (isProvisioningPhase && pollResult.status === "running") { + router.refresh(); + toast.success("Agent is up and running!"); + } + }, [isProvisioningPhase, pollResult.status, router]); + + function resetForm() { + setAgentName(""); + setFlavorId(getDefaultFlavor().id); + setCustomImage(""); + setError(null); + setPhase("form"); + setCreatedAgentId(null); + setProvisionStartTime(null); + setElapsedSec(0); + } + + function handleClose() { + setOpen(false); + // Delay reset so the closing animation finishes + setTimeout(resetForm, 300); + router.refresh(); + } + async function handleCreate() { const trimmedName = agentName.trim(); if (!trimmedName || busy) return; @@ -59,7 +313,6 @@ export function CreateMiladySandboxDialog({ const createBody: Record = { agentName: trimmedName, }; - // Only send dockerImage if it differs from the default milady image if (resolvedDockerImage && flavorId !== getDefaultFlavor().id) { createBody.dockerImage = resolvedDockerImage; } @@ -82,10 +335,13 @@ export function CreateMiladySandboxDialog({ throw new Error("Sandbox created but no agent id was returned"); } - toast.success(`Sandbox "${trimmedName}" created`); + setCreatedAgentId(agentId); if (autoStart) { + // Transition to provisioning view instead of closing setPhase("provisioning"); + setProvisionStartTime(Date.now()); + const provisionRes = await fetch(`/api/v1/milady/agents/${agentId}/provision`, { method: "POST", }); @@ -96,44 +352,58 @@ export function CreateMiladySandboxDialog({ if (jobId) { onProvisionQueued?.(agentId, jobId); } - toast.info( - provisionRes.status === 409 - ? jobId - ? `Provisioning already in progress, job ${jobId.slice(0, 8)} is running` - : "Provisioning is already in progress." - : jobId - ? `Provisioning queued, job ${jobId.slice(0, 8)} is running` - : "Provisioning queued. This usually takes about 90 seconds.", - ); + // Stay in provisioning view — the polling hook will track status } else if (provisionRes.ok) { - toast.success("Sandbox is running"); + // Already running (synchronous provision) + toast.success("Agent is running"); + handleClose(); } else { toast.warning( (provisionData as { error?: string }).error ?? "Sandbox created, but auto-start failed. You can start it from the table.", ); + handleClose(); } + } else { + toast.success(`Sandbox "${trimmedName}" created`); + handleClose(); } - - setOpen(false); - setAgentName(""); - setFlavorId(getDefaultFlavor().id); - setCustomImage(""); - setError(null); - setPhase("idle"); - router.refresh(); } catch (err) { const message = err instanceof Error ? err.message : String(err); setError(message); - setPhase("idle"); + setPhase("form"); toast.error(message); } } + async function handleRetryProvision() { + if (!createdAgentId) return; + setProvisionStartTime(Date.now()); + + try { + const res = await fetch(`/api/v1/milady/agents/${createdAgentId}/provision`, { + method: "POST", + }); + const data = await res.json().catch(() => ({})); + + if (res.status === 202 || res.status === 409) { + const jobId = (data as { data?: { jobId?: string } }).data?.jobId; + if (jobId) { + onProvisionQueued?.(createdAgentId, jobId); + } + toast.info("Retrying provisioning..."); + } else if (!res.ok) { + toast.error((data as { error?: string }).error ?? "Retry failed"); + } + } catch (err) { + toast.error(`Retry failed: ${err instanceof Error ? err.message : String(err)}`); + } + } + return ( <> {trigger ? ( -
!busy && setOpen(true)}>{trigger}
+
phase === "form" && setOpen(true)}>{trigger}
) : ( setOpen(true)} disabled={busy}> @@ -141,125 +411,147 @@ export function CreateMiladySandboxDialog({ )} - !busy && setOpen(nextOpen)}> + { + if (!nextOpen && !busy) { + handleClose(); + } + }} + > - Create Milady Sandbox - - Create a new agent sandbox and optionally start provisioning right away. - + + {isProvisioningPhase ? "Launching Agent" : "Create Milady Sandbox"} + + {!isProvisioningPhase && ( + + Create a new agent sandbox and optionally start provisioning right away. + + )} -
-
- - setAgentName(e.target.value)} - disabled={busy} - className="bg-black/40 border-white/10 text-white placeholder:text-neutral-600" - onKeyDown={(e) => { - if (e.key === "Enter") { - e.preventDefault(); - void handleCreate(); - } - }} - maxLength={100} - autoFocus - /> -
+ {isProvisioningPhase ? ( + + ) : ( + <> +
+
+ + setAgentName(e.target.value)} + disabled={busy} + className="bg-black/40 border-white/10 text-white placeholder:text-neutral-600" + onKeyDown={(e) => { + if (e.key === "Enter") { + e.preventDefault(); + void handleCreate(); + } + }} + maxLength={100} + autoFocus + /> +
- {/* Flavor selector */} -
- - - {selectedFlavor && ( -

{selectedFlavor.description}

- )} -
+ {/* Flavor selector */} +
+ + + {selectedFlavor && ( +

{selectedFlavor.description}

+ )} +
- {/* Custom image input (only when "custom" flavor is selected) */} - {isCustom && ( -
- - setCustomImage(e.target.value)} - disabled={busy} - className="bg-black/40 border-white/10 text-white placeholder:text-neutral-600" - maxLength={256} - /> -
- )} + {/* Custom image input */} + {isCustom && ( +
+ + setCustomImage(e.target.value)} + disabled={busy} + className="bg-black/40 border-white/10 text-white placeholder:text-neutral-600" + maxLength={256} + /> +
+ )} -
-
- -

- Queue provisioning as soon as the sandbox record is created. -

-
- -
+
+
+ +

+ Queue provisioning as soon as the sandbox record is created. +

+
+ +
- {error && ( -

- {error} -

- )} -
+ {error && ( +

+ {error} +

+ )} +
- - setOpen(false)} disabled={busy}> - Cancel - - void handleCreate()} - disabled={!agentName.trim() || busy || (isCustom && !customImage.trim())} - > - {busy && } - {phase === "creating" - ? "Creating..." - : phase === "provisioning" - ? "Queueing..." - : autoStart - ? "Create & Start" - : "Create Sandbox"} - - + + + Cancel + + void handleCreate()} + disabled={!agentName.trim() || busy || (isCustom && !customImage.trim())} + > + {busy && } + {busy + ? "Creating..." + : autoStart + ? "Create & Start" + : "Create Sandbox"} + + + + )}
diff --git a/packages/ui/src/components/containers/milady-sandboxes-table.tsx b/packages/ui/src/components/containers/milady-sandboxes-table.tsx index 2db6e4c38..bbdde35d0 100644 --- a/packages/ui/src/components/containers/milady-sandboxes-table.tsx +++ b/packages/ui/src/components/containers/milady-sandboxes-table.tsx @@ -1,7 +1,7 @@ /** * Milady Sandboxes Table — lists AI agent sandboxes in the containers dashboard. * Distinguishes between Docker-backed (node_id set) and Vercel-backed sandboxes. - * Keeps the user-facing surface focused on Milady actions instead of raw infra. + * Auto-refreshes while any sandbox is in an active (pending/provisioning) state. */ "use client"; @@ -47,10 +47,11 @@ import { } from "lucide-react"; import Link from "next/link"; import { useRouter } from "next/navigation"; -import { useMemo, useState } from "react"; +import { useEffect, useMemo, useRef, useState } from "react"; import { toast } from "sonner"; import { openWebUIWithPairing } from "@/lib/hooks/open-web-ui"; import { useJobPoller } from "@/lib/hooks/use-job-poller"; +import { useSandboxListPoll } from "@/lib/hooks/use-sandbox-status-poll"; import { getClientSafeMiladyAgentWebUiUrl } from "@/lib/milady-web-ui"; import { CreateMiladySandboxDialog } from "./create-milady-sandbox-dialog"; @@ -142,6 +143,86 @@ function formatRelative(date: Date | string | null): string { return d.toLocaleDateString(); } +// ---------------------------------------------------------------- +// Status Cell — animated transitions +// ---------------------------------------------------------------- + +function StatusCell({ + displayStatus, + isProvisioning, + trackedJob, + errorMessage, +}: { + displayStatus: string; + isProvisioning: boolean; + trackedJob?: { jobId: string } | null; + errorMessage: string | null; +}) { + const [prevStatus, setPrevStatus] = useState(displayStatus); + const [animate, setAnimate] = useState<"success" | "error" | null>(null); + + useEffect(() => { + if (prevStatus !== displayStatus) { + if (displayStatus === "running" && (prevStatus === "provisioning" || prevStatus === "pending")) { + setAnimate("success"); + const id = setTimeout(() => setAnimate(null), 1500); + setPrevStatus(displayStatus); + return () => clearTimeout(id); + } + if (displayStatus === "error") { + setAnimate("error"); + const id = setTimeout(() => setAnimate(null), 600); + setPrevStatus(displayStatus); + return () => clearTimeout(id); + } + setPrevStatus(displayStatus); + } + }, [displayStatus, prevStatus]); + + return ( +
+
+ + {statusDot(displayStatus)} {displayStatus} + +
+ {isProvisioning && trackedJob && ( + + + Starting, job {trackedJob.jobId.slice(0, 8)} + + )} + {errorMessage && ( + + +

+ {errorMessage} +

+
+ +

{errorMessage}

+
+
+ )} +
+ ); +} + // ---------------------------------------------------------------- // Component // ---------------------------------------------------------------- @@ -157,6 +238,21 @@ export function MiladySandboxesTable({ sandboxes }: MiladySandboxesTableProps) { onFailed: (job) => toast.error(job.error ?? "Provisioning failed"), }); + // Auto-refresh polling: polls the list endpoint while any sandbox is active + useSandboxListPoll( + sandboxes.map((sb) => ({ + id: sb.id, + status: poller.isActive(sb.id) ? "provisioning" : sb.status, + })), + { + intervalMs: 10_000, + onTransitionToRunning: (_id, name) => { + toast.success(`${name ?? "Agent"} is now running!`); + router.refresh(); + }, + }, + ); + const [searchQuery, setSearchQuery] = useState(""); const [statusFilter, setStatusFilter] = useState("all"); const [sortField, setSortField] = useState<"name" | "status" | "created">("created"); @@ -441,34 +537,14 @@ export function MiladySandboxesTable({ sandboxes }: MiladySandboxesTableProps) {
- {/* Status */} + {/* Status — with animated transitions */} -
- - {statusDot(displayStatus)} {displayStatus} - - {isProvisioning && trackedJob && ( - - - Starting, job {trackedJob.jobId.slice(0, 8)} - - )} - {sb.error_message && ( - - -

- {sb.error_message} -

-
- -

{sb.error_message}

-
-
- )} -
+
{/* Runtime */} From 4cfa65f4ad5a418a7ff7e26e85f9ece172b6b776 Mon Sep 17 00:00:00 2001 From: Sol Date: Mon, 16 Mar 2026 08:24:48 +0000 Subject: [PATCH 02/30] feat: admin infrastructure dashboard API + service - New admin infrastructure API endpoint (app/api/v1/admin/infrastructure/route.ts) - Admin infrastructure service with SSH node inspection and container health classification - Unit tests for container health classification (5 tests, all passing) - Classifies containers as healthy, failed, missing, warming, or stale based on runtime state, control plane records, and heartbeat freshness --- app/api/v1/admin/infrastructure/route.ts | 34 + packages/lib/services/admin-infrastructure.ts | 880 ++++++++++++++++++ .../tests/unit/admin-infrastructure.test.ts | 90 ++ 3 files changed, 1004 insertions(+) create mode 100644 app/api/v1/admin/infrastructure/route.ts create mode 100644 packages/lib/services/admin-infrastructure.ts create mode 100644 packages/tests/unit/admin-infrastructure.test.ts diff --git a/app/api/v1/admin/infrastructure/route.ts b/app/api/v1/admin/infrastructure/route.ts new file mode 100644 index 000000000..e672207c0 --- /dev/null +++ b/app/api/v1/admin/infrastructure/route.ts @@ -0,0 +1,34 @@ +import { NextRequest, NextResponse } from "next/server"; +import { requireAdmin } from "@/lib/auth"; +import { getAdminInfrastructureSnapshot } from "@/lib/services/admin-infrastructure"; +import { logger } from "@/lib/utils/logger"; + +export const dynamic = "force-dynamic"; + +export async function GET(request: NextRequest) { + const { role } = await requireAdmin(request); + if (role !== "super_admin") { + return NextResponse.json( + { success: false, error: "Super admin access required" }, + { status: 403 }, + ); + } + + try { + const snapshot = await getAdminInfrastructureSnapshot(); + + return NextResponse.json({ + success: true, + data: snapshot, + }); + } catch (error) { + logger.error("[Admin Infrastructure] Failed to build infrastructure snapshot", { + error: error instanceof Error ? error.message : String(error), + }); + + return NextResponse.json( + { success: false, error: "Failed to load infrastructure snapshot" }, + { status: 500 }, + ); + } +} diff --git a/packages/lib/services/admin-infrastructure.ts b/packages/lib/services/admin-infrastructure.ts new file mode 100644 index 000000000..afeb34187 --- /dev/null +++ b/packages/lib/services/admin-infrastructure.ts @@ -0,0 +1,880 @@ +import { asc } from "drizzle-orm"; +import { dbRead } from "@/db/helpers"; +import { dockerNodesRepository } from "@/db/repositories/docker-nodes"; +import type { DockerNodeStatus } from "@/db/schemas/docker-nodes"; +import { type MiladySandboxStatus, miladySandboxes } from "@/db/schemas/milady-sandboxes"; +import { DockerSSHClient } from "@/lib/services/docker-ssh"; +import { logger } from "@/lib/utils/logger"; + +const HEARTBEAT_WARNING_MINUTES = 5; +const HEARTBEAT_STALE_MINUTES = 15; +const NODE_SATURATION_WARNING_PCT = 85; +const NODE_SATURATION_CRITICAL_PCT = 100; +const NODE_RESOURCE_WARNING_PCT = 85; +const NODE_RESOURCE_CRITICAL_PCT = 95; +const SSH_CONNECT_TIMEOUT_MS = 10_000; +const SSH_COMMAND_TIMEOUT_MS = 15_000; + +type IncidentSeverity = "critical" | "warning" | "info"; +type IncidentScope = "cluster" | "node" | "container"; + +export type ContainerLiveHealthStatus = + | "healthy" + | "warming" + | "degraded" + | "stale" + | "missing" + | "failed" + | "stopped" + | "unknown"; + +export interface ContainerHealthAssessment { + status: ContainerLiveHealthStatus; + severity: IncidentSeverity; + reason: string; +} + +interface RuntimeContainerRecord { + name: string; + id: string; + image: string | null; + state: string; + status: string; + runningFor: string | null; + health: "healthy" | "unhealthy" | "starting" | null; +} + +interface NodeRuntimeSnapshot { + reachable: boolean; + checkedAt: string; + sshLatencyMs: number | null; + dockerVersion: string | null; + diskUsedPercent: number | null; + memoryUsedPercent: number | null; + loadAverage: string | null; + actualContainerCount: number; + runningContainerCount: number; + containers: RuntimeContainerRecord[]; + error: string | null; +} + +export interface AdminInfrastructureContainer { + id: string; + sandboxId: string | null; + agentName: string | null; + organizationId: string | null; + userId: string | null; + nodeId: string | null; + containerName: string | null; + dbStatus: MiladySandboxStatus; + liveHealth: ContainerLiveHealthStatus; + liveHealthSeverity: IncidentSeverity; + liveHealthReason: string; + runtimeState: string | null; + runtimeStatus: string | null; + runtimePresent: boolean; + dockerImage: string | null; + bridgePort: number | null; + webUiPort: number | null; + headscaleIp: string | null; + bridgeUrl: string | null; + healthUrl: string | null; + lastHeartbeatAt: string | null; + heartbeatAgeMinutes: number | null; + errorMessage: string | null; + errorCount: number; + createdAt: string; + updatedAt: string; +} + +export interface AdminInfrastructureNode { + id: string; + nodeId: string; + hostname: string; + sshPort: number; + sshUser: string; + capacity: number; + allocatedCount: number; + availableSlots: number; + enabled: boolean; + status: DockerNodeStatus; + lastHealthCheck: string | null; + utilizationPct: number; + runtime: NodeRuntimeSnapshot; + allocationDrift: number; + alerts: string[]; + containers: AdminInfrastructureContainer[]; + ghostContainers: Array<{ + name: string; + state: string; + status: string; + }>; + metadata: Record | null; + createdAt: string; + updatedAt: string; +} + +export interface AdminInfrastructureIncident { + severity: IncidentSeverity; + scope: IncidentScope; + title: string; + detail: string; + nodeId?: string; + containerId?: string; +} + +export interface AdminInfrastructureSummary { + totalNodes: number; + enabledNodes: number; + healthyNodes: number; + degradedNodes: number; + offlineNodes: number; + unknownNodes: number; + totalCapacity: number; + allocatedSlots: number; + availableSlots: number; + utilizationPct: number; + saturatedNodes: number; + nodesWithDrift: number; + totalContainers: number; + runningContainers: number; + pendingContainers: number; + provisioningContainers: number; + stoppedContainers: number; + errorContainers: number; + disconnectedContainers: number; + healthyContainers: number; + attentionContainers: number; + staleContainers: number; + missingContainers: number; + failedContainers: number; + backlogCount: number; +} + +export interface AdminInfrastructureSnapshot { + refreshedAt: string; + summary: AdminInfrastructureSummary; + incidents: AdminInfrastructureIncident[]; + nodes: AdminInfrastructureNode[]; + containers: AdminInfrastructureContainer[]; +} + +function toIso(value: Date | string | null | undefined): string | null { + if (!value) return null; + return value instanceof Date ? value.toISOString() : new Date(value).toISOString(); +} + +function parsePercent(value: string): number | null { + const parsed = Number.parseInt(value.replace(/%/g, "").trim(), 10); + return Number.isFinite(parsed) ? parsed : null; +} + +function parseMemoryPercent(value: string): number | null { + const [usedRaw, totalRaw] = value.split("|"); + const used = Number.parseInt(usedRaw ?? "", 10); + const total = Number.parseInt(totalRaw ?? "", 10); + + if (!Number.isFinite(used) || !Number.isFinite(total) || total <= 0) { + return null; + } + + return Math.round((used / total) * 100); +} + +function parseRuntimeContainers(output: string): RuntimeContainerRecord[] { + return output + .split("\n") + .map((line) => line.trim()) + .filter(Boolean) + .map((line) => { + const [name = "", id = "", image = "", state = "", status = "", runningFor = ""] = + line.split("|"); + return { + name, + id, + image: image || null, + state: state.toLowerCase(), + status, + runningFor: runningFor || null, + health: parseDockerHealth(status), + } satisfies RuntimeContainerRecord; + }); +} + +function parseDockerHealth(status: string): RuntimeContainerRecord["health"] { + const normalized = status.toLowerCase(); + if (normalized.includes("unhealthy")) return "unhealthy"; + if (normalized.includes("healthy")) return "healthy"; + if (normalized.includes("health: starting") || normalized.includes("starting")) return "starting"; + return null; +} + +function getHeartbeatAgeMinutes(lastHeartbeatAt: string | null): number | null { + if (!lastHeartbeatAt) return null; + const parsed = new Date(lastHeartbeatAt).getTime(); + if (Number.isNaN(parsed)) return null; + return Math.max(0, Math.round((Date.now() - parsed) / 60_000)); +} + +function buildResourceAlert(label: string, percent: number | null): string | null { + if (percent === null) return null; + if (percent >= NODE_RESOURCE_CRITICAL_PCT) return `${label} ${percent}% used`; + if (percent >= NODE_RESOURCE_WARNING_PCT) return `${label} ${percent}% used`; + return null; +} + +function sortIncidents(a: AdminInfrastructureIncident, b: AdminInfrastructureIncident): number { + const severityWeight: Record = { + critical: 0, + warning: 1, + info: 2, + }; + + return severityWeight[a.severity] - severityWeight[b.severity] || a.title.localeCompare(b.title); +} + +export function classifyContainerHealth(params: { + dbStatus: MiladySandboxStatus; + runtime: RuntimeContainerRecord | null; + lastHeartbeatAt: string | null; + errorMessage: string | null; +}): ContainerHealthAssessment { + const heartbeatAgeMinutes = getHeartbeatAgeMinutes(params.lastHeartbeatAt); + const runtime = params.runtime; + + if (params.dbStatus === "error") { + return { + status: "failed", + severity: "critical", + reason: params.errorMessage || "Provisioning or runtime error recorded in control plane", + }; + } + + if (params.dbStatus === "stopped" && !runtime) { + return { + status: "stopped", + severity: "info", + reason: "Container is intentionally stopped", + }; + } + + if (!runtime) { + if (params.dbStatus === "pending" || params.dbStatus === "provisioning") { + return { + status: "warming", + severity: "info", + reason: "Container is not on a node yet", + }; + } + + return { + status: "missing", + severity: "critical", + reason: "Database record exists but container is missing from the node", + }; + } + + if (params.dbStatus === "stopped") { + return { + status: "degraded", + severity: "warning", + reason: "Control plane says stopped but container still exists on the node", + }; + } + + if (runtime.state === "dead" || runtime.state === "exited") { + return { + status: "failed", + severity: "critical", + reason: runtime.status || "Container exited unexpectedly", + }; + } + + if (runtime.state === "restarting") { + return { + status: "degraded", + severity: "warning", + reason: runtime.status || "Container is restarting", + }; + } + + if (runtime.state === "created") { + return { + status: "warming", + severity: "info", + reason: "Container exists but has not started yet", + }; + } + + if (runtime.health === "unhealthy") { + return { + status: "failed", + severity: "critical", + reason: runtime.status || "Docker health check reports unhealthy", + }; + } + + if (runtime.health === "starting") { + return { + status: "warming", + severity: "info", + reason: runtime.status || "Docker health check is still warming up", + }; + } + + if (params.dbStatus === "pending" || params.dbStatus === "provisioning") { + return { + status: "warming", + severity: "info", + reason: "Provisioning is still in progress", + }; + } + + if (params.dbStatus === "disconnected") { + return { + status: "degraded", + severity: "warning", + reason: "Container is running but marked disconnected", + }; + } + + if (heartbeatAgeMinutes === null) { + return { + status: "degraded", + severity: "warning", + reason: "No heartbeat has been recorded yet", + }; + } + + if (heartbeatAgeMinutes >= HEARTBEAT_STALE_MINUTES) { + return { + status: "stale", + severity: "critical", + reason: `Heartbeat is ${heartbeatAgeMinutes}m old`, + }; + } + + if (heartbeatAgeMinutes >= HEARTBEAT_WARNING_MINUTES) { + return { + status: "degraded", + severity: "warning", + reason: `Heartbeat is delayed (${heartbeatAgeMinutes}m old)`, + }; + } + + return { + status: "healthy", + severity: "info", + reason: runtime.status || "Container is running normally", + }; +} + +async function inspectNodeRuntime(node: { + node_id: string; + hostname: string; + ssh_port: number; + ssh_user: string; + host_key_fingerprint: string | null; +}): Promise { + const checkedAt = new Date().toISOString(); + const ssh = new DockerSSHClient({ + hostname: node.hostname, + port: node.ssh_port, + username: node.ssh_user, + hostKeyFingerprint: node.host_key_fingerprint ?? undefined, + }); + + try { + const sshStart = Date.now(); + await ssh.exec("echo ok", SSH_CONNECT_TIMEOUT_MS); + const sshLatencyMs = Date.now() - sshStart; + + const [dockerVersionRaw, diskRaw, memoryRaw, loadAverageRaw, containersRaw] = await Promise.all( + [ + ssh.exec("docker version --format '{{.Server.Version}}'", SSH_COMMAND_TIMEOUT_MS), + ssh.exec("df -P / | tail -1 | awk '{print $5}'", SSH_COMMAND_TIMEOUT_MS), + ssh.exec("free -b | awk '/Mem:/ {print $3\"|\"$2}'", SSH_COMMAND_TIMEOUT_MS), + ssh.exec("cut -d' ' -f1-3 /proc/loadavg", SSH_COMMAND_TIMEOUT_MS), + ssh.exec( + "docker ps -a --filter name=milady- --format '{{.Names}}|{{.ID}}|{{.Image}}|{{.State}}|{{.Status}}|{{.RunningFor}}' 2>/dev/null || true", + SSH_COMMAND_TIMEOUT_MS, + ), + ], + ); + + const containers = parseRuntimeContainers(containersRaw); + + return { + reachable: true, + checkedAt, + sshLatencyMs, + dockerVersion: dockerVersionRaw.trim() || null, + diskUsedPercent: parsePercent(diskRaw), + memoryUsedPercent: parseMemoryPercent(memoryRaw.trim()), + loadAverage: loadAverageRaw.trim() || null, + actualContainerCount: containers.length, + runningContainerCount: containers.filter((container) => container.state === "running").length, + containers, + error: null, + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + logger.warn("[admin-infrastructure] Failed to inspect node runtime", { + nodeId: node.node_id, + error: message, + }); + + return { + reachable: false, + checkedAt, + sshLatencyMs: null, + dockerVersion: null, + diskUsedPercent: null, + memoryUsedPercent: null, + loadAverage: null, + actualContainerCount: 0, + runningContainerCount: 0, + containers: [], + error: message, + }; + } finally { + try { + await ssh.disconnect(); + } catch { + // ignore cleanup failures + } + } +} + +function buildNodeAlerts(params: { + node: Awaited>[number]; + runtime: NodeRuntimeSnapshot; + allocationDrift: number; + unhealthyContainerCount: number; +}): string[] { + const alerts: string[] = []; + const { node, runtime, allocationDrift, unhealthyContainerCount } = params; + + if (!node.enabled) { + alerts.push("Node is disabled for new allocations"); + } + + if (!runtime.reachable) { + alerts.push("Live SSH inspection failed"); + return alerts; + } + + const saturation = + node.capacity > 0 ? Math.round((node.allocated_count / node.capacity) * 100) : 0; + if (saturation >= NODE_SATURATION_CRITICAL_PCT) { + alerts.push(`Capacity exhausted (${saturation}% allocated)`); + } else if (saturation >= NODE_SATURATION_WARNING_PCT) { + alerts.push(`Capacity nearly full (${saturation}% allocated)`); + } + + const diskAlert = buildResourceAlert("Disk", runtime.diskUsedPercent); + if (diskAlert) alerts.push(diskAlert); + + const memoryAlert = buildResourceAlert("Memory", runtime.memoryUsedPercent); + if (memoryAlert) alerts.push(memoryAlert); + + if (allocationDrift !== 0) { + const driftDirection = allocationDrift > 0 ? `+${allocationDrift}` : `${allocationDrift}`; + alerts.push(`Allocation drift ${driftDirection} vs control plane`); + } + + if (unhealthyContainerCount > 0) { + alerts.push( + `${unhealthyContainerCount} container${unhealthyContainerCount === 1 ? "" : "s"} need attention`, + ); + } + + return alerts; +} + +export async function getAdminInfrastructureSnapshot(): Promise { + const refreshedAt = new Date().toISOString(); + + const [nodes, sandboxRows] = await Promise.all([ + dockerNodesRepository.findAll(), + dbRead + .select({ + id: miladySandboxes.id, + sandboxId: miladySandboxes.sandbox_id, + organizationId: miladySandboxes.organization_id, + userId: miladySandboxes.user_id, + agentName: miladySandboxes.agent_name, + status: miladySandboxes.status, + nodeId: miladySandboxes.node_id, + containerName: miladySandboxes.container_name, + bridgePort: miladySandboxes.bridge_port, + webUiPort: miladySandboxes.web_ui_port, + headscaleIp: miladySandboxes.headscale_ip, + dockerImage: miladySandboxes.docker_image, + bridgeUrl: miladySandboxes.bridge_url, + healthUrl: miladySandboxes.health_url, + lastHeartbeatAt: miladySandboxes.last_heartbeat_at, + errorMessage: miladySandboxes.error_message, + errorCount: miladySandboxes.error_count, + createdAt: miladySandboxes.created_at, + updatedAt: miladySandboxes.updated_at, + }) + .from(miladySandboxes) + .orderBy(asc(miladySandboxes.created_at)), + ]); + + const sandboxesByNode = new Map(); + const unassignedSandboxRows = [] as typeof sandboxRows; + + for (const row of sandboxRows) { + if (!row.nodeId) { + unassignedSandboxRows.push(row); + continue; + } + + const existing = sandboxesByNode.get(row.nodeId) ?? []; + existing.push(row); + sandboxesByNode.set(row.nodeId, existing); + } + + const inspectedNodes = await Promise.all( + nodes.map(async (node) => { + const dbContainers = sandboxesByNode.get(node.node_id) ?? []; + const runtime = await inspectNodeRuntime(node); + const runtimeByName = new Map( + runtime.containers.map((container) => [container.name, container]), + ); + + const containers: AdminInfrastructureContainer[] = dbContainers.map((container) => { + const runtimeMatch = container.containerName + ? (runtimeByName.get(container.containerName) ?? null) + : null; + const health = classifyContainerHealth({ + dbStatus: container.status, + runtime: runtimeMatch, + lastHeartbeatAt: toIso(container.lastHeartbeatAt), + errorMessage: container.errorMessage, + }); + + return { + id: container.id, + sandboxId: container.sandboxId, + agentName: container.agentName, + organizationId: container.organizationId, + userId: container.userId, + nodeId: container.nodeId, + containerName: container.containerName, + dbStatus: container.status, + liveHealth: health.status, + liveHealthSeverity: health.severity, + liveHealthReason: health.reason, + runtimeState: runtimeMatch?.state ?? null, + runtimeStatus: runtimeMatch?.status ?? null, + runtimePresent: !!runtimeMatch, + dockerImage: container.dockerImage ?? runtimeMatch?.image ?? null, + bridgePort: container.bridgePort, + webUiPort: container.webUiPort, + headscaleIp: container.headscaleIp, + bridgeUrl: container.bridgeUrl, + healthUrl: container.healthUrl, + lastHeartbeatAt: toIso(container.lastHeartbeatAt), + heartbeatAgeMinutes: getHeartbeatAgeMinutes(toIso(container.lastHeartbeatAt)), + errorMessage: container.errorMessage, + errorCount: container.errorCount ?? 0, + createdAt: toIso(container.createdAt) ?? refreshedAt, + updatedAt: toIso(container.updatedAt) ?? refreshedAt, + }; + }); + + const trackedContainerNames = new Set( + containers + .map((container) => container.containerName) + .filter((value): value is string => Boolean(value)), + ); + + const ghostContainers = runtime.containers + .filter((container) => !trackedContainerNames.has(container.name)) + .map((container) => ({ + name: container.name, + state: container.state, + status: container.status, + })); + + const unhealthyContainerCount = containers.filter( + (container) => + container.liveHealth !== "healthy" && + container.liveHealth !== "warming" && + container.liveHealth !== "stopped", + ).length; + + const allocationDrift = runtime.reachable + ? runtime.actualContainerCount - node.allocated_count + : 0; + + return { + id: node.id, + nodeId: node.node_id, + hostname: node.hostname, + sshPort: node.ssh_port, + sshUser: node.ssh_user, + capacity: node.capacity, + allocatedCount: node.allocated_count, + availableSlots: Math.max(0, node.capacity - node.allocated_count), + enabled: node.enabled, + status: node.status, + lastHealthCheck: toIso(node.last_health_check), + utilizationPct: + node.capacity > 0 ? Math.round((node.allocated_count / node.capacity) * 100) : 0, + runtime, + allocationDrift, + alerts: buildNodeAlerts({ node, runtime, allocationDrift, unhealthyContainerCount }), + containers, + ghostContainers, + metadata: node.metadata, + createdAt: toIso(node.created_at) ?? refreshedAt, + updatedAt: toIso(node.updated_at) ?? refreshedAt, + } satisfies AdminInfrastructureNode; + }), + ); + + const unassignedContainers: AdminInfrastructureContainer[] = unassignedSandboxRows.map( + (container) => { + const health = classifyContainerHealth({ + dbStatus: container.status, + runtime: null, + lastHeartbeatAt: toIso(container.lastHeartbeatAt), + errorMessage: container.errorMessage, + }); + + return { + id: container.id, + sandboxId: container.sandboxId, + agentName: container.agentName, + organizationId: container.organizationId, + userId: container.userId, + nodeId: null, + containerName: container.containerName, + dbStatus: container.status, + liveHealth: health.status, + liveHealthSeverity: health.severity, + liveHealthReason: health.reason, + runtimeState: null, + runtimeStatus: null, + runtimePresent: false, + dockerImage: container.dockerImage, + bridgePort: container.bridgePort, + webUiPort: container.webUiPort, + headscaleIp: container.headscaleIp, + bridgeUrl: container.bridgeUrl, + healthUrl: container.healthUrl, + lastHeartbeatAt: toIso(container.lastHeartbeatAt), + heartbeatAgeMinutes: getHeartbeatAgeMinutes(toIso(container.lastHeartbeatAt)), + errorMessage: container.errorMessage, + errorCount: container.errorCount ?? 0, + createdAt: toIso(container.createdAt) ?? refreshedAt, + updatedAt: toIso(container.updatedAt) ?? refreshedAt, + }; + }, + ); + + const containers = [ + ...inspectedNodes.flatMap((node) => node.containers), + ...unassignedContainers, + ]; + const incidents: AdminInfrastructureIncident[] = []; + + for (const node of inspectedNodes) { + if (!node.enabled) { + incidents.push({ + severity: "info", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} disabled`, + detail: "Node is excluded from new allocations", + }); + } + + if (!node.runtime.reachable) { + incidents.push({ + severity: "critical", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} unreachable`, + detail: node.runtime.error || "Live SSH inspection failed", + }); + continue; + } + + if (node.utilizationPct >= NODE_SATURATION_CRITICAL_PCT) { + incidents.push({ + severity: "critical", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} at capacity`, + detail: `${node.allocatedCount}/${node.capacity} slots allocated`, + }); + } else if (node.utilizationPct >= NODE_SATURATION_WARNING_PCT) { + incidents.push({ + severity: "warning", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} nearing capacity`, + detail: `${node.allocatedCount}/${node.capacity} slots allocated`, + }); + } + + if (node.allocationDrift !== 0) { + incidents.push({ + severity: Math.abs(node.allocationDrift) >= 2 ? "critical" : "warning", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} allocation drift`, + detail: `Control plane differs from runtime by ${node.allocationDrift > 0 ? `+${node.allocationDrift}` : node.allocationDrift} container(s)`, + }); + } + + if ( + node.runtime.diskUsedPercent !== null && + node.runtime.diskUsedPercent >= NODE_RESOURCE_CRITICAL_PCT + ) { + incidents.push({ + severity: "critical", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} disk pressure`, + detail: `Disk usage at ${node.runtime.diskUsedPercent}%`, + }); + } + + if ( + node.runtime.memoryUsedPercent !== null && + node.runtime.memoryUsedPercent >= NODE_RESOURCE_CRITICAL_PCT + ) { + incidents.push({ + severity: "critical", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} memory pressure`, + detail: `Memory usage at ${node.runtime.memoryUsedPercent}%`, + }); + } + + for (const ghost of node.ghostContainers) { + incidents.push({ + severity: "warning", + scope: "node", + nodeId: node.nodeId, + title: `${node.nodeId} has ghost container`, + detail: `${ghost.name} is running on the node but not tracked in the control plane`, + }); + } + } + + for (const container of containers) { + if ( + container.liveHealth === "healthy" || + container.liveHealth === "warming" || + container.liveHealth === "stopped" + ) { + continue; + } + + incidents.push({ + severity: container.liveHealthSeverity, + scope: "container", + nodeId: container.nodeId ?? undefined, + containerId: container.id, + title: `${container.agentName || container.containerName || container.id.slice(0, 8)} ${container.liveHealth}`, + detail: container.liveHealthReason, + }); + } + + const enabledNodes = inspectedNodes.filter((node) => node.enabled); + const totalCapacity = enabledNodes.reduce((sum, node) => sum + node.capacity, 0); + const allocatedSlots = enabledNodes.reduce((sum, node) => sum + node.allocatedCount, 0); + const availableSlots = enabledNodes.reduce((sum, node) => sum + node.availableSlots, 0); + + const summary: AdminInfrastructureSummary = { + totalNodes: inspectedNodes.length, + enabledNodes: enabledNodes.length, + healthyNodes: enabledNodes.filter((node) => node.status === "healthy").length, + degradedNodes: enabledNodes.filter((node) => node.status === "degraded").length, + offlineNodes: enabledNodes.filter((node) => node.status === "offline").length, + unknownNodes: enabledNodes.filter((node) => node.status === "unknown").length, + totalCapacity, + allocatedSlots, + availableSlots, + utilizationPct: totalCapacity > 0 ? Math.round((allocatedSlots / totalCapacity) * 100) : 0, + saturatedNodes: enabledNodes.filter( + (node) => node.utilizationPct >= NODE_SATURATION_WARNING_PCT, + ).length, + nodesWithDrift: inspectedNodes.filter((node) => node.allocationDrift !== 0).length, + totalContainers: containers.length, + runningContainers: containers.filter((container) => container.dbStatus === "running").length, + pendingContainers: containers.filter((container) => container.dbStatus === "pending").length, + provisioningContainers: containers.filter((container) => container.dbStatus === "provisioning") + .length, + stoppedContainers: containers.filter((container) => container.dbStatus === "stopped").length, + errorContainers: containers.filter((container) => container.dbStatus === "error").length, + disconnectedContainers: containers.filter((container) => container.dbStatus === "disconnected") + .length, + healthyContainers: containers.filter((container) => container.liveHealth === "healthy").length, + attentionContainers: containers.filter( + (container) => + container.liveHealth !== "healthy" && + container.liveHealth !== "warming" && + container.liveHealth !== "stopped", + ).length, + staleContainers: containers.filter((container) => container.liveHealth === "stale").length, + missingContainers: containers.filter((container) => container.liveHealth === "missing").length, + failedContainers: containers.filter((container) => container.liveHealth === "failed").length, + backlogCount: containers.filter( + (container) => container.dbStatus === "pending" || container.dbStatus === "provisioning", + ).length, + }; + + if (summary.backlogCount > summary.availableSlots && summary.availableSlots >= 0) { + incidents.push({ + severity: "warning", + scope: "cluster", + title: "Provisioning backlog exceeds free capacity", + detail: `${summary.backlogCount} containers are waiting or provisioning with ${summary.availableSlots} slots free`, + }); + } + + if (enabledNodes.length === 0 && summary.totalNodes > 0) { + incidents.push({ + severity: "critical", + scope: "cluster", + title: "No enabled Docker nodes available", + detail: + "Provisioning capacity is unavailable until at least one node is enabled for allocations", + }); + } else if (summary.healthyNodes === 0 && summary.totalNodes > 0) { + incidents.push({ + severity: "critical", + scope: "cluster", + title: "No healthy Docker nodes available", + detail: "Provisioning capacity is effectively unavailable until a node recovers", + }); + } + + return { + refreshedAt, + summary, + incidents: incidents.sort(sortIncidents), + nodes: inspectedNodes, + containers: containers.sort((a, b) => { + const severityWeight: Record = { + critical: 0, + warning: 1, + info: 2, + }; + + return ( + severityWeight[a.liveHealthSeverity] - severityWeight[b.liveHealthSeverity] || + a.createdAt.localeCompare(b.createdAt) + ); + }), + }; +} diff --git a/packages/tests/unit/admin-infrastructure.test.ts b/packages/tests/unit/admin-infrastructure.test.ts new file mode 100644 index 000000000..a61881c2f --- /dev/null +++ b/packages/tests/unit/admin-infrastructure.test.ts @@ -0,0 +1,90 @@ +import { describe, expect, test } from "bun:test"; +import { classifyContainerHealth } from "../../lib/services/admin-infrastructure"; + +describe("classifyContainerHealth", () => { + test("marks runtime-unhealthy containers as failed", () => { + const result = classifyContainerHealth({ + dbStatus: "running", + runtime: { + name: "milady-test", + id: "abc123", + image: "milady/agent:cloud-full-ui", + state: "running", + status: "Up 3m (unhealthy)", + runningFor: "3 minutes", + health: "unhealthy", + }, + lastHeartbeatAt: new Date().toISOString(), + errorMessage: null, + }); + + expect(result.status).toBe("failed"); + expect(result.severity).toBe("critical"); + expect(result.reason).toContain("unhealthy"); + }); + + test("marks missing runtime containers as missing when control plane expects them", () => { + const result = classifyContainerHealth({ + dbStatus: "running", + runtime: null, + lastHeartbeatAt: new Date().toISOString(), + errorMessage: null, + }); + + expect(result.status).toBe("missing"); + expect(result.severity).toBe("critical"); + }); + + test("treats provisioning records without runtime as warming", () => { + const result = classifyContainerHealth({ + dbStatus: "provisioning", + runtime: null, + lastHeartbeatAt: null, + errorMessage: null, + }); + + expect(result.status).toBe("warming"); + expect(result.severity).toBe("info"); + }); + + test("marks old heartbeats as stale even if runtime is running", () => { + const result = classifyContainerHealth({ + dbStatus: "running", + runtime: { + name: "milady-test", + id: "abc123", + image: "milady/agent:cloud-full-ui", + state: "running", + status: "Up 2h", + runningFor: "2 hours", + health: "healthy", + }, + lastHeartbeatAt: new Date(Date.now() - 20 * 60_000).toISOString(), + errorMessage: null, + }); + + expect(result.status).toBe("stale"); + expect(result.severity).toBe("critical"); + expect(result.reason).toContain("Heartbeat"); + }); + + test("accepts healthy running containers with fresh heartbeat", () => { + const result = classifyContainerHealth({ + dbStatus: "running", + runtime: { + name: "milady-test", + id: "abc123", + image: "milady/agent:cloud-full-ui", + state: "running", + status: "Up 10m (healthy)", + runningFor: "10 minutes", + health: "healthy", + }, + lastHeartbeatAt: new Date(Date.now() - 2 * 60_000).toISOString(), + errorMessage: null, + }); + + expect(result.status).toBe("healthy"); + expect(result.severity).toBe("info"); + }); +}); From 03ed530a170d282b2b7d1e609bbe85438e8fc83b Mon Sep 17 00:00:00 2001 From: Sol Date: Mon, 16 Mar 2026 08:24:53 +0000 Subject: [PATCH 03/30] =?UTF-8?q?fix:=20table=20reactivity=20=E2=80=94=20o?= =?UTF-8?q?ptimistic=20updates,=20client-side=20data=20refresh?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Major rewrite of milady-sandboxes-table: local state management, optimistic updates, no more stale data after start/stop/destroy actions - Added onDataRefresh callback to useSandboxListPoll hook for live table updates - Create dialog improvements with onCreated callback to trigger table refresh - Eliminates the stale-data-after-action problem that required manual page reload --- packages/lib/hooks/use-sandbox-status-poll.ts | 37 +- .../create-milady-sandbox-dialog.tsx | 140 ++--- .../containers/milady-sandboxes-table.tsx | 532 ++++++++++++------ 3 files changed, 470 insertions(+), 239 deletions(-) diff --git a/packages/lib/hooks/use-sandbox-status-poll.ts b/packages/lib/hooks/use-sandbox-status-poll.ts index 78ef32436..9aefab8e6 100644 --- a/packages/lib/hooks/use-sandbox-status-poll.ts +++ b/packages/lib/hooks/use-sandbox-status-poll.ts @@ -118,27 +118,50 @@ export function useSandboxStatusPoll( return result; } +/** Raw agent shape returned by the list endpoint (camelCase). */ +export interface SandboxListAgent { + id: string; + status: string; + agentName?: string; + agent_name?: string; + databaseStatus?: string; + errorMessage?: string; + lastHeartbeatAt?: string | null; + createdAt?: string; + updatedAt?: string; + [key: string]: unknown; +} + /** * Polls the agent list endpoint while any sandbox is in an active state. - * Returns true when any sandbox transitions to 'running'. + * Fires `onTransitionToRunning` on status transitions and pushes the full + * agent list via `onDataRefresh` so the parent can update local state without + * a full page reload. */ export function useSandboxListPoll( sandboxes: Array<{ id: string; status: string }>, options: { intervalMs?: number; onTransitionToRunning?: (agentId: string, agentName?: string) => void; + /** Called on every successful poll with the full agent list from the API. */ + onDataRefresh?: (agents: SandboxListAgent[]) => void; } = {}, ) { - const { intervalMs = 10_000, onTransitionToRunning } = options; + const { intervalMs = 10_000, onTransitionToRunning, onDataRefresh } = options; const [isPolling, setIsPolling] = useState(false); const previousStatusesRef = useRef>(new Map()); const callbackRef = useRef(onTransitionToRunning); + const dataRefreshRef = useRef(onDataRefresh); const intervalRef = useRef | null>(null); useEffect(() => { callbackRef.current = onTransitionToRunning; }, [onTransitionToRunning]); + useEffect(() => { + dataRefreshRef.current = onDataRefresh; + }, [onDataRefresh]); + // Sync current sandbox statuses useEffect(() => { const statusMap = new Map(); @@ -148,9 +171,7 @@ export function useSandboxListPoll( previousStatusesRef.current = statusMap; }, [sandboxes]); - const hasActiveAgents = sandboxes.some((sb) => - ACTIVE_STATES.has(sb.status as SandboxStatus), - ); + const hasActiveAgents = sandboxes.some((sb) => ACTIVE_STATES.has(sb.status as SandboxStatus)); useEffect(() => { if (!hasActiveAgents) { @@ -173,8 +194,10 @@ export function useSandboxListPoll( if (cancelled || !res.ok) return; const json = await res.json(); - const agents: Array<{ id: string; status: string; agentName?: string; agent_name?: string }> = - json?.data ?? []; + const agents: SandboxListAgent[] = json?.data ?? []; + + // Push full list to parent for local state merge + dataRefreshRef.current?.(agents); for (const agent of agents) { const prevStatus = previousStatusesRef.current.get(agent.id); diff --git a/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx b/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx index 1ffde7232..30f666d68 100644 --- a/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx +++ b/packages/ui/src/components/containers/create-milady-sandbox-dialog.tsx @@ -19,12 +19,11 @@ import { } from "@elizaos/cloud-ui"; import { Check, ExternalLink, Loader2, Plus, RotateCcw, X } from "lucide-react"; import { useRouter } from "next/navigation"; -import { type ReactNode, useCallback, useEffect, useRef, useState } from "react"; +import { type ReactNode, useEffect, useState } from "react"; import { toast } from "sonner"; import { AGENT_FLAVORS, getDefaultFlavor, getFlavorById } from "@/lib/constants/agent-flavors"; -import { useSandboxStatusPoll, type SandboxStatus } from "@/lib/hooks/use-sandbox-status-poll"; -import { getClientSafeMiladyAgentWebUiUrl } from "@/lib/milady-web-ui"; import { openWebUIWithPairing } from "@/lib/hooks/open-web-ui"; +import { type SandboxStatus, useSandboxStatusPoll } from "@/lib/hooks/use-sandbox-status-poll"; // ---------------------------------------------------------------- // Provisioning Steps @@ -51,11 +50,7 @@ function getActiveStepIndex(status: SandboxStatus): number { type StepState = "complete" | "active" | "pending" | "error"; -function getStepState( - stepIndex: number, - activeIndex: number, - hasError: boolean, -): StepState { +function getStepState(stepIndex: number, activeIndex: number, hasError: boolean): StepState { if (hasError && stepIndex === activeIndex) return "error"; if (stepIndex < activeIndex) return "complete"; if (stepIndex === activeIndex) return "active"; @@ -67,31 +62,34 @@ function getStepState( // ---------------------------------------------------------------- function StepIndicator({ state }: { state: StepState }) { + const base = "flex h-6 w-6 shrink-0 items-center justify-center"; + switch (state) { case "complete": return ( -
- +
+
); case "active": return ( -
- - +
+
); case "error": return ( -
- +
+
); case "pending": default: return ( -
- +
+
); } @@ -124,19 +122,19 @@ function ProvisioningProgress({
{/* Header */}
-

+

{isComplete - ? "Your agent is ready!" + ? "Your agent is ready" : hasError ? "Something went wrong" - : "Setting up your agent..."} + : "Setting up your agent…"}

{!isComplete && !hasError && ( - + {elapsedSec < 60 ? `${elapsedSec}s` : `${Math.floor(elapsedSec / 60)}m ${elapsedSec % 60}s`} - {" · usually ~90s"} + {" · ~90s"} )}
@@ -148,34 +146,34 @@ function ProvisioningProgress({ const isLast = i === PROVISIONING_STEPS.length - 1; return (
- {/* Vertical connector line */} + {/* Vertical connector */} {!isLast && (
)} -
+

{step.label} @@ -186,11 +184,12 @@ function ProvisioningProgress({ })}

- {/* Error message */} + {/* Error detail */} {hasError && error && ( -
+

{error}

)} - {/* Actions */} + {/* Footer actions */}
{isComplete ? ( <> - openWebUIWithPairing(agentId)} - > + openWebUIWithPairing(agentId)}> Open Web UI @@ -217,7 +213,7 @@ function ProvisioningProgress({ ) : ( - {hasError ? "Close" : "Close — provisioning continues in background"} + {hasError ? "Close" : "Close — continues in background"} )}
@@ -232,6 +228,8 @@ function ProvisioningProgress({ interface CreateMiladySandboxDialogProps { trigger?: ReactNode; onProvisionQueued?: (agentId: string, jobId: string) => void; + /** Called after a sandbox is successfully created so the parent can refresh. */ + onCreated?: () => void | Promise; } type CreatePhase = "form" | "creating" | "provisioning"; @@ -239,6 +237,7 @@ type CreatePhase = "form" | "creating" | "provisioning"; export function CreateMiladySandboxDialog({ trigger, onProvisionQueued, + onCreated, }: CreateMiladySandboxDialogProps) { const router = useRouter(); const [open, setOpen] = useState(false); @@ -259,10 +258,10 @@ export function CreateMiladySandboxDialog({ const resolvedDockerImage = isCustom ? customImage.trim() : selectedFlavor?.dockerImage; // Poll the agent status while in provisioning phase - const pollResult = useSandboxStatusPoll( - isProvisioningPhase ? createdAgentId : null, - { intervalMs: 5_000, enabled: isProvisioningPhase }, - ); + const pollResult = useSandboxStatusPoll(isProvisioningPhase ? createdAgentId : null, { + intervalMs: 5_000, + enabled: isProvisioningPhase, + }); // Elapsed time counter useEffect(() => { @@ -299,6 +298,8 @@ export function CreateMiladySandboxDialog({ setOpen(false); // Delay reset so the closing animation finishes setTimeout(resetForm, 300); + // Notify parent to refresh its data (client-side) + void onCreated?.(); router.refresh(); } @@ -391,7 +392,7 @@ export function CreateMiladySandboxDialog({ if (jobId) { onProvisionQueued?.(createdAgentId, jobId); } - toast.info("Retrying provisioning..."); + toast.info("Retrying provisioning…"); } else if (!res.ok) { toast.error((data as { error?: string }).error ?? "Retry failed"); } @@ -422,11 +423,11 @@ export function CreateMiladySandboxDialog({ - {isProvisioningPhase ? "Launching Agent" : "Create Milady Sandbox"} + {isProvisioningPhase ? "Launching Agent" : "Create Sandbox"} {!isProvisioningPhase && ( - - Create a new agent sandbox and optionally start provisioning right away. + + Create an agent sandbox and optionally start provisioning right away. )} @@ -443,8 +444,9 @@ export function CreateMiladySandboxDialog({ ) : ( <>
-
-