From 8ba7547227596a081e80a13c8981fbe378f62baa Mon Sep 17 00:00:00 2001 From: Che <30403707+Che-Zhu@users.noreply.github.com> Date: Wed, 11 Mar 2026 17:19:41 +0800 Subject: [PATCH 1/3] refactor: add project task platform --- .../(list)/_components/home-page-content.tsx | 2 +- .../(list)/_components/project-list.tsx | 2 +- app/(dashboard)/projects/(list)/page.tsx | 4 +- .../_components/import-github-dialog.tsx | 9 +- app/api/projects/[id]/route.ts | 6 +- lib/actions/project.ts | 27 ++- lib/data/project.ts | 3 + lib/events/sandbox/sandboxListener.ts | 6 +- lib/jobs/project-import/index.ts | 2 - .../project-import/projectImportReconcile.ts | 229 ------------------ .../executors/clone-repository.ts | 95 ++++++++ lib/jobs/project-task/executors/index.ts | 22 ++ lib/jobs/project-task/index.ts | 2 + lib/jobs/project-task/projectTaskReconcile.ts | 159 ++++++++++++ lib/repo/project-import.ts | 150 ------------ lib/repo/project-task.ts | 206 ++++++++++++++++ lib/services/project-task-dispatcher.ts | 59 +++++ lib/startup/index.ts | 24 +- lib/util/project-display-status.ts | 22 +- .../migration.sql | 53 ++++ prisma/migrations/migration_lock.toml | 1 + prisma/schema.prisma | 89 +++++-- 22 files changed, 731 insertions(+), 441 deletions(-) delete mode 100644 lib/jobs/project-import/index.ts delete mode 100644 lib/jobs/project-import/projectImportReconcile.ts create mode 100644 lib/jobs/project-task/executors/clone-repository.ts create mode 100644 lib/jobs/project-task/executors/index.ts create mode 100644 lib/jobs/project-task/index.ts create mode 100644 lib/jobs/project-task/projectTaskReconcile.ts delete mode 100644 lib/repo/project-import.ts create mode 100644 lib/repo/project-task.ts create mode 100644 lib/services/project-task-dispatcher.ts create mode 100644 prisma/migrations/20260311171810_project_task_platform/migration.sql create mode 100644 prisma/migrations/migration_lock.toml diff --git a/app/(dashboard)/projects/(list)/_components/home-page-content.tsx 
b/app/(dashboard)/projects/(list)/_components/home-page-content.tsx index ffe161c..40501f6 100644 --- a/app/(dashboard)/projects/(list)/_components/home-page-content.tsx +++ b/app/(dashboard)/projects/(list)/_components/home-page-content.tsx @@ -15,7 +15,7 @@ import { ProjectList } from './project-list' const REFRESH_INTERVAL_MS = 3000 interface HomePageContentProps { - projects: ProjectWithRelations<{ sandboxes: true }>[] + projects: ProjectWithRelations<{ sandboxes: true; tasks: true }>[] } export function HomePageContent({ projects }: HomePageContentProps) { diff --git a/app/(dashboard)/projects/(list)/_components/project-list.tsx b/app/(dashboard)/projects/(list)/_components/project-list.tsx index b08b605..82d95dd 100644 --- a/app/(dashboard)/projects/(list)/_components/project-list.tsx +++ b/app/(dashboard)/projects/(list)/_components/project-list.tsx @@ -9,7 +9,7 @@ import { CreateProjectCard } from './create-project-card' import { ProjectCard } from './project-card' interface ProjectListProps { - projects: ProjectWithRelations<{ sandboxes: true }>[] + projects: ProjectWithRelations<{ sandboxes: true; tasks: true }>[] activeFilter: 'ALL' | ProjectDisplayStatus } diff --git a/app/(dashboard)/projects/(list)/page.tsx b/app/(dashboard)/projects/(list)/page.tsx index 19ab469..430cbb9 100644 --- a/app/(dashboard)/projects/(list)/page.tsx +++ b/app/(dashboard)/projects/(list)/page.tsx @@ -17,7 +17,7 @@ export default async function HomePage() { redirect('/login') } - const projects = await getProjects(session.user.id, { sandboxes: true }) + const projects = await getProjects(session.user.id, { sandboxes: true, tasks: true }) return (
@@ -26,4 +26,4 @@ export default async function HomePage() {
) -} \ No newline at end of file +} diff --git a/app/(dashboard)/projects/_components/import-github-dialog.tsx b/app/(dashboard)/projects/_components/import-github-dialog.tsx index a4f2586..6c696a9 100644 --- a/app/(dashboard)/projects/_components/import-github-dialog.tsx +++ b/app/(dashboard)/projects/_components/import-github-dialog.tsx @@ -3,7 +3,7 @@ import { useCallback, useEffect, useState } from 'react' import { FaGithub } from 'react-icons/fa' import { MdRefresh } from 'react-icons/md' -import type { ProjectImportStatus } from '@prisma/client' +import type { ProjectTaskStatus, ProjectTaskType } from '@prisma/client' import { useRouter } from 'next/navigation' import { toast } from 'sonner' @@ -97,11 +97,12 @@ export function ImportGitHubDialog({ open, onOpenChange }: ImportGitHubDialogPro const pollImportStatus = async () => { try { - const project = await GET<{ importStatus: ProjectImportStatus }>( + const project = await GET<{ tasks: Array<{ type: ProjectTaskType; status: ProjectTaskStatus }> }>( `/api/projects/${importProjectId}` ) - if (project.importStatus === 'READY') { + const cloneTask = project.tasks.find((task) => task.type === 'CLONE_REPOSITORY') + if (!cloneTask || cloneTask.status === 'SUCCEEDED') { toast.success('Repository imported successfully') onOpenChange(false) setImportProjectId(null) @@ -109,7 +110,7 @@ export function ImportGitHubDialog({ open, onOpenChange }: ImportGitHubDialogPro return } - if (project.importStatus === 'FAILED') { + if (cloneTask.status === 'FAILED' || cloneTask.status === 'CANCELLED') { toast.error('Repository import failed. 
An empty project was created instead.') onOpenChange(false) setImportProjectId(null) diff --git a/app/api/projects/[id]/route.ts b/app/api/projects/[id]/route.ts index 832b645..6b68ff7 100644 --- a/app/api/projects/[id]/route.ts +++ b/app/api/projects/[id]/route.ts @@ -9,6 +9,7 @@ type ProjectWithFullRelations = Prisma.ProjectGetPayload<{ sandboxes: true databases: true environments: true + tasks: true } }> @@ -39,6 +40,9 @@ export const GET = withAuth(async (_req, context, session) = environments: { orderBy: { createdAt: 'asc' }, }, + tasks: { + orderBy: { createdAt: 'desc' }, + }, }, }) @@ -51,4 +55,4 @@ export const GET = withAuth(async (_req, context, session) = console.error('Error fetching project:', error) return NextResponse.json({ error: 'Failed to fetch project details' }, { status: 500 }) } -}) \ No newline at end of file +}) diff --git a/lib/actions/project.ts b/lib/actions/project.ts index 3b85013..12bca0f 100644 --- a/lib/actions/project.ts +++ b/lib/actions/project.ts @@ -7,7 +7,7 @@ * instead of API Routes directly. 
*/ -import type { Project, ProjectImportStatus } from '@prisma/client' +import type { Project } from '@prisma/client' import { auth } from '@/lib/auth' import { EnvironmentCategory } from '@/lib/const' @@ -17,6 +17,7 @@ import { KubernetesUtils } from '@/lib/k8s/kubernetes-utils' import { VERSIONS } from '@/lib/k8s/versions' import { logger as baseLogger } from '@/lib/logger' import { getInstallationByGitHubId } from '@/lib/repo/github' +import { createProjectTask } from '@/lib/repo/project-task' import { listInstallationRepos } from '@/lib/services/github-app' import { generateRandomString } from '@/lib/util/common' @@ -61,8 +62,8 @@ type CreateProjectWithSandboxOptions = { name: string description?: string importData?: { - importStatus: ProjectImportStatus githubAppInstallationId: string + installationId: number githubRepoId: number githubRepoFullName: string githubRepoDefaultBranch?: string @@ -112,13 +113,10 @@ async function createProjectWithSandbox({ description, userId, status: 'CREATING', - importStatus: importData?.importStatus ?? 
'READY', githubAppInstallationId: importData?.githubAppInstallationId, githubRepoId: importData?.githubRepoId, githubRepoFullName: importData?.githubRepoFullName, githubRepoDefaultBranch: importData?.githubRepoDefaultBranch, - importError: null, - importLockedUntil: null, }, }) @@ -168,6 +166,23 @@ async function createProjectWithSandbox({ }, }) + if (importData?.githubRepoDefaultBranch) { + await createProjectTask(tx, { + projectId: project.id, + sandboxId: sandbox.id, + type: 'CLONE_REPOSITORY', + status: 'WAITING_FOR_PREREQUISITES', + triggerSource: 'USER_ACTION', + payload: { + installationId: importData.installationId, + repoId: importData.githubRepoId, + repoFullName: importData.githubRepoFullName, + defaultBranch: importData.githubRepoDefaultBranch, + }, + maxAttempts: 3, + }) + } + return { project, sandbox } }, { @@ -263,8 +278,8 @@ export async function importProjectFromGitHub( name: payload.repoName, description: payload.description, importData: { - importStatus: 'PENDING', githubAppInstallationId: installation.id, + installationId: installation.installationId, githubRepoId: payload.repoId, githubRepoFullName: payload.repoFullName, githubRepoDefaultBranch: payload.defaultBranch, diff --git a/lib/data/project.ts b/lib/data/project.ts index 4a7b70a..c7bca58 100644 --- a/lib/data/project.ts +++ b/lib/data/project.ts @@ -11,6 +11,7 @@ export type ProjectInclude = { sandboxes?: boolean; databases?: boolean; environments?: boolean; + tasks?: boolean; }; /** @@ -33,6 +34,7 @@ export const getProject = cache(async function getProject( sandboxes: false, databases: false, environments: false, + tasks: false, ...include, }; @@ -68,6 +70,7 @@ export const getProjects = cache(async function getProjects( sandboxes: false, databases: false, environments: false, + tasks: false, ...include, }; diff --git a/lib/events/sandbox/sandboxListener.ts b/lib/events/sandbox/sandboxListener.ts index 269ce86..3606187 100644 --- a/lib/events/sandbox/sandboxListener.ts +++ 
b/lib/events/sandbox/sandboxListener.ts @@ -1,4 +1,4 @@ -import { triggerProjectImportForProject } from '@/lib/jobs/project-import' +import { triggerRunnableTasksForProject } from '@/lib/jobs/project-task' import { getK8sServiceForUser } from '@/lib/k8s/k8s-service-helper' import { logger as baseLogger } from '@/lib/logger' import { getProjectEnvironments } from '@/lib/repo/environment' @@ -355,8 +355,8 @@ registerSandboxListeners() async function triggerImportOnSandboxRunning(projectId: string): Promise { try { - await triggerProjectImportForProject(projectId) + await triggerRunnableTasksForProject(projectId) } catch (error) { - logger.error(`Failed to trigger project import for ${projectId}: ${error}`) + logger.error(`Failed to trigger project tasks for ${projectId}: ${error}`) } } diff --git a/lib/jobs/project-import/index.ts b/lib/jobs/project-import/index.ts deleted file mode 100644 index c0cc926..0000000 --- a/lib/jobs/project-import/index.ts +++ /dev/null @@ -1,2 +0,0 @@ -export { startProjectImportReconcileJob } from './projectImportReconcile' -export { triggerProjectImportForProject } from './projectImportReconcile' diff --git a/lib/jobs/project-import/projectImportReconcile.ts b/lib/jobs/project-import/projectImportReconcile.ts deleted file mode 100644 index c67ac3a..0000000 --- a/lib/jobs/project-import/projectImportReconcile.ts +++ /dev/null @@ -1,229 +0,0 @@ -import { Cron } from 'croner' - -import { logger as baseLogger } from '@/lib/logger' -import { - acquireAndLockImportProjects, - getImportProjectById, - type ImportProjectWithRelations, - LOCK_DURATION_SECONDS, - setProjectImportState, - tryClaimImportExecutionLock, -} from '@/lib/repo/project-import' -import { getInstallationToken } from '@/lib/services/github-app' -import { getSandboxTtydContext } from '@/lib/util/ttyd-context' -import { execCommand } from '@/lib/util/ttyd-exec' - -const logger = baseLogger.child({ module: 'lib/jobs/project-import/projectImportReconcile' }) - -const 
MAX_IMPORTS_PER_CYCLE = parseInt(process.env.MAX_PROJECT_IMPORTS_PER_RECONCILE || '5', 10) -const RECONCILE_INTERVAL_SECONDS = parseInt( - process.env.PROJECT_IMPORT_RECONCILE_INTERVAL_SECONDS || '10', - 10 -) -const CLONING_LOCK_DURATION_SECONDS = 300 -const CLONE_MAX_ATTEMPTS = parseInt(process.env.PROJECT_IMPORT_CLONE_MAX_ATTEMPTS || '3', 10) -const IMPORT_EXEC_TIMEOUT_MS = parseInt(process.env.PROJECT_IMPORT_EXEC_TIMEOUT_MS || '90000', 10) - -type ImportTriggerSource = 'reconcile' | 'sandbox-running-event' - -export function startProjectImportReconcileJob() { - logger.info('Starting project import reconcile job') - logger.info(`Lock duration: ${LOCK_DURATION_SECONDS} seconds`) - logger.info(`Max projects per cycle: ${MAX_IMPORTS_PER_CYCLE}`) - logger.info(`Reconcile interval: ${RECONCILE_INTERVAL_SECONDS} seconds`) - - const job = new Cron(`*/${RECONCILE_INTERVAL_SECONDS} * * * * *`, async () => { - try { - await reconcileProjectImports() - } catch (error) { - logger.error(`Project import reconcile job error: ${error}`) - } - }) - - logger.info( - `✅ Project import reconcile job started (every ${RECONCILE_INTERVAL_SECONDS} seconds)` - ) - return job -} - -async function reconcileProjectImports() { - const projects = await acquireAndLockImportProjects(MAX_IMPORTS_PER_CYCLE) - - if (projects.length === 0) { - return - } - - logger.info(`Acquired ${projects.length} projects for import reconcile`) - - for (const project of projects) { - try { - await handleSingleProjectImport(project, 'reconcile') - } catch (error) { - logger.error(`Failed to process project import for ${project.id}: ${error}`) - await setProjectImportState(project.id, { - importStatus: 'FAILED', - importError: String(error), - importLockedUntil: null, - }) - } - } -} - -export async function triggerProjectImportForProject(projectId: string): Promise { - const project = await getImportProjectById(projectId) - if (!project) { - return - } - - if (project.importStatus !== 'PENDING' && 
project.importStatus !== 'CLONING') { - return - } - - const sandbox = project.sandboxes[0] - if (!sandbox || sandbox.status !== 'RUNNING') { - return - } - - const claimed = await tryClaimImportExecutionLock(project.id, CLONING_LOCK_DURATION_SECONDS) - if (!claimed) { - return - } - - const projectAfterLock = await getImportProjectById(project.id) - if (!projectAfterLock) { - return - } - - await handleSingleProjectImport(projectAfterLock, 'sandbox-running-event') -} - -async function handleSingleProjectImport( - project: ImportProjectWithRelations, - source: ImportTriggerSource -): Promise { - if (!project.githubRepoFullName || !project.githubAppInstallation || !project.githubRepoDefaultBranch) { - await setProjectImportState(project.id, { - importStatus: 'FAILED', - importError: 'Missing repository metadata for import', - importLockedUntil: null, - }) - return - } - - const sandbox = project.sandboxes[0] - if (!sandbox) { - await setProjectImportState(project.id, { - importStatus: 'FAILED', - importError: 'Sandbox not found', - importLockedUntil: null, - }) - return - } - - if (sandbox.status !== 'RUNNING') { - if (project.importStatus === 'CLONING') { - await setProjectImportState(project.id, { - importStatus: 'PENDING', - importLockedUntil: null, - }) - } - return - } - - if (source === 'reconcile') { - await setProjectImportState(project.id, { - importStatus: 'CLONING', - importError: null, - importLockedUntil: new Date(Date.now() + CLONING_LOCK_DURATION_SECONDS * 1000), - }) - } - - const cloneResult = await cloneRepoToSandboxWithRetry(project, sandbox.id) - if (cloneResult.success) { - await setProjectImportState(project.id, { - importStatus: 'READY', - importError: null, - importLockedUntil: null, - }) - return - } - - await setProjectImportState(project.id, { - importStatus: 'FAILED', - importError: cloneResult.error, - importLockedUntil: null, - }) -} - -async function cloneRepoToSandboxWithRetry( - project: ImportProjectWithRelations, - sandboxId: 
string -): Promise<{ success: true } | { success: false; error: string }> { - let lastError = 'Unknown clone error' - - for (let attempt = 1; attempt <= CLONE_MAX_ATTEMPTS; attempt++) { - const attemptStart = Date.now() - try { - logger.info(`Import clone attempt ${attempt}/${CLONE_MAX_ATTEMPTS} for project ${project.id}`) - await cloneRepoToSandbox(project, sandboxId) - logger.info( - `Import clone succeeded on attempt ${attempt} for project ${project.id} in ${Date.now() - attemptStart}ms` - ) - return { success: true } - } catch (error) { - lastError = error instanceof Error ? error.message : String(error) - logger.warn( - `Import clone attempt ${attempt}/${CLONE_MAX_ATTEMPTS} failed for project ${project.id} after ${Date.now() - attemptStart}ms: ${lastError}` - ) - if (attempt < CLONE_MAX_ATTEMPTS) { - await sleep(1500) - } - } - } - - return { - success: false, - error: `Clone failed after ${CLONE_MAX_ATTEMPTS} attempts: ${lastError}`, - } -} - -async function cloneRepoToSandbox(project: ImportProjectWithRelations, sandboxId: string): Promise { - const installationToken = await getInstallationToken(project.githubAppInstallation!.installationId) - const { ttyd } = await getSandboxTtydContext(sandboxId, project.user.id) - - const authUrl = `https://x-access-token:${installationToken}@github.com/${project.githubRepoFullName}.git` - const escapedAuthUrl = shellEscapeSingleQuoted(authUrl) - const escapedBranch = shellEscapeSingleQuoted(project.githubRepoDefaultBranch!) - const repoFullName = project.githubRepoFullName! 
- const repoName = repoFullName.split('/').at(-1) || 'repo' - const importDirName = repoName.replace(/[^a-zA-Z0-9._-]/g, '-') - const uniqueImportDirName = `${importDirName}-${project.id}` - const escapedImportDirName = shellEscapeSingleQuoted(uniqueImportDirName) - - const cloneCommand = [ - 'set -e', - 'mkdir -p import', - `target_dir='import/${escapedImportDirName}'`, - 'tmp_dir=$(mktemp -d)', - `GIT_TERMINAL_PROMPT=0 GIT_ASKPASS=/bin/echo git clone --depth 1 --branch '${escapedBranch}' '${escapedAuthUrl}' "$tmp_dir/repo"`, - 'mkdir -p "$target_dir"', - 'cp -a "$tmp_dir/repo"/. "$target_dir"/', - 'rm -rf "$tmp_dir"', - ].join(' && ') - - await execCommand( - ttyd.baseUrl, - ttyd.accessToken, - cloneCommand, - IMPORT_EXEC_TIMEOUT_MS, - ttyd.authorization - ) -} - -function shellEscapeSingleQuoted(input: string): string { - return input.replace(/'/g, `'\\''`) -} - -function sleep(ms: number): Promise { - return new Promise((resolve) => setTimeout(resolve, ms)) -} diff --git a/lib/jobs/project-task/executors/clone-repository.ts b/lib/jobs/project-task/executors/clone-repository.ts new file mode 100644 index 0000000..d8b69d1 --- /dev/null +++ b/lib/jobs/project-task/executors/clone-repository.ts @@ -0,0 +1,95 @@ +import type { Prisma } from '@prisma/client' + +import type { ProjectTaskWithRelations } from '@/lib/repo/project-task' +import { getInstallationToken } from '@/lib/services/github-app' +import { getSandboxTtydContext } from '@/lib/util/ttyd-context' +import { execCommand } from '@/lib/util/ttyd-exec' + +export type ProjectTaskExecutorResult = + | { success: true; result?: Prisma.InputJsonValue } + | { success: false; error: string; retryable: boolean } + +const IMPORT_EXEC_TIMEOUT_MS = parseInt(process.env.PROJECT_IMPORT_EXEC_TIMEOUT_MS || '90000', 10) + +type CloneRepositoryPayload = { + installationId?: number + repoFullName?: string + defaultBranch?: string +} + +export async function runCloneRepositoryTask( + task: ProjectTaskWithRelations +): Promise 
{ + const payload = (task.payload ?? {}) as CloneRepositoryPayload + const installationId = payload.installationId + const repoFullName = payload.repoFullName + const defaultBranch = payload.defaultBranch + + if (!installationId || !repoFullName || !defaultBranch) { + return { + success: false, + error: 'Missing clone repository payload', + retryable: false, + } + } + + if (!task.sandbox?.id) { + return { + success: false, + error: 'Sandbox not found for clone task', + retryable: false, + } + } + + try { + const installationToken = await getInstallationToken(installationId) + const { ttyd } = await getSandboxTtydContext(task.sandbox.id, task.project.user.id) + + const authUrl = `https://x-access-token:${installationToken}@github.com/${repoFullName}.git` + const escapedAuthUrl = shellEscapeSingleQuoted(authUrl) + const escapedBranch = shellEscapeSingleQuoted(defaultBranch) + const repoName = repoFullName.split('/').at(-1) || 'repo' + const importDirName = repoName.replace(/[^a-zA-Z0-9._-]/g, '-') + const uniqueImportDirName = `${importDirName}-${task.projectId}` + const escapedImportDirName = shellEscapeSingleQuoted(uniqueImportDirName) + + const cloneCommand = [ + 'set -e', + 'mkdir -p import', + `target_dir='import/${escapedImportDirName}'`, + 'tmp_dir=$(mktemp -d)', + `rm -rf "$target_dir"`, + `GIT_TERMINAL_PROMPT=0 GIT_ASKPASS=/bin/echo git clone --depth 1 --branch '${escapedBranch}' '${escapedAuthUrl}' "$tmp_dir/repo"`, + 'mkdir -p "$target_dir"', + 'cp -a "$tmp_dir/repo"/. "$target_dir"/', + 'rm -rf "$tmp_dir"', + ].join(' && ') + + await execCommand( + ttyd.baseUrl, + ttyd.accessToken, + cloneCommand, + IMPORT_EXEC_TIMEOUT_MS, + ttyd.authorization + ) + + return { + success: true, + result: { + repoFullName, + defaultBranch, + importPath: `import/${uniqueImportDirName}`, + }, + } + } catch (error) { + return { + success: false, + error: error instanceof Error ? 
error.message : String(error), + retryable: true, + } + } +} + +function shellEscapeSingleQuoted(input: string): string { + return input.replace(/'/g, `'\\''`) +} diff --git a/lib/jobs/project-task/executors/index.ts b/lib/jobs/project-task/executors/index.ts new file mode 100644 index 0000000..b549a26 --- /dev/null +++ b/lib/jobs/project-task/executors/index.ts @@ -0,0 +1,22 @@ +import type { ProjectTaskType } from '@prisma/client' + +import type { ProjectTaskWithRelations } from '@/lib/repo/project-task' + +import { type ProjectTaskExecutorResult, runCloneRepositoryTask } from './clone-repository' + +export async function runProjectTaskExecutor( + task: ProjectTaskWithRelations +): Promise { + switch (task.type as ProjectTaskType) { + case 'CLONE_REPOSITORY': + return runCloneRepositoryTask(task) + default: + return { + success: false, + error: `No executor registered for task type ${task.type}`, + retryable: false, + } + } +} + +export type { ProjectTaskExecutorResult } diff --git a/lib/jobs/project-task/index.ts b/lib/jobs/project-task/index.ts new file mode 100644 index 0000000..14a93da --- /dev/null +++ b/lib/jobs/project-task/index.ts @@ -0,0 +1,2 @@ +export { startProjectTaskReconcileJob } from './projectTaskReconcile' +export { triggerRunnableTasksForProject } from './projectTaskReconcile' diff --git a/lib/jobs/project-task/projectTaskReconcile.ts b/lib/jobs/project-task/projectTaskReconcile.ts new file mode 100644 index 0000000..acf50e2 --- /dev/null +++ b/lib/jobs/project-task/projectTaskReconcile.ts @@ -0,0 +1,159 @@ +import { Cron } from 'croner' + +import { logger as baseLogger } from '@/lib/logger' +import { + acquireAndLockProjectTasks, + getProjectTaskById, + getRunnableTasksForProject, + incrementProjectTaskAttemptCount, + LOCK_DURATION_SECONDS, + type ProjectTaskWithRelations, + setProjectTaskState, + tryClaimProjectTaskExecutionLock, +} from '@/lib/repo/project-task' + +import { runProjectTaskExecutor } from './executors' + +const logger = 
baseLogger.child({ module: 'lib/jobs/project-task/projectTaskReconcile' }) + +const MAX_TASKS_PER_CYCLE = parseInt(process.env.MAX_PROJECT_TASKS_PER_RECONCILE || '10', 10) +const RECONCILE_INTERVAL_SECONDS = parseInt( + process.env.PROJECT_TASK_RECONCILE_INTERVAL_SECONDS || '10', + 10 +) +const EXECUTION_LOCK_DURATION_SECONDS = parseInt( + process.env.PROJECT_TASK_EXECUTION_LOCK_DURATION_SECONDS || '300', + 10 +) + +export function startProjectTaskReconcileJob() { + logger.info('Starting project task reconcile job') + logger.info(`Lock duration: ${LOCK_DURATION_SECONDS} seconds`) + logger.info(`Execution lock duration: ${EXECUTION_LOCK_DURATION_SECONDS} seconds`) + logger.info(`Max tasks per cycle: ${MAX_TASKS_PER_CYCLE}`) + logger.info(`Reconcile interval: ${RECONCILE_INTERVAL_SECONDS} seconds`) + + const job = new Cron(`*/${RECONCILE_INTERVAL_SECONDS} * * * * *`, async () => { + try { + await reconcileProjectTasks() + } catch (error) { + logger.error(`Project task reconcile job error: ${error}`) + } + }) + + logger.info(`✅ Project task reconcile job started (every ${RECONCILE_INTERVAL_SECONDS} seconds)`) + return job +} + +async function reconcileProjectTasks() { + const tasks = await acquireAndLockProjectTasks(MAX_TASKS_PER_CYCLE) + + if (tasks.length === 0) { + return + } + + logger.info(`Acquired ${tasks.length} project tasks for reconcile`) + + for (const task of tasks) { + try { + await handleProjectTask(task) + } catch (error) { + logger.error(`Failed to process project task ${task.id}: ${error}`) + } + } +} + +export async function triggerRunnableTasksForProject(projectId: string): Promise { + const tasks = await getRunnableTasksForProject(projectId) + for (const task of tasks) { + try { + const freshTask = await getProjectTaskById(task.id) + if (freshTask) { + await handleProjectTask(freshTask) + } + } catch (error) { + logger.error(`Failed to trigger project task ${task.id}: ${error}`) + } + } +} + +async function handleProjectTask(task: 
ProjectTaskWithRelations): Promise { + const prerequisites = evaluateTaskPrerequisites(task) + if (!prerequisites.ready) { + await setProjectTaskState(task.id, { + status: 'WAITING_FOR_PREREQUISITES', + error: prerequisites.reason ?? null, + lockedUntil: null, + startedAt: null, + finishedAt: null, + }) + return + } + + const claimed = await tryClaimProjectTaskExecutionLock(task.id, EXECUTION_LOCK_DURATION_SECONDS) + if (!claimed) { + return + } + + const executingTask = await getProjectTaskById(task.id) + if (!executingTask) { + return + } + + const attemptCount = await incrementProjectTaskAttemptCount(task.id) + const result = await runProjectTaskExecutor(executingTask) + + if (result.success) { + await setProjectTaskState(task.id, { + status: 'SUCCEEDED', + error: null, + result: result.result, + lockedUntil: null, + finishedAt: new Date(), + attemptCount, + }) + return + } + + if (result.retryable && attemptCount < executingTask.maxAttempts) { + await setProjectTaskState(task.id, { + status: 'PENDING', + error: result.error, + lockedUntil: null, + finishedAt: null, + attemptCount, + }) + return + } + + await setProjectTaskState(task.id, { + status: 'FAILED', + error: result.error, + lockedUntil: null, + finishedAt: new Date(), + attemptCount, + }) +} + +function evaluateTaskPrerequisites( + task: Pick +): { ready: boolean; reason?: string } { + switch (task.type) { + case 'CLONE_REPOSITORY': + case 'INSTALL_SKILL': + case 'UNINSTALL_SKILL': + case 'DEPLOY_PROJECT': + if (!task.sandbox) { + return { ready: false, reason: 'Sandbox not found' } + } + if (task.sandbox.status !== 'RUNNING') { + return { + ready: false, + reason: `Waiting for sandbox to become RUNNING (current: ${task.sandbox.status})`, + } + } + return { ready: true } + default: + return { ready: false, reason: `Unknown task type ${task.type}` } + } +} diff --git a/lib/repo/project-import.ts b/lib/repo/project-import.ts deleted file mode 100644 index 6753c56..0000000 --- a/lib/repo/project-import.ts 
+++ /dev/null @@ -1,150 +0,0 @@ -import type { Project } from '@prisma/client' - -import { prisma } from '@/lib/db' -import { logger as baseLogger } from '@/lib/logger' - -const logger = baseLogger.child({ module: 'lib/repo/project-import' }) - -const LOCK_DURATION_SECONDS = parseInt(process.env.PROJECT_IMPORT_LOCK_DURATION_SECONDS || '5', 10) - -type ImportProjectWithRelations = Project & { - user: { - id: string - } - githubAppInstallation: { - installationId: number - } | null - sandboxes: Array<{ - id: string - status: string - }> -} - -const importProjectWithRelationsInclude = { - user: { - select: { - id: true, - }, - }, - githubAppInstallation: { - select: { - installationId: true, - }, - }, - sandboxes: { - select: { - id: true, - status: true, - }, - orderBy: { - createdAt: 'asc', - }, - }, -} as const - -export async function acquireAndLockImportProjects( - limit: number = 10, - baseLockSeconds: number = LOCK_DURATION_SECONDS, - randomOffsetSeconds: number = 2 -): Promise { - try { - const now = new Date() - const randomSeconds = Math.random() * randomOffsetSeconds - const lockUntil = new Date(now.getTime() + (baseLockSeconds + randomSeconds) * 1000) - - const lockedProjects = await prisma.$transaction(async (tx) => { - return await tx.$queryRaw` - UPDATE "Project" - SET - "importLockedUntil" = ${lockUntil}, - "updatedAt" = NOW() - WHERE "id" IN ( - SELECT "id" - FROM "Project" - WHERE "importStatus" IN ('PENDING', 'CLONING') - AND ("importLockedUntil" IS NULL OR "importLockedUntil" <= ${now}) - ORDER BY "updatedAt" ASC - LIMIT ${limit} - FOR UPDATE SKIP LOCKED - ) - RETURNING * - ` - }) - - if (lockedProjects.length === 0) { - return [] - } - - const projectsWithRelations = await prisma.project.findMany({ - where: { - id: { - in: lockedProjects.map((project) => project.id), - }, - }, - include: importProjectWithRelationsInclude, - }) - - return projectsWithRelations - } catch (error) { - logger.error(`Error acquiring and locking import projects: 
${error}`) - return [] - } -} - -export async function getImportProjectById(projectId: string): Promise { - return await prisma.project.findUnique({ - where: { - id: projectId, - }, - include: importProjectWithRelationsInclude, - }) -} - -export async function tryClaimImportExecutionLock( - projectId: string, - lockSeconds: number -): Promise { - const now = new Date() - const lockUntil = new Date(now.getTime() + lockSeconds * 1000) - - const result = await prisma.project.updateMany({ - where: { - id: projectId, - importStatus: { - in: ['PENDING', 'CLONING'], - }, - OR: [{ importLockedUntil: null }, { importLockedUntil: { lte: now } }], - }, - data: { - importStatus: 'CLONING', - importError: null, - importLockedUntil: lockUntil, - updatedAt: new Date(), - }, - }) - - return result.count > 0 -} - -export async function setProjectImportState( - projectId: string, - data: { - importStatus: 'PENDING' | 'CLONING' | 'READY' | 'FAILED' - importError?: string | null - importLockedUntil?: Date | null - } -): Promise { - await prisma.project.update({ - where: { id: projectId }, - data: { - importStatus: data.importStatus, - importError: data.importError ?? null, - importLockedUntil: - data.importLockedUntil === undefined ? 
undefined : data.importLockedUntil, - updatedAt: new Date(), - }, - }) -} - -export { LOCK_DURATION_SECONDS } -export type { ImportProjectWithRelations } diff --git a/lib/repo/project-task.ts b/lib/repo/project-task.ts new file mode 100644 index 0000000..4859abb --- /dev/null +++ b/lib/repo/project-task.ts @@ -0,0 +1,206 @@ +import type { + Prisma, + ProjectTaskStatus, + ProjectTaskTriggerSource, + ProjectTaskType, +} from '@prisma/client' + +import { prisma } from '@/lib/db' +import { logger as baseLogger } from '@/lib/logger' + +const logger = baseLogger.child({ module: 'lib/repo/project-task' }) + +const LOCK_DURATION_SECONDS = parseInt(process.env.PROJECT_TASK_LOCK_DURATION_SECONDS || '5', 10) + +const projectTaskWithRelationsInclude = { + project: { + include: { + user: true, + }, + }, + sandbox: { + select: { + id: true, + status: true, + }, + }, +} satisfies Prisma.ProjectTaskInclude + +type ProjectTaskWithRelations = Prisma.ProjectTaskGetPayload<{ + include: typeof projectTaskWithRelationsInclude +}> + +type CreateProjectTaskInput = { + projectId: string + sandboxId?: string | null + type: ProjectTaskType + status?: ProjectTaskStatus + triggerSource: ProjectTaskTriggerSource + payload?: Prisma.InputJsonValue + maxAttempts?: number +} + +type ProjectTaskStateUpdate = { + status: ProjectTaskStatus + error?: string | null + result?: Prisma.InputJsonValue | Prisma.NullableJsonNullValueInput + lockedUntil?: Date | null + startedAt?: Date | null + finishedAt?: Date | null + attemptCount?: number +} + +export async function createProjectTask( + tx: Prisma.TransactionClient, + input: CreateProjectTaskInput +) { + return tx.projectTask.create({ + data: { + projectId: input.projectId, + sandboxId: input.sandboxId ?? null, + type: input.type, + status: input.status ?? 'PENDING', + triggerSource: input.triggerSource, + payload: input.payload, + maxAttempts: input.maxAttempts ?? 
3, + }, + }) +} + +export async function acquireAndLockProjectTasks( + limit: number = 10, + baseLockSeconds: number = LOCK_DURATION_SECONDS, + randomOffsetSeconds: number = 2 +): Promise { + try { + const now = new Date() + const randomSeconds = Math.random() * randomOffsetSeconds + const lockUntil = new Date(now.getTime() + (baseLockSeconds + randomSeconds) * 1000) + + const lockedTasks = await prisma.$transaction(async (tx) => { + return await tx.$queryRaw>` + UPDATE "ProjectTask" + SET + "lockedUntil" = ${lockUntil}, + "updatedAt" = NOW() + WHERE "id" IN ( + SELECT "id" + FROM "ProjectTask" + WHERE "status" IN ('PENDING', 'WAITING_FOR_PREREQUISITES', 'RUNNING') + AND ("lockedUntil" IS NULL OR "lockedUntil" <= ${now}) + ORDER BY "updatedAt" ASC + LIMIT ${limit} + FOR UPDATE SKIP LOCKED + ) + RETURNING "id" + ` + }) + + if (lockedTasks.length === 0) { + return [] + } + + return prisma.projectTask.findMany({ + where: { + id: { + in: lockedTasks.map((task) => task.id), + }, + }, + include: projectTaskWithRelationsInclude, + orderBy: { createdAt: 'asc' }, + }) + } catch (error) { + logger.error(`Error acquiring and locking project tasks: ${error}`) + return [] + } +} + +export async function getProjectTaskById(taskId: string): Promise { + return prisma.projectTask.findUnique({ + where: { id: taskId }, + include: projectTaskWithRelationsInclude, + }) +} + +export async function getRunnableTasksForProject( + projectId: string, + taskType?: ProjectTaskType +): Promise { + return prisma.projectTask.findMany({ + where: { + projectId, + type: taskType, + status: { + in: ['PENDING', 'WAITING_FOR_PREREQUISITES', 'RUNNING'], + }, + }, + include: projectTaskWithRelationsInclude, + orderBy: { createdAt: 'asc' }, + }) +} + +export async function tryClaimProjectTaskExecutionLock( + taskId: string, + lockSeconds: number +): Promise { + const now = new Date() + const lockUntil = new Date(now.getTime() + lockSeconds * 1000) + + const result = await prisma.projectTask.updateMany({ + 
where: { + id: taskId, + status: { + in: ['PENDING', 'WAITING_FOR_PREREQUISITES', 'RUNNING'], + }, + OR: [{ lockedUntil: null }, { lockedUntil: { lte: now } }], + }, + data: { + status: 'RUNNING', + error: null, + lockedUntil: lockUntil, + startedAt: now, + updatedAt: now, + }, + }) + + return result.count > 0 +} + +export async function setProjectTaskState( + taskId: string, + data: ProjectTaskStateUpdate +): Promise { + await prisma.projectTask.update({ + where: { id: taskId }, + data: { + status: data.status, + error: data.error ?? null, + result: data.result, + lockedUntil: data.lockedUntil === undefined ? undefined : data.lockedUntil, + startedAt: data.startedAt === undefined ? undefined : data.startedAt, + finishedAt: data.finishedAt === undefined ? undefined : data.finishedAt, + attemptCount: data.attemptCount, + updatedAt: new Date(), + }, + }) +} + +export async function incrementProjectTaskAttemptCount(taskId: string): Promise { + const task = await prisma.projectTask.update({ + where: { id: taskId }, + data: { + attemptCount: { + increment: 1, + }, + updatedAt: new Date(), + }, + select: { + attemptCount: true, + }, + }) + + return task.attemptCount +} + +export { LOCK_DURATION_SECONDS, projectTaskWithRelationsInclude } +export type { CreateProjectTaskInput, ProjectTaskWithRelations } diff --git a/lib/services/project-task-dispatcher.ts b/lib/services/project-task-dispatcher.ts new file mode 100644 index 0000000..0fa19f2 --- /dev/null +++ b/lib/services/project-task-dispatcher.ts @@ -0,0 +1,59 @@ +import type { Prisma, ProjectTaskStatus, ProjectTaskType } from '@prisma/client' +import { ProjectTaskTriggerSource } from '@prisma/client' + +import { prisma } from '@/lib/db' +import { logger as baseLogger } from '@/lib/logger' +import { createProjectTask } from '@/lib/repo/project-task' + +const logger = baseLogger.child({ module: 'lib/services/project-task-dispatcher' }) + +type DispatchProjectTaskInput = { + projectId: string + sandboxId?: string | null + 
type: ProjectTaskType + triggerSource: ProjectTaskTriggerSource + payload?: Prisma.InputJsonValue + status?: ProjectTaskStatus + maxAttempts?: number +} + +export async function dispatchProjectTask(input: DispatchProjectTaskInput) { + const task = await prisma.$transaction(async (tx) => + createProjectTask(tx, { + projectId: input.projectId, + sandboxId: input.sandboxId, + type: input.type, + status: input.status, + triggerSource: input.triggerSource, + payload: input.payload, + maxAttempts: input.maxAttempts, + }) + ) + + logger.info(`Dispatched project task ${task.id} (${task.type}) for project ${task.projectId}`) + return task +} + +export async function dispatchCloneRepositoryTask(input: { + projectId: string + sandboxId: string + installationId: number + repoId: number + repoFullName: string + defaultBranch: string +}) { + return dispatchProjectTask({ + projectId: input.projectId, + sandboxId: input.sandboxId, + type: 'CLONE_REPOSITORY', + status: 'WAITING_FOR_PREREQUISITES', + triggerSource: ProjectTaskTriggerSource.USER_ACTION, + payload: { + installationId: input.installationId, + repoId: input.repoId, + repoFullName: input.repoFullName, + defaultBranch: input.defaultBranch, + }, + maxAttempts: 3, + }) +} diff --git a/lib/startup/index.ts b/lib/startup/index.ts index 3523368..4c9ee7d 100644 --- a/lib/startup/index.ts +++ b/lib/startup/index.ts @@ -77,17 +77,17 @@ async function startBackgroundJobs() { const sandboxJob = startSandboxReconcileJob() logger.info('✅ Sandbox reconcile job started') - // Start project import reconciliation job - // Runs every 3 seconds to process GitHub imports when sandboxes become RUNNING - const { startProjectImportReconcileJob } = await import('@/lib/jobs/project-import') - const projectImportJob = startProjectImportReconcileJob() - logger.info('✅ Project import reconcile job started') + // Start project task reconciliation job + // Runs every 10 seconds to process project-level async tasks when prerequisites are met + const { 
startProjectTaskReconcileJob } = await import('@/lib/jobs/project-task') + const projectTaskJob = startProjectTaskReconcileJob() + logger.info('✅ Project task reconcile job started') // Store job references for potential cleanup (optional) // In most cases, these jobs will run for the lifetime of the server globalThis.__databaseReconcileJob = databaseJob globalThis.__sandboxReconcileJob = sandboxJob - globalThis.__projectImportReconcileJob = projectImportJob + globalThis.__projectTaskReconcileJob = projectTaskJob logger.info('All background jobs started successfully') } catch (error) { @@ -123,11 +123,11 @@ export function cleanup() { logger.info('✅ Sandbox reconcile job stopped') } - // Stop project import reconcile job if it exists - if (globalThis.__projectImportReconcileJob) { - globalThis.__projectImportReconcileJob.stop() - globalThis.__projectImportReconcileJob = undefined - logger.info('✅ Project import reconcile job stopped') + // Stop project task reconcile job if it exists + if (globalThis.__projectTaskReconcileJob) { + globalThis.__projectTaskReconcileJob.stop() + globalThis.__projectTaskReconcileJob = undefined + logger.info('✅ Project task reconcile job stopped') } logger.info('✅ Cleanup completed') @@ -142,5 +142,5 @@ declare global { var __sandboxReconcileJob: Cron | undefined - var __projectImportReconcileJob: Cron | undefined + var __projectTaskReconcileJob: Cron | undefined } diff --git a/lib/util/project-display-status.ts b/lib/util/project-display-status.ts index a02e5e5..ac350e0 100644 --- a/lib/util/project-display-status.ts +++ b/lib/util/project-display-status.ts @@ -1,4 +1,4 @@ -import type { ProjectImportStatus, ProjectStatus } from '@prisma/client' +import type { ProjectStatus, ProjectTask, ProjectTaskStatus, ProjectTaskType } from '@prisma/client' export type ProjectDisplayStatus = | 'CREATING' @@ -14,9 +14,9 @@ export type ProjectDisplayStatus = type ProjectDisplayStatusInput = { status: ProjectStatus - importStatus: 
ProjectImportStatus githubRepoFullName?: string | null githubAppInstallationId?: string | null + tasks?: Pick[] } export function isImportProject(project: { @@ -30,8 +30,9 @@ export function getProjectDisplayStatus( project: ProjectDisplayStatusInput ): ProjectDisplayStatus { const importProject = isImportProject(project) + const cloneTask = getLatestTask(project.tasks, 'CLONE_REPOSITORY') - if (project.importStatus === 'FAILED') { + if (cloneTask?.status === 'FAILED') { return 'NEEDS_ATTENTION' } @@ -55,7 +56,7 @@ export function getProjectDisplayStatus( return 'UPDATING' } - if (importProject && (project.importStatus === 'PENDING' || project.importStatus === 'CLONING')) { + if (importProject && cloneTask && isActiveTaskStatus(cloneTask.status)) { return 'IMPORTING' } @@ -73,3 +74,16 @@ export function getProjectDisplayStatus( return 'NEEDS_ATTENTION' } + +function getLatestTask( + tasks: ProjectDisplayStatusInput['tasks'], + type: ProjectTaskType +) { + return tasks + ?.filter((task) => task.type === type) + .sort((a, b) => b.createdAt.getTime() - a.createdAt.getTime())[0] +} + +function isActiveTaskStatus(status: ProjectTaskStatus): boolean { + return ['PENDING', 'WAITING_FOR_PREREQUISITES', 'RUNNING'].includes(status) +} diff --git a/prisma/migrations/20260311171810_project_task_platform/migration.sql b/prisma/migrations/20260311171810_project_task_platform/migration.sql new file mode 100644 index 0000000..239c657 --- /dev/null +++ b/prisma/migrations/20260311171810_project_task_platform/migration.sql @@ -0,0 +1,53 @@ +-- CreateEnum +CREATE TYPE "ProjectTaskType" AS ENUM ('CLONE_REPOSITORY', 'INSTALL_SKILL', 'UNINSTALL_SKILL', 'DEPLOY_PROJECT'); + +-- CreateEnum +CREATE TYPE "ProjectTaskStatus" AS ENUM ('PENDING', 'WAITING_FOR_PREREQUISITES', 'RUNNING', 'SUCCEEDED', 'FAILED', 'CANCELLED'); + +-- CreateEnum +CREATE TYPE "ProjectTaskTriggerSource" AS ENUM ('USER_ACTION', 'SYSTEM_EVENT', 'POLICY_ROLLOUT'); + +-- AlterTable +ALTER TABLE "Project" DROP COLUMN 
"importError", +DROP COLUMN "importLockedUntil", +DROP COLUMN "importStatus"; + +-- DropEnum +DROP TYPE "ProjectImportStatus"; + +-- CreateTable +CREATE TABLE "ProjectTask" ( + "id" TEXT NOT NULL, + "projectId" TEXT NOT NULL, + "sandboxId" TEXT, + "type" "ProjectTaskType" NOT NULL, + "status" "ProjectTaskStatus" NOT NULL DEFAULT 'PENDING', + "triggerSource" "ProjectTaskTriggerSource" NOT NULL, + "payload" JSONB, + "result" JSONB, + "error" TEXT, + "attemptCount" INTEGER NOT NULL DEFAULT 0, + "maxAttempts" INTEGER NOT NULL DEFAULT 3, + "lockedUntil" TIMESTAMP(3), + "startedAt" TIMESTAMP(3), + "finishedAt" TIMESTAMP(3), + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "ProjectTask_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE INDEX "ProjectTask_projectId_status_idx" ON "ProjectTask"("projectId", "status"); + +-- CreateIndex +CREATE INDEX "ProjectTask_status_lockedUntil_idx" ON "ProjectTask"("status", "lockedUntil"); + +-- CreateIndex +CREATE INDEX "ProjectTask_type_status_idx" ON "ProjectTask"("type", "status"); + +-- AddForeignKey +ALTER TABLE "ProjectTask" ADD CONSTRAINT "ProjectTask_projectId_fkey" FOREIGN KEY ("projectId") REFERENCES "Project"("id") ON DELETE CASCADE ON UPDATE CASCADE; + +-- AddForeignKey +ALTER TABLE "ProjectTask" ADD CONSTRAINT "ProjectTask_sandboxId_fkey" FOREIGN KEY ("sandboxId") REFERENCES "Sandbox"("id") ON DELETE SET NULL ON UPDATE CASCADE; diff --git a/prisma/migrations/migration_lock.toml b/prisma/migrations/migration_lock.toml new file mode 100644 index 0000000..2fe25d8 --- /dev/null +++ b/prisma/migrations/migration_lock.toml @@ -0,0 +1 @@ +provider = "postgresql" diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 445269f..cf5f8c0 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -15,9 +15,9 @@ model User { updatedAt DateTime @updatedAt // Relations - identities UserIdentity[] // One-to-many: User has multiple authentication identities 
- projects Project[] // One-to-many: User owns multiple projects - configs UserConfig[] // One-to-many: User global configurations + identities UserIdentity[] // One-to-many: User has multiple authentication identities + projects Project[] // One-to-many: User owns multiple projects + configs UserConfig[] // One-to-many: User global configurations githubInstallations GitHubAppInstallation[] // One-to-many: User's GitHub App installations } @@ -68,19 +68,19 @@ model UserConfig { // GitHubAppInstallation: Tracks GitHub App installations // Required for obtaining installation access tokens model GitHubAppInstallation { - id String @id @default(cuid()) - installationId Int @unique // GitHub's numeric installation ID + id String @id @default(cuid()) + installationId Int @unique // GitHub's numeric installation ID userId String - accountId Int // GitHub account numeric ID - accountLogin String // GitHub username or organization name - accountType String // "User" or "Organization" + accountId Int // GitHub account numeric ID + accountLogin String // GitHub username or organization name + accountType String // "User" or "Organization" accountAvatarUrl String? repositorySelection String // "all" or "selected" - permissions Json @default("{}") // App permissions - events Json @default("[]") // Subscribed events + permissions Json @default("{}") // App permissions + events Json @default("[]") // Subscribed events status GitHubInstallationStatus @default(ACTIVE) suspendedAt DateTime? @@ -113,11 +113,6 @@ model Project { githubRepoFullName String? // "owner/repo-name", for display githubRepoDefaultBranch String? // Repository default branch for import workflow - // GitHub import lifecycle - importStatus ProjectImportStatus @default(READY) - importError String? - importLockedUntil DateTime? 
// Lock for import reconcile job - // Aggregated status based on child resources status ProjectStatus @default(CREATING) @@ -125,10 +120,11 @@ model Project { updatedAt DateTime @updatedAt // Relations: Project is a collection of resources - user User @relation(fields: [userId], references: [id], onDelete: Cascade) // Many-to-one: Project belongs to a User - environments Environment[] // One-to-many: Configuration variables - databases Database[] // One-to-many: Multiple database clusters - sandboxes Sandbox[] // One-to-many: Multiple runtime containers + user User @relation(fields: [userId], references: [id], onDelete: Cascade) // Many-to-one: Project belongs to a User + environments Environment[] // One-to-many: Configuration variables + databases Database[] // One-to-many: Multiple database clusters + sandboxes Sandbox[] // One-to-many: Multiple runtime containers + tasks ProjectTask[] // One-to-many: Project-level asynchronous tasks githubAppInstallation GitHubAppInstallation? @relation(fields: [githubAppInstallationId], references: [id], onDelete: SetNull) @@index([githubAppInstallationId]) @@ -231,7 +227,8 @@ model Sandbox { updatedAt DateTime @updatedAt // Relation: Many-to-one to Project (foreign key: projectId) - project Project @relation(fields: [projectId], references: [id], onDelete: Cascade) + project Project @relation(fields: [projectId], references: [id], onDelete: Cascade) + tasks ProjectTask[] @@unique([projectId, name]) // Unique sandbox name per project @@unique([k8sNamespace, sandboxName]) // Unique sandbox name per namespace @@ -239,6 +236,32 @@ model Sandbox { @@index([sandboxName]) } +model ProjectTask { + id String @id @default(cuid()) + projectId String + sandboxId String? + type ProjectTaskType + status ProjectTaskStatus @default(PENDING) + triggerSource ProjectTaskTriggerSource + payload Json? + result Json? + error String? + attemptCount Int @default(0) + maxAttempts Int @default(3) + lockedUntil DateTime? + startedAt DateTime? 
+ finishedAt DateTime? + createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt + + project Project @relation(fields: [projectId], references: [id], onDelete: Cascade) + sandbox Sandbox? @relation(fields: [sandboxId], references: [id], onDelete: SetNull) + + @@index([projectId, status]) + @@index([status, lockedUntil]) + @@index([type, status]) +} + // Project aggregated status based on child resources // Aggregation rules (priority order): // 1. ERROR: At least one resource has ERROR status @@ -269,12 +292,26 @@ enum ProjectStatus { PARTIAL // Inconsistent mixed states - manual intervention needed } -// GitHub repository import lifecycle for a project -enum ProjectImportStatus { - PENDING // Waiting for sandbox to become RUNNING - CLONING // Repository clone in progress - READY // Import finished or not an imported project - FAILED // Import failed; project remains usable as empty project +enum ProjectTaskType { + CLONE_REPOSITORY + INSTALL_SKILL + UNINSTALL_SKILL + DEPLOY_PROJECT +} + +enum ProjectTaskStatus { + PENDING + WAITING_FOR_PREREQUISITES + RUNNING + SUCCEEDED + FAILED + CANCELLED +} + +enum ProjectTaskTriggerSource { + USER_ACTION + SYSTEM_EVENT + POLICY_ROLLOUT } // Individual resource status (Database and Sandbox) From 22ca386e5970be644b160b3c5187235c48693f48 Mon Sep 17 00:00:00 2001 From: Che <30403707+Che-Zhu@users.noreply.github.com> Date: Thu, 12 Mar 2026 10:50:20 +0800 Subject: [PATCH 2/3] docs: update architecture direction --- docs/architecture-evolution.md | 377 ++++++++++++++++++ docs/architecture.md | 672 +++++++++++++++++++-------------- 2 files changed, 763 insertions(+), 286 deletions(-) create mode 100644 docs/architecture-evolution.md diff --git a/docs/architecture-evolution.md b/docs/architecture-evolution.md new file mode 100644 index 0000000..835b844 --- /dev/null +++ b/docs/architecture-evolution.md @@ -0,0 +1,377 @@ +# Architecture Evolution + +This document describes the intended next stage of Fulling's codebase 
evolution. + +It is not a description of the current repository layout. It is the target direction for making the code structure match the system architecture more directly. + +## Why Evolve + +Fulling is no longer just a conventional web application. + +It is a platform with: + +- a database-backed control plane +- resource reconciliation for Kubernetes infrastructure +- project-level asynchronous task execution inside sandboxes +- multiple external integrations such as GitHub, ttyd, Kubernetes, and future deploy providers + +As the system grows, a framework-shaped code layout stops matching the real mental model of the platform. + +The goal of the next phase is to make the codebase express the system in the same terms we use to reason about it: + +```text +Intent -> State -> Reconcile -> Effect +``` + +## Target Mental Model + +The ideal architecture should be understandable as five layers: + +1. Interaction Layer +2. Control State Layer +3. Orchestration Layer +4. Execution Layer +5. Integration Layer + +A reader should be able to answer these questions quickly: + +- Where does user intent enter the system? +- Where is that intent persisted as state? +- Which background mechanism advances that state? +- Which code actually performs the external effect? +- Which files encapsulate external system protocols? + +## Target Architecture + +### 1. Interaction Layer + +Purpose: + +- receive requests from users or clients +- parse input +- call control-plane commands or queries +- return responses + +Examples: + +- Next.js route handlers +- Server Actions +- UI entry components + +This layer should not contain infrastructure lifecycle logic or long-running orchestration logic. + +### 2. 
Control State Layer + +Purpose: + +- convert user intent into persisted state transitions +- define command and query use cases +- decide which records must be created or updated + +Examples: + +- create project +- import project from GitHub +- create database +- dispatch install-skill tasks +- dispatch deploy task + +This layer is where the system decides what should happen, not where it actually performs it. + +### 3. Orchestration Layer + +Purpose: + +- scan persisted state +- evaluate state machine transitions +- determine whether prerequisites are met +- claim work and advance it safely + +Examples: + +- sandbox reconcile +- database reconcile +- project task reconcile + +This layer should own "when work is ready to run" and "what transition comes next." + +### 4. Execution Layer + +Purpose: + +- perform the actual work after orchestration decides it should happen + +Examples: + +- create sandbox in Kubernetes +- stop database cluster +- clone repository inside sandbox +- install skill in sandbox +- deploy project + +This layer should be effectful, explicit, and easy to test in isolation. + +### 5. Integration Layer + +Purpose: + +- isolate external system protocol details from internal orchestration and domain logic + +Examples: + +- GitHub App token exchange and installation APIs +- ttyd command execution +- Kubernetes service and managers +- deploy provider clients +- AI proxy integration + +This layer should be the only place that knows provider-specific protocol details. + +## Ideal Repository Shape + +The repository should gradually move toward something like this: + +```text +app/ + Framework entrypoints only + +lib/ + domain/ + control/ + commands/ + queries/ + persistence/ + orchestrators/ + resources/ + tasks/ + executors/ + k8s/ + sandbox/ + deploy/ + integrations/ + github/ + ttyd/ + k8s/ + aiproxy/ + policies/ + shared/ +``` + +This is not a requirement to rename everything immediately. It is the target shape that best matches the architecture. 
+ +## What Each Target Area Means + +### `app/` + +Should contain: + +- routes +- pages +- route-local UI composition +- request parsing +- response shaping + +Should not be the place where major platform logic lives. + +### `lib/domain/` + +Should contain: + +- state semantics +- status aggregation rules +- lifecycle definitions +- prerequisite evaluation logic +- domain types and invariants + +Typical examples: + +- project status aggregation +- task prerequisite rules +- lifecycle transition rules + +### `lib/control/` + +Should contain: + +- application commands +- application queries +- user-intent entrypoints that are independent from framework details + +Typical examples: + +- `create-project` +- `import-project-from-github` +- `dispatch-install-skill-for-user-projects` +- `get-project-list` + +This is the layer that turns interaction into durable state changes. + +### `lib/persistence/` + +Should contain: + +- database access +- row claiming +- lock management +- state transition persistence + +This corresponds closely to what `lib/repo/` does today, but with a name that better reflects its role in a control-plane system. + +### `lib/orchestrators/` + +Should contain: + +- reconcile loops +- transition selection +- scheduling logic +- wake-up logic for resource or task processing + +This layer is currently spread across `jobs/` and `events/`. + +Long term, the code should make it easy to inspect one workflow in one place, instead of hopping across multiple implementation-mechanism directories. 
+ +### `lib/executors/` + +Should contain: + +- effectful operations triggered by orchestrators + +Examples: + +- Kubernetes resource actions +- sandbox command execution +- deployment actions + +The key separation is: + +- orchestrators decide whether to run +- executors perform the work + +### `lib/integrations/` + +Should contain: + +- provider-specific APIs +- transport/protocol code +- external authentication/token exchange + +This is especially important because `services/` and `util/` currently mix together: + +- true business services +- external adapters +- generic helpers + +That mixing increases cognitive load. + +## Structural Rules For The Next Phase + +### Rule 1: Keep resource plane and task plane distinct + +Resource lifecycle and project task execution are related but not the same. + +- `Sandbox` and `Database` belong to the resource plane +- `ProjectTask` belongs to the task plane + +They should not collapse into a single generic abstraction too early. + +### Rule 2: Prefer architecture-shaped names over framework-shaped names + +Names should describe system role, not just invocation style. + +Examples: + +- better: `control`, `orchestrators`, `integrations`, `executors` +- weaker: `services`, `util`, `helpers` + +### Rule 3: Polling remains the correctness mechanism + +Wake-up triggers are useful, but background reconcile loops remain the source of truth. + +The architecture should continue to optimize for recoverability, not only low latency. + +### Rule 4: State machines should remain explicit + +If work needs to resume later, it should be represented in persisted state. + +Avoid moving important lifecycle meaning into hidden in-memory logic. + +### Rule 5: Reorganization should follow system boundaries, not file count + +The purpose of the reorganization is not aesthetic. 
+ +It is to make the codebase reflect the platform's actual conceptual boundaries: + +- intent handling +- persisted state +- orchestration +- execution +- external integration + +## Suggested Migration Strategy + +The next phase should be gradual. + +### Phase 1: Clarify naming and ownership + +Goals: + +- continue building `ProjectTask` as the task-plane abstraction +- reduce ambiguous use of `services/` and `util/` +- document system ownership by layer + +Low-risk changes: + +- move external adapters toward `integrations/` +- move business rules toward `domain/` or `policies/` +- introduce command/query naming for control-plane operations + +### Phase 2: Reshape orchestration boundaries + +Goals: + +- make workflows easier to inspect end-to-end +- reduce mental jumps between job and listener implementations + +Likely work: + +- make resource orchestration more visibly grouped +- make task orchestration more visibly grouped +- separate transition logic from executor logic more explicitly + +### Phase 3: Unify platform patterns where appropriate + +Goals: + +- reduce duplicated scheduling and transition mechanics +- strengthen task dependency and execution models + +Examples: + +- generalized task prerequisite evaluation +- shared orchestration helpers for claiming and transition persistence +- richer task types such as deploy, install skill, uninstall skill + +This phase should happen only after the architecture is clearly expressed in the codebase. 
+ +## What "Success" Looks Like + +The reorganization is successful when the following are true: + +- a new engineer can trace a major flow without guessing layer ownership +- resource lifecycles are easy to inspect in one conceptual area +- task execution flows are easy to inspect in one conceptual area +- external provider code is clearly separated from domain and orchestration code +- adding a new task type does not require inventing a new architectural path + +## Current Direction + +The current `ProjectTask` introduction is the first concrete move in this direction. + +It shifts GitHub import from a project-specific status track into a reusable task-plane abstraction. + +That same direction should guide the next structural changes: + +- task types should grow through the task plane +- external adapters should become more explicit +- orchestration should become easier to inspect +- the repository layout should increasingly mirror the architecture itself diff --git a/docs/architecture.md b/docs/architecture.md index c76a0fa..09458df 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,353 +1,453 @@ -# Architecture Design Document +# Architecture -This document describes the architecture of FullstackAgent, an AI-powered cloud development platform. +This document describes the current runtime architecture of Fulling and the roles of the major subsystems in the repository. -## Table of Contents +## Overview -- [Overview](#overview) -- [Reconciliation Pattern](#reconciliation-pattern) -- [System Layers](#system-layers) -- [Event System](#event-system) -- [State Management](#state-management) -- [Resource Lifecycle](#resource-lifecycle) -- [Key Design Decisions](#key-design-decisions) +Fulling is a database-driven control plane for project sandboxes. -## Overview +The system is organized around three ideas: -FullstackAgent creates isolated Kubernetes sandbox environments for full-stack development. Each project gets: +1. 
User-facing code records intent in PostgreSQL. +2. Background reconcile jobs read persisted state and advance it asynchronously. +3. External effects happen through two execution layers: + - Kubernetes resource control for sandboxes and databases + - Project task execution inside ready sandboxes -- **Sandbox Container**: Next.js + Claude Code CLI + ttyd terminal + FileBrowser -- **PostgreSQL Database**: Dedicated KubeBlocks cluster -- **Live Domains**: HTTPS subdomains for app, terminal, and file browser +This is not a request/response system that blocks on infrastructure. It is an asynchronous state-convergence system. -### Core Principle: Asynchronous Reconciliation +## High-Level Model -The platform uses an **asynchronous reconciliation pattern** where: +There are three major domains: -1. API endpoints return immediately (non-blocking) -2. Background jobs sync desired state (database) with actual state (Kubernetes) -3. Event listeners execute K8s operations -4. Status updates happen asynchronously +- Control plane + - Next.js pages, Server Actions, and API routes + - Authentication and authorization + - Prisma models as the source of truth +- Resource plane + - `Sandbox` and `Database` lifecycle management + - Kubernetes operations through user-scoped services +- Task plane + - `ProjectTask` records for project-level asynchronous work + - Current use case: clone a GitHub repository into a sandbox + - Future use cases: install skill, uninstall skill, deploy project -``` -User Request → API updates DB (status=CREATING) → Returns immediately (< 50ms) - ↓ - Reconciliation Job (every 3s) - ↓ - Emit Events → Listeners execute K8s ops - ↓ - Update status: CREATING → STARTING → RUNNING - ↓ - Frontend polls for updates +At a high level: + +```text +User action +-> App code validates and writes desired state to DB +-> Reconcile jobs scan DB for records in transitional states +-> Event listeners / task executors perform external work +-> Resource/task state is updated in 
DB +-> UI polls and reflects the latest state ``` -## Reconciliation Pattern +## Repository Structure -### Why Reconciliation? +```text +app/ + UI routes, API routes, and route-local components -Traditional approach (blocking API): -- API waits for K8s operations (30+ seconds) -- User sees loading spinner -- Timeout errors common -- Hard to recover from failures +components/ + Shared UI components -Reconciliation approach (non-blocking): -- API returns immediately (< 50ms) -- Background jobs handle K8s operations -- Automatic retry on failures -- Easy to monitor and debug +lib/actions/ + Server Actions called from client components -### Flow Example +lib/data/ + Server-side data access for React Server Components -``` -1. User clicks "Create Project" - POST /api/projects { name: "my-blog" } - -2. API creates database records immediately - Project: status=CREATING - Sandbox: status=CREATING - Database: status=CREATING - -3. Reconciliation job runs (every 3s) - - Query: SELECT * FROM Sandbox WHERE status='CREATING' AND lockedUntil IS NULL - - Locks sandbox (optimistic locking) - - Emits CreateSandbox event - -4. Event listener executes - - Gets user-specific K8s service - - Creates StatefulSet, Service, Ingresses - - Updates status: CREATING → STARTING - -5. 
Next cycle checks K8s status - - If RUNNING: Update status to RUNNING - - Aggregate project status from child resources -``` +lib/repo/ + Persistence helpers, locking, and state transitions + +lib/jobs/ + Background reconcile loops + +lib/events/ + Resource event buses and listeners -### Optimistic Locking +lib/k8s/ + User-scoped Kubernetes service and managers -Prevents concurrent updates to the same resource: +lib/services/ + Cross-cutting services and task dispatch helpers -```typescript -// Repository layer automatically handles locking -const lockedSandboxes = await acquireAndLockSandboxes(10) -// Only returns sandboxes where lockedUntil IS NULL OR < NOW() -// Sets lockedUntil = NOW() + 30 seconds atomically +lib/util/ + Aggregation, ttyd execution, formatting, and helpers + +prisma/ + Prisma schema and migrations ``` -Lock duration: 30 seconds (configurable) +## Core Runtime Layers + +### 1. User Interaction Layer -## System Layers +Primary locations: -### Layer 1: Control Plane (Main App) +- `app/` +- `lib/actions/` +- `lib/data/` -- **Framework**: Next.js 16 (App Router) + React 19 -- **Authentication**: NextAuth v5 (GitHub, Password, Sealos OAuth) -- **Database**: Prisma ORM → PostgreSQL -- **Responsibilities**: - - Manages projects, users, environment variables - - Does NOT directly execute K8s operations - - Only updates database +Responsibilities: -### Layer 2: Reconciliation System +- Authenticate the user +- Validate inputs +- Decide whether an operation is allowed +- Write the resulting state to the database +- Return immediately without waiting on Kubernetes or long-running sandbox work -- **Background Jobs**: `lib/jobs/sandbox/`, `lib/jobs/database/` -- **Event System**: `lib/events/sandbox/`, `lib/events/database/` -- **Repository Layer**: `lib/repo/` with optimistic locking -- **Status Aggregation**: `lib/util/projectStatus.ts` +Examples: -### Layer 3: Kubernetes Managers +- Create new project + - create `Project` + - create `Sandbox` + - create 
built-in environment variables +- Import from GitHub + - create `Project` + - create `Sandbox` + - create a `ProjectTask` of type `CLONE_REPOSITORY` +- Add database + - create `Database` with status `CREATING` +- Update environment variables + - persist environment changes + - mark running sandboxes as `UPDATING` -- **SandboxManager**: `lib/k8s/sandbox-manager.ts` - StatefulSet operations -- **DatabaseManager**: `lib/k8s/database-manager.ts` - KubeBlocks operations -- **K8sServiceHelper**: `lib/k8s/k8s-service-helper.ts` - User-specific service factory -- **All operations are idempotent and non-blocking** +### 2. State Persistence Layer -### Layer 4: Kubernetes Orchestration +Primary locations: -- **Platform**: Sealos (usw.sealos.io) -- **Namespaces**: Each user operates in their own namespace -- **Resources per project**: - - 1 StatefulSet (sandbox) - - 1 Service - - 3 Ingresses (app, terminal, filebrowser) - - 1 PostgreSQL cluster (KubeBlocks) +- `prisma/schema.prisma` +- `lib/repo/` -### Layer 5: Runtime Containers +The database is the durable control plane. 
-- **Image**: `limbo2342/fullstack-web-runtime:sha-ca2470e` -- **Base**: Ubuntu 24.04 + Node.js 22.x -- **Includes**: - - Claude Code CLI - - ttyd (web terminal with HTTP Basic Auth) - - FileBrowser (web file manager) - - Next.js, Prisma, PostgreSQL client - - Buildah (rootless container builds) +The key models are: -## Event System +- `Project` + - project metadata + - aggregated project status + - optional GitHub repository metadata +- `Sandbox` + - sandbox lifecycle state + - URLs and runtime resource configuration +- `Database` + - PostgreSQL lifecycle state + - connection credentials once ready +- `Environment` + - project-scoped environment variables +- `ProjectTask` + - project-level asynchronous work + - payload, result, retries, and locks +- `GitHubAppInstallation` + - GitHub App installation ownership and permissions -### Event Bus +The repository layer is where row locking and state transitions are centralized. -Each resource type has its own event bus: +## Resource Plane -```typescript -// lib/events/sandbox/bus.ts -export const enum Events { - CreateSandbox = 'CreateSandbox', - StartSandbox = 'StartSandbox', - StopSandbox = 'StopSandbox', - DeleteSandbox = 'DeleteSandbox', - UpdateSandbox = 'UpdateSandbox', -} +The resource plane manages infrastructure resources that exist independently in Kubernetes. 
-export const sandboxEventBus = new EventEmitter() +### Sandbox Lifecycle + +Primary files: + +- `lib/jobs/sandbox/sandboxReconcile.ts` +- `lib/events/sandbox/sandboxListener.ts` +- `lib/repo/sandbox.ts` +- `lib/k8s/sandbox-manager.ts` + +States: + +- `CREATING` +- `STARTING` +- `RUNNING` +- `UPDATING` +- `STOPPING` +- `STOPPED` +- `TERMINATING` +- `TERMINATED` +- `ERROR` + +Flow: + +```text +Sandbox.status = CREATING +-> sandbox reconcile job locks the row +-> emits CreateSandbox +-> listener creates K8s resources and writes ingress URLs +-> Sandbox.status = STARTING +-> later reconcile checks K8s status +-> Sandbox.status = RUNNING ``` -### Event Listeners +Environment updates reuse the same mechanism: -Listeners are registered at application startup: +```text +Environment changed +-> running sandboxes marked UPDATING +-> update sandbox env vars in Kubernetes +-> pod restarts if needed +-> sandbox returns to STARTING or RUNNING +``` -```typescript -// lib/events/sandbox/sandboxListener.ts -export function registerSandboxListeners(): void { - on(Events.CreateSandbox, handleCreateSandbox) - on(Events.StartSandbox, handleStartSandbox) - on(Events.StopSandbox, handleStopSandbox) - on(Events.DeleteSandbox, handleDeleteSandbox) - on(Events.UpdateSandbox, handleUpdateSandbox) -} +### Database Lifecycle -// Auto-register when module is imported -registerSandboxListeners() +Primary files: + +- `lib/jobs/database/databaseReconcile.ts` +- `lib/events/database/databaseListener.ts` +- `lib/repo/database.ts` +- `lib/k8s/database-manager.ts` + +States: + +- `CREATING` +- `STARTING` +- `RUNNING` +- `STOPPING` +- `STOPPED` +- `TERMINATING` +- `TERMINATED` +- `ERROR` + +Flow: + +```text +Database.status = CREATING +-> database reconcile job locks the row +-> emits CreateDatabase +-> listener creates KubeBlocks cluster +-> Database.status = STARTING +-> later reconcile checks cluster status +-> credentials are fetched +-> Database.status = RUNNING ``` -### Event Handler Pattern 
+### Project Status Aggregation -```typescript -async function handleCreateSandbox(payload: SandboxEventPayload): Promise { - const { user, project, sandbox } = payload +Primary file: - if (sandbox.status !== 'CREATING') return +- `lib/util/projectStatus.ts` - try { - const k8sService = await getK8sServiceForUser(user.id) - await k8sService.getSandboxManager().createSandbox({...}) - await updateSandboxStatus(sandbox.id, 'STARTING') - await projectStatusReconcile(project.id) - } catch (error) { - await updateSandboxStatus(sandbox.id, 'ERROR') - } -} -``` +`Project.status` is derived from child resource states. It is not the main driver of work. -## State Management +Priority order: -### Resource Status +1. `ERROR` +2. `CREATING` +3. `UPDATING` +4. all resources equal the same stable state +5. consistent mixed transitions: + - `{RUNNING, STARTING}` -> `STARTING` + - `{STOPPED, STOPPING}` -> `STOPPING` + - `{TERMINATED, TERMINATING}` -> `TERMINATING` +6. otherwise `PARTIAL` -Individual resources (Sandbox, Database) have these states: +When a project has no remaining resources, the project and its environments are deleted. -| Status | Description | -|--------|-------------| -| `CREATING` | K8s resource being initially created | -| `STARTING` | Transitioning from STOPPED to RUNNING | -| `RUNNING` | Active and operational | -| `STOPPING` | Transitioning from RUNNING to STOPPED | -| `STOPPED` | Paused (replicas=0) | -| `UPDATING` | Environment variables being updated | -| `TERMINATING` | Being deleted from K8s | -| `TERMINATED` | Deleted from K8s (soft delete in DB) | -| `ERROR` | Encountered an error | +## Task Plane -### Project Status Aggregation +The task plane manages project-level work that happens after a sandbox is ready. -Project status is **aggregated** from child resources: - -**Priority order**: -1. **ERROR** - At least one resource has ERROR -2. **CREATING** - At least one resource has CREATING -3. **UPDATING** - At least one resource has UPDATING -4. 
**Pure states** - All same status → use that status -5. **Transition states**: - - STARTING: All ∈ {RUNNING, STARTING} - - STOPPING: All ∈ {STOPPED, STOPPING} - - TERMINATING: All ∈ {TERMINATED, TERMINATING} -6. **PARTIAL** - Inconsistent mixed states (manual intervention needed) - -```typescript -// lib/util/projectStatus.ts -export function aggregateProjectStatus( - sandboxes: Sandbox[], - databases: Database[] -): ProjectStatus { - // Implementation follows priority rules above -} -``` +Primary files: -## Resource Lifecycle +- `lib/jobs/project-task/projectTaskReconcile.ts` +- `lib/jobs/project-task/executors/` +- `lib/repo/project-task.ts` +- `lib/services/project-task-dispatcher.ts` -### Sandbox Lifecycle +### Current Task Types -``` -CREATING → STARTING → RUNNING ⇄ STOPPING → STOPPED - ↓ ↓ ↓ ↓ - └──────────┴─────────┴─────────→ ERROR - ↓ - TERMINATING → TERMINATED +- `CLONE_REPOSITORY` +- `INSTALL_SKILL` +- `UNINSTALL_SKILL` +- `DEPLOY_PROJECT` + +Only `CLONE_REPOSITORY` is implemented today. 
+ +### Task States + +- `PENDING` +- `WAITING_FOR_PREREQUISITES` +- `RUNNING` +- `SUCCEEDED` +- `FAILED` +- `CANCELLED` + +### Task Flow + +```text +User imports a GitHub repository +-> app creates Project + Sandbox +-> app creates ProjectTask(type=CLONE_REPOSITORY, status=WAITING_FOR_PREREQUISITES) +-> sandbox reaches RUNNING +-> task reconcile sees prerequisites are now satisfied +-> task executor runs git clone inside the sandbox through ttyd +-> task becomes SUCCEEDED or FAILED ``` -**Transitions**: -- `CREATING → STARTING`: K8s resources created -- `STARTING → RUNNING`: Pod ready -- `RUNNING → STOPPING`: Stop requested -- `STOPPING → STOPPED`: Pod terminated -- `STOPPED → STARTING`: Start requested -- `Any → ERROR`: Operation failed -- `Any → TERMINATING`: Delete requested -- `TERMINATING → TERMINATED`: K8s resources deleted +Task execution data lives in: -### Database Lifecycle +- `payload` + - executor input, such as repo metadata or skill id +- `result` + - executor output, such as imported path +- `error` + - terminal error message for failed tasks + +## Polling and Triggering + +The system uses both polling and direct wake-up triggers. + +### Polling + +Background jobs continuously scan the database: + +- sandbox reconcile job +- database reconcile job +- project task reconcile job + +This is the correctness mechanism. + +### Direct wake-up triggers + +Some transitions accelerate work without replacing polling. + +Example: + +- when a sandbox becomes `RUNNING`, sandbox listeners call `triggerRunnableTasksForProject(projectId)` + +This reduces latency, but correctness still depends on periodic reconcile loops. + +## Locking Model -Same as Sandbox, but for KubeBlocks PostgreSQL clusters. +The system uses database-based optimistic coordination, not an external queue. 
-### Environment Variable Updates +Patterns: -When environment variables change: +- resource rows (`Sandbox`, `Database`) have `lockedUntil` +- task rows (`ProjectTask`) also have `lockedUntil` +- reconcile queries atomically select and lock eligible rows +- row-level transitions are updated in repo helpers +This avoids duplicate processing across concurrent app instances. + +## Kubernetes Integration + +Primary file: + +- `lib/k8s/k8s-service-helper.ts` + +Rule: + +- always obtain Kubernetes access through `getK8sServiceForUser(userId)` + +Why: + +- each user has a user-scoped kubeconfig +- each user operates in a separate namespace +- the app should never perform cluster operations without user scoping + +Kubernetes resources currently managed per project: + +- one sandbox StatefulSet +- one sandbox Service +- three sandbox Ingresses +- optional PostgreSQL cluster through KubeBlocks + +## GitHub Integration + +Primary files: + +- `lib/actions/github.ts` +- `lib/services/github-app.ts` +- `app/api/github/app/callback/route.ts` + +The system uses GitHub App installations, not anonymous repository access. + +Import flow: + +1. user installs GitHub App +2. installation is recorded in `GitHubAppInstallation` +3. user chooses a repository in the import dialog +4. import action verifies repository access against the installation +5. project creation creates a clone task +6. task executor clones the repo into the sandbox using an installation token + +## Design Rules + +### Non-blocking control plane + +User-facing endpoints should write desired state and return. They should not block on Kubernetes creation or long sandbox operations. + +### State machines over ad hoc branching + +If the system needs to resume work later, represent that as persisted state instead of in-memory flags. + +### Resource plane and task plane stay separate + +Use resource states for infrastructure lifecycle. +Use project tasks for asynchronous work that runs on top of ready infrastructure. 
+ +### Polling is the source of truth + +Event-triggered wake-ups are an optimization. Reconcile jobs remain the primary correctness mechanism. + +## Current End-to-End Flows + +### New project + +```text +Create project +-> Project.status = CREATING +-> Sandbox.status = CREATING +-> sandbox reconcile creates and starts sandbox +-> Sandbox.status = RUNNING +-> Project.status aggregates to RUNNING ``` -RUNNING → UPDATING → STARTING → RUNNING + +### Import from GitHub + +```text +Import from GitHub +-> Project + Sandbox created +-> ProjectTask(CLONE_REPOSITORY) created +-> sandbox reconcile drives sandbox to RUNNING +-> project task reconcile runs clone executor +-> task becomes SUCCEEDED or FAILED +-> project remains usable regardless of task outcome ``` -1. Status changes to UPDATING -2. StatefulSet spec updated (triggers pod restart) -3. Status changes to STARTING -4. Pod restarts with new env vars -5. Status changes to RUNNING - -## Key Design Decisions - -### Why StatefulSet instead of Deployment? - -**StatefulSet benefits**: -- **Persistent storage**: Each pod gets its own PVC -- **Stable network identities**: Predictable pod names -- **Ordered deployment**: Graceful startup/shutdown -- **Stateful apps**: Better for databases, caches - -**Trade-offs**: -- Slightly slower scaling -- More complex updates - -### Why Reconciliation Pattern? - -**Benefits**: -- Non-blocking API responses -- Automatic recovery from failures -- Consistent state management -- Easy to monitor and debug -- Idempotent operations - -**Trade-offs**: -- Eventual consistency (up to 3s delay) -- More complex architecture - -### Why HTTP Basic Auth for ttyd? 
- -**Previous approach**: Custom authentication script -- Required maintaining shell script -- Complex session management -- Login popup in browser - -**Current approach**: HTTP Basic Auth (ttyd native) -- No custom scripts needed -- URL-based authentication: `?authorization=base64(user:password)` -- Seamless browser integration -- No login popup - -### Why FileBrowser Integration? - -**Benefits**: -- Web-based file management -- Drag & drop file upload -- TUS protocol for large files -- Session tracking for cwd detection -- No additional authentication (uses own credentials) - -### Why User-Specific Namespaces? - -**Benefits**: -- Multi-tenancy isolation -- Resource quotas per user -- Separate kubeconfig per user -- No cross-user access - -**Implementation**: -- Kubeconfig stored in `UserConfig` table -- Loaded via `getK8sServiceForUser(userId)` -- Each user operates in their own namespace - -## Related Documentation - -- [API Reference](./api.md) - API endpoints and request/response formats -- [Database Schema](./database.md) - Prisma models and relationships -- [Development Guide](./development.md) - Local development and code patterns -- [Operations Manual](./operations.md) - Deployment and K8s operations -- [Troubleshooting](./troubleshooting.md) - Common issues and debugging +### Add database + +```text +Create database +-> Database.status = CREATING +-> database reconcile creates cluster +-> Database.status = STARTING +-> credentials become available +-> Database.status = RUNNING +``` + +### Deploy project + +Planned path: + +```text +User requests deploy +-> create ProjectTask(type=DEPLOY_PROJECT) +-> task reconcile waits for prerequisites +-> deploy executor performs deployment work +``` + +## When To Update This Document + +Update this document whenever one of the following changes: + +- a new persisted state machine is introduced +- resource lifecycle semantics change +- a new task type is added +- ownership of a subsystem moves to a different directory 
+- project-level work stops being sandbox-based From f219ce616670a340be38be04ce21609c87a9d4aa Mon Sep 17 00:00:00 2001 From: Che <30403707+Che-Zhu@users.noreply.github.com> Date: Thu, 12 Mar 2026 10:54:37 +0800 Subject: [PATCH 3/3] chore: update repo guidance --- AGENTS.md | 196 ++++++++++++++++++++++++++++++++++++++++ docs/troubleshooting.md | 6 ++ 2 files changed, 202 insertions(+) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..66c82cd --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,196 @@ +# AGENTS.md + +This file provides guidance to Codex (Codex.ai/code) when working with code in this repository. + +## Project Overview + +**Fulling v2** is an AI-powered development platform that integrates AI Agent ecosystem to provide full-stack development capabilities. Users can import existing projects from GitHub or create new projects directly on the platform. + +**Core Value**: Free users' mental bandwidth through AI Agents. Users focus on development while Agents silently handle complex operations (deployment, infrastructure, etc.) without interruption. + +**Key Features**: +- **Flexible Project Creation**: Import from GitHub repositories or create new projects from scratch +- **Optional Database**: Add PostgreSQL database on-demand when needed +- **AI Agent Ecosystem**: AI agents handle development, testing, deployment, and infrastructure management +- **Automated Operations**: Deployment, scaling, and infrastructure management happen automatically in the background +- **Full-Stack Development**: Complete environment with optional database, terminal, and file management +- **Zero Infrastructure Knowledge Required**: Users don't need to understand Kubernetes, networking, or DevOps + +**Architecture**: The platform uses an **asynchronous reconciliation pattern** where API endpoints return immediately and background jobs sync desired state (database) with actual state (Kubernetes) every 3 seconds. 
+ +## Tech Stack + +### Frontend +- Framework: Next.js 16 (App Router) + React 19 +- Language: TypeScript +- Styling: Tailwind CSS v4 +- UI Components: Shadcn/UI + +### Backend +- Runtime: Node.js 22 +- API: Next.js API Routes +- Database ORM: Prisma +- Authentication: NextAuth v5 (GitHub, Password, Sealos OAuth) + +### Infrastructure +- Container Orchestration: Kubernetes +- Database: PostgreSQL (via KubeBlocks) +- Web Terminal: ttyd (HTTP Basic Auth) +- File Manager: FileBrowser + +## Key Conventions + +### Code Style +- Use TypeScript strict mode +- Always follow best practices +- Write self-documenting code: for complex functions, describe purpose, expected inputs, and expected outputs above the function +- Use functional components with hooks + +### Naming Conventions +- K8s resources: `{k8s-project-name}-{resource-type}-{6chars}` +- Environment variables: `UPPER_SNAKE_CASE` +- Database tables: PascalCase (Prisma models) +- API routes: kebab-case +- Files: kebab-case + +### Component Organization +- **Route-specific components**: Place in `_components/` directory under the route folder + - Use `_` prefix to prevent Next.js from treating it as a route + - Example: `app/(dashboard)/settings/_components/github-status-card.tsx` +- **Shared components**: Place in top-level `components/` directory + - Only for components used across multiple routes + - Example: `components/ui/button.tsx`, `components/sidebar.tsx` + +### Important Patterns + +1. **Always use user-specific K8s service**: + ```typescript + const k8sService = await getK8sServiceForUser(userId) + ``` + +2. **API endpoints are non-blocking**: + - Only update database + - Return immediately + - Reconciliation jobs handle K8s operations + +3. **Use optimistic locking**: + - Repository layer handles locking automatically + - Prevents concurrent updates + +4. 
**Follow reconciliation pattern**: + - API → Database → Reconciliation Job → Event → K8s Operation + - Status updates happen asynchronously + +## Key Design Decisions + +### Why StatefulSet? +- Persistent storage for each pod +- Stable network identities +- Ordered pod deployment + +### Why Reconciliation Pattern? +- Non-blocking API responses +- Automatic recovery from failures +- Consistent state management +- Easy to monitor and debug + +### Why User-Specific Namespaces? +- Multi-tenancy isolation +- Resource quotas per user +- Separate kubeconfig per user +- No cross-user access + +## Project Structure + +``` +Fulling/ +├── app/ # Next.js App Router +│ ├── api/ # API Routes +│ ├── (auth)/ # Auth pages +│ └── (dashboard)/projects/[id]/ +│ └── _components/ # Route-specific components +│ +├── components/ # Shared components +├── hooks/ # Client-side hooks +│ +├── lib/ +│ ├── data/ # Server-side data access (for Server Components) +│ ├── actions/ # Client-side data access (Server Actions) +│ ├── repo/ # Repository layer with optimistic locking +│ ├── services/ # Business logic services +│ ├── events/ # Event bus and listeners +│ ├── jobs/ # Reconciliation background jobs +│ ├── startup/ # Application initialization +│ ├── k8s/ # Kubernetes operations +│ └── util/ # Utility functions +│ +├── prisma/ # Prisma schema +├── provider/ # React Context providers +├── runtime/ # Sandbox Docker image +└── yaml/ # Kubernetes templates +``` + +**Key Directories**: +- `lib/data/` - Server-side data access, used by Server Components +- `lib/actions/` - Server Actions, used by Client Components +- `lib/repo/` - Repository with optimistic locking, used by Jobs/Events +- `lib/events/` + `lib/jobs/` - Core of reconciliation pattern +- `lib/startup/` - Initializes event listeners and reconciliation jobs + +## Documentation Index + +- [Architecture](./docs/architecture.md) - Reconciliation pattern, event system, state management +- [Development Guide](./docs/development.md) - Local 
development, code patterns, testing +- [Operations Manual](./docs/operations.md) - Deployment, monitoring, K8s operations +- [Troubleshooting](./docs/troubleshooting.md) - Common issues, debugging commands + +## Quick Reference + +### Development Commands +```bash +pnpm dev # Start dev server +pnpm build # Build for production +pnpm lint # Run ESLint +npx prisma generate # Generate Prisma client +npx prisma db push # Push schema to database +``` + +### Key Files +- `lib/k8s/k8s-service-helper.ts` - User-specific K8s service +- `lib/events/sandbox/sandboxListener.ts` - Sandbox lifecycle handlers +- `lib/jobs/sandbox/sandboxReconcile.ts` - Sandbox reconciliation job +- `lib/events/database/databaseListener.ts` - Database lifecycle handlers +- `lib/jobs/database/databaseReconcile.ts` - Database reconciliation job +- `lib/actions/project.ts` - Project creation (creates Sandbox only) +- `lib/actions/database.ts` - Database creation/deletion (on-demand) +- `prisma/schema.prisma` - Database schema +- `instrumentation.ts` - Application startup + +### Environment Variables +- `DATABASE_URL` - Main app database connection +- `NEXTAUTH_SECRET` - NextAuth secret +- `GITHUB_CLIENT_ID` / `GITHUB_CLIENT_SECRET` - GitHub OAuth +- `SEALOS_JWT_SECRET` - Sealos OAuth validation +- `RUNTIME_IMAGE` - Container image version + +### Resource Status +- `CREATING` → `STARTING` → `RUNNING` ⇄ `STOPPING` → `STOPPED` +- `UPDATING` - Environment variables being updated +- `TERMINATING` → `TERMINATED` +- `ERROR` - Operation failed + +### Port Exposure +- **3000**: Next.js application +- **7681**: ttyd web terminal +- **8080**: FileBrowser (file manager) + +## Important Notes + +- **Project Resources**: Each project includes a Sandbox (required) and can optionally have a Database (PostgreSQL). Database can be added on-demand after project creation. 
+- **Reconciliation Delay**: Status updates may take up to 3 seconds
+- **User-Specific Namespaces**: Each user operates in their own K8s namespace
+- **Frontend Polling**: Client components poll every 3 seconds for status updates
+- **Database Wait Time**: PostgreSQL cluster takes 2-3 minutes to reach "Running" (when added)
+- **Idempotent Operations**: All K8s methods can be called multiple times safely
+- **Lock Duration**: Optimistic locks held for 30 seconds
+- **Deployment Domain**: Main app listens on `0.0.0.0:3000` (not localhost) for Sealos
diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md
index b7ac6c9..0eb338a 100644
--- a/docs/troubleshooting.md
+++ b/docs/troubleshooting.md
@@ -8,6 +8,7 @@ This document provides solutions for common issues and debugging techniques.
 - [Debugging Commands](#debugging-commands)
 - [Error Messages](#error-messages)
 - [Performance Issues](#performance-issues)
+- [Incident References](#incident-references)
 
 ## Common Issues
 
@@ -258,6 +259,11 @@
 }) // Check query performance
 
+
+## Incident References
+
+- GitHub import delay postmortem:
+  - [project-import-delay-postmortem.md](./project-import-delay-postmortem.md)
 const prisma = new PrismaClient({
  log: ['query', 'info', 'warn', 'error'],
})