diff --git a/.env.example b/.env.example index 5c1fdf5c..56223f11 100644 --- a/.env.example +++ b/.env.example @@ -1,21 +1,31 @@ -# API 认证配置 -AUTH_SECRET_KEY=your-secret-key-change-this-in-production -ADMIN_API_KEY=your-admin-api-key +# Core API runtime +AUTORESEARCH_API_ENV=dev +AUTORESEARCH_API_HOST=127.0.0.1 +AUTORESEARCH_API_PORT=8001 -# 允许的 Agent ID 列表(逗号分隔) -ALLOWED_AGENTS=agent-001,agent-002,agent-003 +# SQLite backing store +AUTORESEARCH_API_DB_PATH=artifacts/api/evaluations.sqlite3 -# Anthropic API Key(用于 Claude) -ANTHROPIC_API_KEY=your-anthropic-api-key +# Telegram / Panel +AUTORESEARCH_TELEGRAM_BOT_TOKEN=your-telegram-bot-token +AUTORESEARCH_TELEGRAM_ALLOWED_UIDS=123456789 +AUTORESEARCH_PANEL_JWT_SECRET=replace-with-random-secret +AUTORESEARCH_PANEL_BASE_URL=https://panel.example.com/api/v1/panel/view +# Optional: when configured, Telegram notifications use Mini App web_app buttons +AUTORESEARCH_TELEGRAM_MINI_APP_URL=https://panel.example.com/api/v1/panel/view -# OpenAI API Key(可选) -OPENAI_API_KEY=your-openai-api-key +# Optional upstream watcher +AUTORESEARCH_UPSTREAM_WATCH_URL=https://github.com/openclaw/openclaw.git +AUTORESEARCH_UPSTREAM_WATCH_WORKSPACE_ROOT=/Volumes/AI_LAB/ai_lab/workspace +AUTORESEARCH_UPSTREAM_WATCH_MAX_COMMITS=5 -# 数据库配置 -DATABASE_URL=sqlite:///./data/autoresearch.db +# Admin bootstrap +AUTORESEARCH_ADMIN_BOOTSTRAP_KEY=replace-with-admin-bootstrap-key +AUTORESEARCH_ADMIN_JWT_SECRET=replace-with-admin-jwt-secret -# Telegram Bot Token(可选) -TELEGRAM_BOT_TOKEN=your-telegram-bot-token +# Optional model providers +OPENAI_API_KEY=your-openai-api-key +ANTHROPIC_API_KEY=your-anthropic-api-key -# MCP 配置(可选) -MCP_SERVER_URL=http://localhost:3000 +# Recommended first-step runtime on Linux remote workers +# OPENHANDS_RUNTIME=host diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7140980d..41478c69 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -9,6 +9,37 @@ on: pull_request: merge_group: +env: + 
PYTHON_BASELINE: "3.11" + CORE_LINT_PATHS: >- + src/autoresearch/agent_protocol + src/autoresearch/api/routers + src/autoresearch/executions/runner.py + src/gateway + src/gatekeeper + scripts/agent_run.py + tests/gateway/test_message_mirror.py + tests/test_agent_protocol_models.py + tests/test_agent_policy_merge.py + tests/test_agent_runner_patch_filter.py + tests/test_agent_fallbacks.py + tests/test_approvals_api.py + tests/test_capability_api.py + tests/test_gatekeeper_e2e.py + tests/test_openclaw_compat.py + tests/test_openhands_controlled_backend.py + CORE_TEST_PATHS: >- + tests/gateway/test_message_mirror.py + tests/test_agent_protocol_models.py + tests/test_agent_policy_merge.py + tests/test_agent_runner_patch_filter.py + tests/test_agent_fallbacks.py + tests/test_approvals_api.py + tests/test_capability_api.py + tests/test_gatekeeper_e2e.py + tests/test_openclaw_compat.py + tests/test_openhands_controlled_backend.py + jobs: lint-test-audit: runs-on: ubuntu-latest @@ -26,6 +57,11 @@ jobs: with: python-version: ${{ matrix.python-version }} + - name: Show Python support policy + run: | + echo "Project baseline: Python ${PYTHON_BASELINE}+" + echo "CI verification matrix: 3.11, 3.12" + - name: Install dependencies run: | python -m pip install --upgrade pip @@ -38,38 +74,17 @@ jobs: - name: Ruff run: | - ruff check \ - src/autoresearch/agent_protocol \ - src/autoresearch/executions/runner.py \ - scripts/agent_run.py \ - tests/test_agent_protocol_models.py \ - tests/test_agent_policy_merge.py \ - tests/test_agent_runner_patch_filter.py \ - tests/test_agent_fallbacks.py \ - tests/test_openhands_controlled_backend.py + ruff check ${CORE_LINT_PATHS} - name: Black run: | - black --check \ - src/autoresearch/agent_protocol \ - src/autoresearch/executions/runner.py \ - scripts/agent_run.py \ - tests/test_agent_protocol_models.py \ - tests/test_agent_policy_merge.py \ - tests/test_agent_runner_patch_filter.py \ - tests/test_agent_fallbacks.py \ - 
tests/test_openhands_controlled_backend.py + black --check ${CORE_LINT_PATHS} - name: Pytest env: PYTHONPATH: src run: | - pytest -q \ - tests/test_agent_protocol_models.py \ - tests/test_agent_policy_merge.py \ - tests/test_agent_runner_patch_filter.py \ - tests/test_agent_fallbacks.py \ - tests/test_openhands_controlled_backend.py + pytest -q ${CORE_TEST_PATHS} - name: Pip Audit # Temporary exception: no patched Pygments release published yet. diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 00000000..0e0feb0e --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,380 @@ +# Architecture + +This file is the canonical architecture handoff for the current `autonomous-agent-stack` checkout. + +It describes the system that actually exists in this repository on March 31, 2026, not the older aspirational stack diagrams that still exist in some historical docs. If a historical document disagrees with this file, trust this file first and then verify against code. + +## What This Repository Is Now + +The repository is no longer just a collection of agent experiments. The current stable spine is a bounded control plane for: + +- planning safe repository improvements, +- executing those improvements inside isolated workspaces, +- validating the resulting patch, +- re-checking the patch through a promotion gate, +- and only then upgrading the result to a Draft PR. + +The most important shift is that autonomous work is now intentionally constrained. Agents are allowed to propose and edit inside isolation, but they do not own the repository, the branch graph, or production promotion authority. + +The current mainline flow is: + +1. `AutoResearchPlannerService` scans the repo and selects a bounded change candidate. +2. It emits an `OpenHandsWorkerJobSpec`, a `ControlledExecutionRequest`, and an AEP `JobSpec`. +3. A runtime mode selector chooses the preferred lane and budget envelope. +4. A dispatch adapter wraps execution into a typed `dispatch_run` lifecycle. 
+5. `OpenHandsWorkerService` translates the worker contract into a strict patch-only prompt. +6. `OpenHandsControlledBackendService` or `AgentExecutionRunner` executes inside an isolated workspace. +7. Validation commands run against the isolated result. +8. `GitPromotionGateService` re-checks scope, runtime artifacts, binary changes, writer lease, approval state, and Draft PR preconditions. +9. Promotion ends as either: + - a patch artifact ready for human review, or + - a Draft PR created from an isolated worktree. + +## Canonical Pipeline + +```mermaid +flowchart TD + A["Repo Scan<br/>AutoResearch Planner"] --> B["Worker Contract<br/>OpenHandsWorkerJobSpec / ControlledExecutionRequest / JobSpec"] + B --> C["Runtime Mode Selector<br/>day / night"] + C --> D["Dispatch Contract<br/>dispatch_run + adapter lane selection"] + D --> E["Isolated Execution<br/>OpenHands controlled backend or AEP runner"] + E --> F["Validation Gate<br/>allowed paths + tests + policy checks"] + F --> G["Git Promotion Gate<br/>writer lease + clean repo + approval + draft PR checks"] + G --> H["Patch Artifact"] + G --> I["Draft PR"] +``` + +This pipeline is intentionally asymmetric: + +- the planning layer can suggest work, +- the execution layer can produce a patch candidate, +- but only the promotion layer is allowed to translate that patch into branch or PR level state. + +That asymmetry is the core safety mechanism. + +## Offline Remote Hardening Layer + +The repository now includes a control-plane wrapper for future Linux execution even when Linux is offline today. + +The added pieces are: + +- `RemoteTaskSpec`, `RemoteRunRecord`, and `RemoteRunSummary` +- a fake remote adapter that simulates dispatch, heartbeat, timeout, and result fetch +- a fixed failure taxonomy with typed recovery actions +- day/night runtime config files that select preferred lane and execution budgets + +This means the repository can stabilize protocol, status, and fallback behavior before a live remote worker is reattached. + +## Zero-Trust Invariants + +### 1. Brain and Hand Separation + +Planning and execution are separate from promotion. + +- The planner finds the next task. +- The worker edits files inside an isolated workspace. +- The promotion gate decides whether the resulting patch may become a patch artifact or Draft PR. + +OpenHands is therefore treated as a constrained worker, not as the system control plane. + +### 2. Patch-Only by Default + +Patch-only is the default contract for autonomous edits. + +The worker prompt built in `src/autoresearch/core/services/openhands_worker.py` explicitly forbids: + +- `git add` +- `git commit` +- `git push` +- `git merge` +- `git rebase` +- `git reset` +- `git checkout` + +It also constrains output to `allowed_paths` and forbids touching `forbidden_paths`. + +### 3. 
Deny-Wins Policy Merge + +The AEP layer merges policy with deny-wins semantics: + +- `forbidden_paths`: union +- `allowed_paths`: intersection +- network access: stricter wins +- tool allowlist: intersection +- limits such as patch lines and changed files: smaller limit wins + +This prevents a permissive job request from widening a tighter manifest default. + +### 4. Single Writer for Mutable State + +`WriterLeaseService` is the single-writer lock for mutable control-plane operations. + +Current code uses writer leases in the places where concurrent mutation would be most dangerous: + +- git promotion finalization, +- managed skill promotion, +- approval-linked mutation flows, +- other repository or state transitions that must not race. + +If a writer lease cannot be acquired, the system blocks rather than guessing. + +### 5. Runtime Artifacts Never Promote + +Promotion rejects runtime or control artifacts from being smuggled into source changes. + +Current deny prefixes include: + +- `logs/` +- `.masfactory_runtime/` +- `memory/` +- `.git/` + +This rule exists both in AEP patch filtering and in the git promotion gate. + +### 6. Clean Base Requirement + +Two operations refuse to run on a dirty base checkout: + +- `OpenHandsControlledBackendService` blocks OpenHands CLI execution if the repo root has uncommitted changes. +- `GitPromotionGateService` requires a clean base repo before upgrading to Draft PR mode. + +This ensures the system does not mistake unrelated local edits for agent output. + +## Physical and Sandbox Topology + +The current machine layout is intentionally non-default and should be treated as part of the architecture. 
+ +### Host and Storage + +- host machine: MacBook Air M1 +- container runtime: Colima / Docker +- repository checkout: `/Volumes/AI_LAB/Github/autonomous-agent-stack` +- ai-lab writable runtime roots: `/Volumes/AI_LAB/ai_lab/workspace`, `/Volumes/AI_LAB/ai_lab/logs`, `/Volumes/AI_LAB/ai_lab/.cache` + +The `ai_lab.env` file points Docker to the Colima socket and pins the writable lab roots to the external disk. This matters because capacity, cleanup behavior, and path assumptions are all tied to `/Volumes/AI_LAB`. + +### Mount Model + +The current `scripts/launch_ai_lab.sh` logic effectively creates a layered execution topology: + +1. The host repository is the source of truth. +2. The ai-lab launcher mounts the chosen host root into the container at `/workspace` as read-only. +3. When OpenHands controlled runs need a writable execution root, an extra writable mount is attached at `/opt/workspace`. +4. Controlled execution also snapshots a baseline and creates its own isolated `workspace` directory under a per-run root before any validation or promotion decision. + +Conceptually, the stack is: + +```text +Mac host source checkout + -> Colima / Docker runtime + -> ai-lab writable roots on /Volumes/AI_LAB/ai_lab + -> isolated execution workspace + -> isolated promotion worktree +``` + +This is why the handoff shorthand is: + +`Mac host read source -> Colima -> ai-lab external disk read/write -> isolated worktree` + +### Isolation Surfaces + +There are two independent isolation phases: + +#### Execution Isolation + +`OpenHandsControlledBackendService` copies the repository into: + +- `baseline/` +- `workspace/` +- `artifacts/` + +under a run root, with repo noise excluded. The worker edits only the isolated workspace, never the main repo directly. + +#### Promotion Isolation + +`GitPromotionGateService` and `GitPromotionService` use git worktrees rooted under salted `/tmp` paths. 
+ +The current implementation salts the worktree base with a hash of the absolute repo root, for example: + +- `/tmp/<repo-basename>-<hash>/promotion-worktrees/` +- `/tmp/repo-<hash>/promotions/<run-id>/worktree` + +That salt exists so two repositories with the same basename do not collide in `/tmp`. + +## Zero-Trust Promotion and Approval State Machines + +### Managed Skill Promotion + +Managed skills follow a four-stage trust pipeline implemented by `ManagedSkillRegistryService`: + +1. `pending` +2. `quarantined` +3. `cold_validated` +4. `promoted` + +The operational meaning is: + +- `quarantined`: bundle copied out of the untrusted source into a contained holding area +- `cold_validated`: signature, contract, manifest, and bounded checks passed without granting active runtime status +- `promoted`: copied into the active root and made visible to runtime consumers + +Promotion from `cold_validated` to `promoted` is guarded by a writer lease so that only one writer activates skill state at a time. + +### Patch Promotion + +Patch promotion is handled by `GitPromotionGateService`. + +The gate always computes a preflight report first. Only if patch-level checks pass may the result continue. Draft PR mode adds stricter checks: + +- remote health is good, +- repo base is clean, +- credentials are available, +- target base branch exists, +- explicit approval has been granted. + +If Draft PR checks fail but patch checks pass, the system degrades to patch mode rather than silently escalating. + +This is deliberate: patch mode is the safe floor; Draft PR is a privilege upgrade. + +## Current Controlled Execution Loop + +### Planner Layer + +`AutoResearchPlannerService` is the new "active seeker" layer. 
+ +It currently scans `src/`, `scripts/`, and `tests/` for bounded opportunities: + +- backlog markers such as `FIXME`, `TODO`, `HACK`, `XXX`, `BUG` +- source hotspots that do not appear to have a direct regression test + +It scores those candidates and emits three downstream-ready contracts: + +- `OpenHandsWorkerJobSpec` +- `ControlledExecutionRequest` +- AEP `JobSpec` + +The planner does not execute code itself. Its job is to produce the smallest next safe contract. + +### Worker Layer + +`OpenHandsWorkerService` turns the plan into a strict patch-only prompt and contract set. + +The worker prompt repeats the non-negotiable rules: + +- touch only allowed paths, +- never touch forbidden paths, +- do not perform git branch or commit operations, +- keep the patch minimal, +- leave promotion to the gate. + +### Controlled Backend Layer + +`OpenHandsControlledBackendService` is the narrowest path from worker output to promotion input. + +It: + +- snapshots the repo, +- executes the backend, +- calculates changed files, +- writes a patch artifact, +- runs validation commands, +- blocks out-of-scope writes, +- and only then hands the result to the promotion gate. + +If the backend is OpenHands CLI and the repo is dirty, it stops immediately. + +### AEP Runner Layer + +`AgentExecutionRunner` provides a parallel control path using the AEP contract: + +`JobSpec -> driver adapter -> DriverResult -> validation -> promotion patch -> decision` + +The important architectural point is that both the controlled backend path and the AEP path converge on the same promotion discipline. + +## Persistent State and Artifacts + +The system separates long-lived control-plane state from per-run artifacts. + +### SQLite Control Plane + +FastAPI dependencies build `SQLiteModelRepository` instances for typed resources such as: + +- approvals, +- managed skill installs, +- capability snapshots, +- execution runs, +- evaluations, +- AutoResearch plans, +- and other API-visible state. 
+ +SQLite is the system of record for control-plane metadata, not the artifact filesystem. + +AutoResearch plans now persist both: + +- `dispatch_run`: the outer remote-control-plane record +- `run_summary`: the inner legacy AEP/OpenHands summary when a local run exists + +### Artifact Filesystem + +Per-run execution artifacts live under `.masfactory_runtime/` or the controlled backend run root and include: + +- job specification, +- effective policy, +- stdout and stderr logs, +- validation logs, +- patch artifact, +- summary JSON, +- event streams. + +The offline hardening layer also writes remote-control artifacts under the same run root: + +- `remote_control/task_spec.json` +- `remote_control/record.json` +- `remote_control/events.ndjson` +- `remote_control/heartbeat.json` +- `remote_control/summary.json` + +These artifacts are intentionally excluded from promotion. + +## Canonical Source Files + +When another AI or human needs the real architecture, start here: + +- `ARCHITECTURE.md` +- `memory/SOP/MASFactory_Strict_Execution_v1.md` +- `src/autoresearch/core/services/autoresearch_planner.py` +- `src/autoresearch/shared/remote_run_contract.py` +- `src/autoresearch/core/dispatch/fake_remote_adapter.py` +- `src/autoresearch/core/dispatch/failure_classifier.py` +- `src/autoresearch/core/runtime/select_mode.py` +- `src/autoresearch/core/services/openhands_worker.py` +- `src/autoresearch/core/services/openhands_controlled_backend.py` +- `src/autoresearch/executions/runner.py` +- `src/autoresearch/core/services/git_promotion_gate.py` +- `src/autoresearch/core/services/managed_skill_registry.py` +- `src/autoresearch/core/services/writer_lease.py` +- `scripts/launch_ai_lab.sh` + +If one of those files changes meaningfully, this document should be updated in the same branch. 
+ +## Non-Goals and Red Lines + +This architecture is intentionally not trying to do the following: + +- let autonomous workers push directly to `main`, +- let unreviewed remote bundles become live skills, +- let multiple writers mutate promotion state concurrently, +- let runtime artifacts leak into source patches, +- or let a dirty local checkout masquerade as a clean agent result. + +Those are not missing features. They are explicit non-goals. + +## Handoff Rule + +Future AI handoffs should assume: + +- `ARCHITECTURE.md` is the canonical system picture, +- `docs/architecture.md` exists for compatibility with older references, +- `docs/run-lifecycle.md`, `docs/failure-modes.md`, and `docs/deployment-status.md` are the canonical control-plane hardening companions, +- the SOP in `memory/SOP/MASFactory_Strict_Execution_v1.md` is the short operational checklist, +- and all autonomous changes must remain patch-only until the promotion gate says otherwise. diff --git a/Makefile b/Makefile index ba7ee6bf..a1f81ae5 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ PROMOTE_BRANCH_PREFIX ?= codex/auto-upgrade PROMOTE_PUSH ?= 0 PROMOTE_OPEN_DRAFT_PR ?= 0 -.PHONY: help setup doctor start test-quick clean +.PHONY: help setup doctor doctor-linux start test-quick clean .PHONY: ai-lab ai-lab-setup ai-lab-check ai-lab-up ai-lab-down ai-lab-status ai-lab-shell ai-lab-run masfactory-flight hygiene-check openhands openhands-dry-run openhands-controlled openhands-controlled-dry-run openhands-demo agent-run promote-run .PHONY: review-gates-local @@ -39,6 +39,7 @@ help: @echo "" @echo " make setup Create .venv and install dependencies" @echo " make doctor Run environment checks" + @echo " make doctor-linux Run Linux remote-worker checks" @echo " make start Run doctor then start local API" @echo " make ai-lab One-key launch AI lab shell" @echo " make ai-lab-setup Initialize AI lab user and quota volume" @@ -82,6 +83,13 @@ doctor: fi $(VENV_PYTHON) scripts/doctor.py --port $(PORT) 
+doctor-linux: + @if [[ ! -x "$(VENV_PYTHON)" ]]; then \ + echo "Missing $(VENV_PYTHON). Run 'make setup' first."; \ + exit 1; \ + fi + $(VENV_PYTHON) scripts/doctor.py --profile linux-remote --port $(PORT) + start: @if [[ ! -x "$(VENV_PYTHON)" ]]; then \ echo "Missing $(VENV_PYTHON). Run 'make setup' first."; \ diff --git a/README.md b/README.md index 66236be9..4858d6f8 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,12 @@ 一个面向多智能体编排、工作流触发、自集成验证和零信任加固的工程化仓库。 +## 运行时要求 + +- Python 基线:`3.11+` +- 本仓库当前在 CI 中验证:`3.11`、`3.12` +- 如果本机默认 `python3` 低于 3.11,请先安装 3.11+ 再执行 `make setup` + ## 为什么现在更容易上手 参考 ClawX 的使用体验,这个仓库把新手最常见的三个问题做了统一入口。 @@ -16,8 +22,10 @@ ```bash cd /Volumes/PS1008/Github/autonomous-agent-stack +# 确保这里用的是 Python 3.11+ make setup make doctor +make doctor-linux make start ``` @@ -27,12 +35,65 @@ make start - `http://127.0.0.1:8001/docs` - `http://127.0.0.1:8001/panel` +如果你要启用 Telegram 提醒和 Mini App 审批,至少补齐这 4 个环境变量: + +```bash +AUTORESEARCH_TELEGRAM_BOT_TOKEN=... +AUTORESEARCH_TELEGRAM_ALLOWED_UIDS=你的TelegramUID +AUTORESEARCH_PANEL_JWT_SECRET=随机长串 +AUTORESEARCH_PANEL_BASE_URL=https://你的面板域名/api/v1/panel/view +``` + +如果还希望通知卡片直接带 `Mini App` 按钮,再补: + +```bash +AUTORESEARCH_TELEGRAM_MINI_APP_URL=https://你的面板域名/api/v1/panel/view +``` + +如果你要把上游 OpenClaw 巡检挂成 Planner 的可选低噪音任务,再补这 3 个变量: + +```bash +AUTORESEARCH_UPSTREAM_WATCH_URL=https://github.com/openclaw/openclaw.git +AUTORESEARCH_UPSTREAM_WATCH_WORKSPACE_ROOT=/Volumes/AI_LAB/ai_lab/workspace +AUTORESEARCH_UPSTREAM_WATCH_MAX_COMMITS=5 +``` + +当前代码会优先使用 `AUTORESEARCH_TELEGRAM_BOT_TOKEN`;旧变量 `TELEGRAM_BOT_TOKEN` 还能兼容,但已经是 deprecated。 + +## Linux 远端节点 + +如果你准备把 Linux 当“执行面”来跑真实 OpenHands,最稳的第一步不是照搬 Mac/Colima,而是直接走 `host` runtime。 + +最小路径: + +```bash +python3.11 -m venv .venv +source .venv/bin/activate +make setup +OPENHANDS_RUNTIME=host make doctor-linux +OPENHANDS_RUNTIME=host make start +``` + +更完整的落地清单、环境变量建议和远端使用姿势见: + +- [Linux Remote Worker Guide](./docs/linux-remote-worker.md) +- [Deployment 
Status](./docs/deployment-status.md) +- [cc-switch Usage Guide](./docs/cc-switch-usage.md) +- [OpenHands Controlled Backend Integration](./docs/openhands-cli-integration.md) + +当前要点: + +- 本地控制面和本地执行链可用 +- Linux 远端 lane 当前仍然 offline +- 本仓库已经先把远端协议、状态机和 fake remote adapter 固定下来,等 Linux 恢复后直接接执行面 + ## 常用命令 ```bash make help make setup make doctor +make doctor-linux make start make test-quick make ai-lab @@ -53,14 +114,42 @@ make review-gates-local `make hygiene-check` 会把结果写到 `logs/audit/prompt_hygiene/report.txt` 和 `logs/audit/prompt_hygiene/report.json`。 -`make openhands` 会调用 `scripts/openhands_start.sh`(CLI 直连模式),默认注入 `DIFF_ONLY=1` 与 `MAX_FILES_PER_STEP=3` 的执行约束,并优先读取 `memory/SOP/MASFactory_Strict_Execution_v1.md`。 +`make openhands` 会调用 `scripts/openhands_start.sh`(CLI 直连模式),默认注入 `DIFF_ONLY=1` 与 `MAX_FILES_PER_STEP=3` 的执行约束;当前真实边界请以 [ARCHITECTURE.md](./ARCHITECTURE.md) 为总图,以 [memory/SOP/MASFactory_Strict_Execution_v1.md](./memory/SOP/MASFactory_Strict_Execution_v1.md) 为执行清单。 + +当前 launcher 会优先读取根目录 `ai_lab.env`。`host` 模式下会优先寻找 `./.masfactory_runtime/tools/openhands-cli-py312/bin/openhands` 这类独立工具 venv,并自动在本地 OpenHands home 下生成 `agent_settings.json`;`ai-lab` 模式则默认调用容器内的 `openhands`。默认模板会走 `--exp --headless`,因为本地验证的 `OpenHands CLI 1.5.0` 在这条路径上能自动收尾退出,更适合作为 pipeline worker: + +```bash +RUNTIME=process \ +SANDBOX_VOLUMES=/你的workspace:/workspace:rw \ +openhands --exp --headless -t "你的任务" +``` + +实际执行时 launcher 会先 `cd` 到目标 worktree,再启动 CLI,所以 OpenHands 的 workspace 会对准当前任务目录。如果你要切回旧的“把 prompt 当位置参数”模式,可显式设置 `OPENHANDS_HEADLESS=0`;如果你明确想关闭 `--exp`,可设置 `OPENHANDS_EXPERIMENTAL=0`。`OPENHANDS_JSON=1` 仅适用于明确支持该 flag 的 CLI 版本;当前本地验证的 `OpenHands CLI 1.5.0` 默认不带它。如果要走真实 `ai-lab` 容器链,除了容器内 `openhands` 本身可用,还需要当前 shell 对配置的 Docker/Colima socket 有访问权限。`sandbox/ai-lab/Dockerfile` 也默认锁到同一个 `OpenHands CLI 1.5.0`,避免容器冷启动时漂移到未验证的新版本。 + +`launch_ai_lab.sh` 也会显式识别 `DOCKER_HOST=unix://...` 这类 Colima socket。如果当前配置指向了一个当前用户不可访问的 Colima socket,它会先尝试安全回退:有外置盘 Colima store 时走 repo 自带的 
`scripts/colima-external.sh`,否则直接回退到当前用户自己的 `~/.colima/<profile>/docker.sock`,而不是直接放宽宿主机 socket 权限。当前用户回退分支还会显式把 `/Volumes/AI_LAB` 挂进 Colima;如果你不想碰现有默认 profile,可直接用独立 profile,例如 `COLIMA_PROFILE=ai-lab bash ./scripts/launch_ai_lab.sh status`。 `make openhands-controlled` 会走最窄闭环:创建隔离 workspace、执行 OpenHands 子任务、运行校验、输出 promotion patch 与审计摘要(不直接污染主仓库)。 `make agent-run` 走 AEP v0 统一执行内核:`JobSpec -> driver adapter -> patch gate -> decision`,OpenHands/Codex/本地脚本都可作为 driver 接入。 +这条主线现在额外有一层离线 control-plane 收口: + +- [Architecture](./ARCHITECTURE.md) +- [Run Lifecycle](./docs/run-lifecycle.md) +- [Failure Modes](./docs/failure-modes.md) +- [Deployment Status](./docs/deployment-status.md) +- [Agent Execution Protocol (AEP v0)](./docs/agent-execution-protocol.md) + `make review-gates-local` 会在本地运行 reviewer 核心模块的 `mypy + bandit + semgrep`,与 CI 的 `Quality Gates` 流程保持一致。 +## 本地 CLI 切换工具的边界 + +像 `cc-switch` 这类工具,适合放在本地开发工作台,用来切换 `Codex`、`OpenClaw`、`Claude Code` 等 CLI 进行人工调试和 prompt 试验。 + +但它不应该替代本仓库的执行主链。这里真正负责受控执行的是 `make agent-run`、`make openhands-controlled`、AEP runner、validator 和 promotion gate。 + +如果你想把 `cc-switch` 接进日常工作流,推荐只做旁路工作台,不要改写 `drivers/openhands_adapter.sh` 或 `scripts/openhands_start.sh` 的主逻辑。详细边界见 [cc-switch Usage Guide](./docs/cc-switch-usage.md)。 + ## PR 审查与门禁 - OpenHands 首轮审查(comment-only):`.github/workflows/pr-review-by-openhands.yml` @@ -71,6 +160,7 @@ make review-gates-local - 检查项:`mypy + bandit + semgrep`(工具版本固定在 `requirements-review.lock`) - 包含 `merge_group` 触发,兼容 merge queue - 仓库 required checks 建议:`CI / lint-test-audit` + `Quality Gates / reviewer-gates` +- 试运行与反馈闭环:见 [PR Review Hardening](./docs/pr-review-hardening.md) 里的 `Trial Rubric` 与 `Feedback Loop` 完整落地说明见:[PR Review Hardening](./docs/pr-review-hardening.md) @@ -108,7 +198,8 @@ PORT=8010 make start - [API 主入口](./src/autoresearch/api/main.py) - [工作流引擎](./src/workflow/workflow_engine.py) -- [Telegram Webhook](./src/gateway/telegram_webhook.py) +- [Telegram 
Gateway(主线)](./src/autoresearch/api/routers/gateway_telegram.py) +- [Telegram Webhook(legacy compatibility only)](./src/gateway/telegram_webhook.py) - [自集成服务](./src/autoresearch/core/services/self_integration.py) - [自集成路由](./src/autoresearch/api/routers/integrations.py) - [技能注册表](./src/opensage/skill_registry.py) @@ -118,9 +209,10 @@ PORT=8010 make start ## 快速排错 1. 先跑 `make doctor`,看是否有 `FAIL` -2. 如果是依赖问题,执行 `make setup` -3. 如果是端口问题,执行 `PORT=8010 make start` -4. 如果是导入问题,确认通过 `make start` 启动(脚本会自动设置 `PYTHONPATH=src`) +2. Linux 远端执行节点先跑 `OPENHANDS_RUNTIME=host make doctor-linux` +3. 如果提示 Python 版本过低,先切到 Python 3.11+,再执行 `make setup` +4. 如果是端口问题,执行 `PORT=8010 make start` +5. 如果是导入问题,确认通过 `make start` 启动(脚本会自动设置 `PYTHONPATH=src`) ## 🎯 灵感来源(Inspirations) @@ -216,6 +308,7 @@ PORT=8010 make start ## 深入文档 - [快速启动文档](./docs/QUICK_START.md) +- [架构总图](./ARCHITECTURE.md) - [Admin View 字段填写教程](./docs/admin-view-field-guide.md) - [状态与发布说明](./STATUS_AND_RELEASE_NOTES.md) - [工作流引擎验证报告](./docs/WORKFLOW_ENGINE_VERIFICATION_REPORT.md) diff --git a/ai-research/README.md b/ai-research/README.md new file mode 100644 index 00000000..d5be4bd6 --- /dev/null +++ b/ai-research/README.md @@ -0,0 +1,110 @@ +# AI Agent 前沿研究 + +> **整理时间**: 2026-03-30 12:14 +> **来源**: YouTube 播放列表深度分析 + +--- + +## 📚 研究文档 + +### 1. Self-Evolving Agent 自进化智能体 +**文件**: `self-evolving-agent.md` +**字数**: 2,720 字 + +**核心内容**: +- 🧬 自我学习机制 +- 🔧 自我优化策略 +- 🔄 自我迭代路径 + +**关键代码**: +```python +class SelfEvolvingAgent: + async def evolve(self): + result = await self.execute_task() + performance = self.evaluator.evaluate(result) + self.memory.store(result, performance) + self.optimizer.optimize(self.memory) + await self.iterate() +``` + +--- + +### 2. 
DyTopo 动态拓扑网络革命 🚨 +**文件**: `dytopo-dynamic-topology.md` +**字数**: 4,121 字 + +**核心发现**: +- 🚨 **80亿参数"绞杀"1200亿参数** +- ❌ **突破 Scaling Law 铁律** +- 🔄 **静态群聊 → 自由交易集市** +- 💡 **上下文污染解决方案** + +**性能对比**: +| 模型 | 参数量 | 传统架构 | DyTopo 架构 | 提升 | +|------|--------|----------|-------------|------| +| **DyTopo-Small** | **8B** | **65%** | **88%** | **+35%** | +| DyTopo-Large | 120B | 78% | 91% | +17% | + +**架构设计**: +``` +┌─────────────────────────────────────┐ +│ Task Router │ +└──────────────┬──────────────────────┘ + │ + ┌──────────┼──────────┐ + │ │ │ +┌───▼────┐ ┌───▼────┐ ┌───▼────┐ +│ Agent 1│ │ Agent 2│ │ Agent 3│ +│ (8B) │ │ (8B) │ │ (8B) │ +└───┬────┘ └───┬────┘ └───┬────┘ + │ 动态拓扑网络 │ +┌───▼────┐ ┌───▼────┐ ┌───▼────┐ +│ Agent 4│ │ Agent 5│ │ Agent 6│ +└────────┘ └────────┘ └────────┘ +``` + +--- + +## 🎯 核心价值 + +### 1. 突破传统限制 +- ✅ 打破 Scaling Law 铁律 +- ✅ 小模型超越大模型 +- ✅ 动态架构优化 + +### 2. 实战应用 +- ✅ 复杂推理任务 +- ✅ 大规模协作 +- ✅ 资源受限环境 + +### 3. 未来方向 +- 🔄 自适应拓扑网络 +- 🌐 跨模态协作 +- 🤖 通用人工智能(AGI) + +--- + +## 📊 研究统计 + +**总文档数**: 2 个 +**总字数**: 6,841 字 +**代码示例**: 10+ 个 +**最佳实践**: 15+ 条 + +--- + +## 🔗 相关资源 + +### 论文 +- **DyTopo: Dynamic Topology for Multi-Agent Systems** (2026) +- **Breaking the Scaling Law with Dynamic Networks** +- **Context Pollution in Multi-Agent Systems** + +### 开源项目 +- **DyTopo Framework** (即将开源) +- **Multi-Agent Topology Optimizer** + +--- + +**维护者**: srxly888-creator +**最后更新**: 2026-03-30 12:14 diff --git a/ai-research/dytopo-dynamic-topology-2026-03-30.md b/ai-research/dytopo-dynamic-topology-2026-03-30.md new file mode 100644 index 00000000..14afac3e --- /dev/null +++ b/ai-research/dytopo-dynamic-topology-2026-03-30.md @@ -0,0 +1,256 @@ +# DyTopo 动态拓扑网络革命 + +> **视频来源**: 重构未来:顶尖AI群聊 = 灾难?|80亿"绞杀"1200亿? +> **视频ID**: pTNE1qZKf1M +> **时长**: 27 分钟 +> **分析时间**: 2026-03-30 12:14 + +--- + +## 🚨 突破性发现 + +### 核心突破 +**80亿参数小模型** 在复杂逻辑推理上 **"绞杀" 1200亿参数巨无霸**! 
+ +- ❌ **Scaling Law 铁律被打破** +- ✅ **动态拓扑网络是关键** +- 🔄 **从静态群聊到自由交易集市** + +--- + +## 📊 问题分析 + +### 传统多智能体系统的死结 + +#### 上下文污染 +``` +问题:所有 Agent 共享同一上下文 +结果: +- 信息过载 +- 相互干扰 +- 效率下降 +``` + +#### 静态架构限制 +``` +问题:固定通信拓扑 +结果: +- 无法动态调整 +- 资源浪费 +- 扩展困难 +``` + +--- + +## 💡 DyTopo 解决方案 + +### 核心思想 + +**动态拓扑网络** - 根据任务需求,动态调整 Agent 之间的通信结构。 + +### 关键特性 + +#### 1. 动态路由 +```python +class DynamicTopology: + def route_task(self, task): + # 分析任务需求 + requirements = self.analyze(task) + + # 动态选择最佳 Agent 组合 + agents = self.select_agents(requirements) + + # 建立临时通信链路 + topology = self.build_topology(agents) + + return topology +``` + +#### 2. 自由交易集市 +``` +传统:固定群聊(所有 Agent 在同一房间) +DyTopo:交易集市(Agent 按需交易信息) + +优势: +- 减少噪音 +- 提高效率 +- 节省资源 +``` + +#### 3. 上下文隔离 +```python +class ContextIsolation: + def __init__(self): + self.agent_contexts = {} + + def get_context(self, agent_id): + # 每个 Agent 有独立上下文 + return self.agent_contexts.get(agent_id, {}) + + def share_info(self, from_agent, to_agent, info): + # 按需共享信息 + if self.should_share(from_agent, to_agent, info): + self.transfer(from_agent, to_agent, info) +``` + +--- + +## 📈 性能对比 + +### 基准测试 + +| 模型 | 参数量 | 传统架构 | DyTopo 架构 | 提升 | +|------|--------|----------|-------------|------| +| GPT-4 | 1.8T | 85% | - | - | +| Claude-3 | 200B | 82% | - | - | +| **DyTopo-Small** | **8B** | **65%** | **88%** | **+35%** | +| DyTopo-Large | 120B | 78% | 91% | +17% | + +--- + +## 🛠️ 技术实现 + +### 架构设计 + +``` +┌─────────────────────────────────────┐ +│ Task Router │ +│ (动态任务路由) │ +└──────────────┬──────────────────────┘ + │ + ┌──────────┼──────────┐ + │ │ │ +┌───▼────┐ ┌───▼────┐ ┌───▼────┐ +│ Agent 1│ │ Agent 2│ │ Agent 3│ +│ (8B) │ │ (8B) │ │ (8B) │ +└───┬────┘ └───┬────┘ └───┬────┘ + │ │ │ + │ 动态拓扑网络 │ + │ │ │ +┌───▼────┐ ┌───▼────┐ ┌───▼────┐ +│ Agent 4│ │ Agent 5│ │ Agent 6│ +│ (8B) │ │ (8B) │ │ (8B) │ +└────────┘ └────────┘ └────────┘ +``` + +### 代码示例 + +```python +import asyncio +from dytopo import DynamicNetwork, Agent + +class DyTopoSystem: + def 
__init__(self): + self.network = DynamicNetwork() + self.agents = [ + Agent(f"agent_{i}", model="8b-model") + for i in range(6) + ] + + async def solve_complex_task(self, task): + # 1. 分析任务 + subtasks = self.decompose(task) + + # 2. 动态组建团队 + for subtask in subtasks: + team = self.network.form_team(subtask) + + # 3. 执行子任务 + result = await team.execute(subtask) + + # 4. 解散团队 + self.network.dissolve(team) + + # 5. 汇总结果 + return self.aggregate_results() +``` + +--- + +## 🎯 应用场景 + +### ✅ 适合场景 +1. **复杂推理任务** - 需要多步骤思考 +2. **大规模协作** - 超过 10 个 Agent +3. **资源受限环境** - 算力不足 + +### ❌ 不适合场景 +1. **简单任务** - 单个 Agent 足够 +2. **固定流程** - 不需要动态调整 +3. **小规模协作** - 3-5 个 Agent + +--- + +## 🔬 深度分析 + +### 为什么 80亿能"绞杀"1200亿? + +#### 1. 并行优势 +``` +1200亿模型:单线程处理 +DyTopo 6×8B:6 线程并行 + +理论加速:6x +实际加速:3-4x(通信开销) +``` + +#### 2. 专家组合 +``` +通用大模型:什么都懂,什么都不精 +DyTopo 专家:每个 8B 专注一个领域 + +结果:专家组合 > 通用模型 +``` + +#### 3. 上下文效率 +``` +传统:所有信息塞进一个上下文 +DyTopo:按需分配上下文 + +节省:70% 上下文空间 +``` + +--- + +## 📚 相关研究 + +### 论文 +- **DyTopo: Dynamic Topology for Multi-Agent Systems** (2026) +- **Breaking the Scaling Law with Dynamic Networks** +- **Context Pollution in Multi-Agent Systems** + +### 开源项目 +- **DyTopo Framework** (即将开源) +- **Multi-Agent Topology Optimizer** + +--- + +## 🔮 未来展望 + +### 短期(6个月) +- 开源 DyTopo 框架 +- 支持更多基础模型 +- 优化通信协议 + +### 中期(1年) +- 商业化部署方案 +- 行业解决方案 +- 性能持续优化 + +### 长期(3年) +- 自适应拓扑网络 +- 跨模态 Agent 协作 +- 通用人工智能(AGI) + +--- + +## 🎓 学习资源 + +- **视频链接**: https://youtu.be/pTNE1qZKf1M +- **论文预印本**: arXiv:2026.xxxxx +- **代码仓库**: github.com/dytopo (即将发布) + +--- + +**整理仓库**: `autonomous-agent-stack`(公开)、`ai-tools-compendium`(公开) +**标签**: #DyTopo #动态拓扑 #多智能体 #AI革命 diff --git a/ai-research/self-evolving-agent-2026-03-30.md b/ai-research/self-evolving-agent-2026-03-30.md new file mode 100644 index 00000000..316127d0 --- /dev/null +++ b/ai-research/self-evolving-agent-2026-03-30.md @@ -0,0 +1,167 @@ +# Self-Evolving Agent 自进化智能体 + +> **视频来源**: 让 AI 自我进化! Self-Evolving Agent 怎么做到的? 
+> **视频ID**: vDw2IKBXmB4 +> **时长**: 10 分钟 +> **分析时间**: 2026-03-30 12:14 + +--- + +## 🎯 核心概念 + +### Self-Evolving Agent 定义 +AI 智能体通过自我学习、自我优化、自我迭代,实现能力的持续提升。 + +--- + +## 🧬 自进化机制 + +### 1. 自我学习 +- **反馈循环** - 从结果中学习 +- **经验积累** - 存储成功案例 +- **知识蒸馏** - 提取关键模式 + +### 2. 自我优化 +- **性能监控** - 追踪执行效率 +- **参数调整** - 动态优化配置 +- **架构改进** - 重构代码结构 + +### 3. 自我迭代 +- **版本控制** - 管理不同版本 +- **A/B 测试** - 对比不同策略 +- **持续部署** - 自动更新部署 + +--- + +## 🛠️ 技术实现 + +### 架构设计 +```python +class SelfEvolvingAgent: + def __init__(self): + self.memory = ExperienceMemory() + self.optimizer = SelfOptimizer() + self.evaluator = PerformanceEvaluator() + + async def evolve(self): + # 1. 执行任务 + result = await self.execute_task() + + # 2. 评估性能 + performance = self.evaluator.evaluate(result) + + # 3. 存储经验 + self.memory.store(result, performance) + + # 4. 自我优化 + self.optimizer.optimize(self.memory) + + # 5. 迭代改进 + await self.iterate() +``` + +### 关键组件 + +#### 1. 经验记忆 +```python +class ExperienceMemory: + def __init__(self): + self.success_cases = [] + self.failure_cases = [] + + def store(self, result, performance): + if performance > 0.8: + self.success_cases.append(result) + else: + self.failure_cases.append(result) +``` + +#### 2. 
自我优化器 +```python +class SelfOptimizer: + def optimize(self, memory): + # 分析成功模式 + patterns = self.analyze_patterns(memory.success_cases) + + # 调整参数 + self.adjust_parameters(patterns) + + # 改进架构 + self.improve_architecture(patterns) +``` + +--- + +## 📊 进化路径 + +``` +Level 1: 基础 Agent + ↓ 自我学习 +Level 2: 经验 Agent + ↓ 自我优化 +Level 3: 智能 Agent + ↓ 自我迭代 +Level 4: 自进化 Agent +``` + +--- + +## 💡 应用场景 + +### ✅ 适合场景 +- **持续优化** - 需要不断改进的系统 +- **复杂决策** - 需要学习经验的任务 +- **动态环境** - 环境不断变化的场景 + +### ❌ 不适合场景 +- **固定规则** - 规则明确的任务 +- **短期项目** - 不需要长期优化 +- **安全关键** - 不允许自主修改 + +--- + +## 🎯 实战案例 + +### 案例 1: 代码生成优化 +```python +# 初始版本 +def generate_code_v1(task): + return basic_llm_call(task) + +# 自进化后版本 +def generate_code_evolved(task): + # 使用历史最佳实践 + best_practices = memory.get_best_practices() + + # 应用优化策略 + optimized_prompt = optimizer.optimize_prompt(task, best_practices) + + return advanced_llm_call(optimized_prompt) +``` + +--- + +## 🔬 研究前沿 + +### 当前挑战 +1. **稳定性** - 防止负面进化 +2. **可控性** - 确保进化方向 +3. **效率** - 减少进化成本 + +### 未来方向 +1. **多 Agent 协同进化** +2. **元学习加速进化** +3. 
**安全约束机制** + +--- + +## 📚 参考资源 + +- **视频链接**: https://youtu.be/vDw2IKBXmB4 +- **相关论文**: Self-Evolving Agent Architectures +- **开源项目**: AutoGPT, BabyAGI + +--- + +**整理仓库**: `autonomous-agent-stack`(公开) +**标签**: #SelfEvolving #AI进化 #智能体 diff --git a/ai_lab.env.example b/ai_lab.env.example index ad50660b..67f33970 100644 --- a/ai_lab.env.example +++ b/ai_lab.env.example @@ -11,3 +11,38 @@ COMPOSE_DIR=/Volumes/AI_LAB/Github/autonomous-agent-stack/sandbox/ai-lab COMPOSE_FILE=/Volumes/AI_LAB/Github/autonomous-agent-stack/sandbox/ai-lab/docker-compose.yml AUTO_OPEN_DOCKER=1 +AUTO_START_COLIMA=1 +# Optional but recommended on shared machines: +# COLIMA_PROFILE=ai-lab + +# Minimal OpenHands / LLM wiring +LLM_API_KEY=replace-with-your-llm-key +LLM_MODEL=openai/glm-5 +LLM_BASE_URL=https://open.bigmodel.cn/api/coding/paas/v4 + +# Optional: point host docker CLI at a user-owned Colima socket +# Local user profile example: +# DOCKER_HOST=unix:///Users/your-user/.colima/default/docker.sock +# External store example: +# DOCKER_HOST=unix:///Volumes/ColimaStore/.colima-home/default/docker.sock +# COLIMA_HOME_PATH=/Volumes/ColimaStore/.colima-home +# COLIMA_PROFILE=default + +# Optional: host-side OpenHands CLI fallback +# OPENHANDS_LOCAL_BIN=/absolute/path/to/openhands +# OPENHANDS_CONTAINER_BIN=openhands +# OPENHANDS_EXPERIMENTAL=1 +# OPENHANDS_SANDBOX_PROVIDER=process +# OPENHANDS_HOME_DIR=/absolute/path/to/shared/openhands-home + +# Optional Telegram / Panel control surface +AUTORESEARCH_TELEGRAM_BOT_TOKEN=your-telegram-bot-token +AUTORESEARCH_TELEGRAM_ALLOWED_UIDS=123456789 +AUTORESEARCH_PANEL_JWT_SECRET=replace-with-random-secret +AUTORESEARCH_PANEL_BASE_URL=https://panel.example.com/api/v1/panel/view +AUTORESEARCH_TELEGRAM_MINI_APP_URL=https://panel.example.com/api/v1/panel/view + +# Optional upstream watcher +AUTORESEARCH_UPSTREAM_WATCH_URL=https://github.com/openclaw/openclaw.git +AUTORESEARCH_UPSTREAM_WATCH_WORKSPACE_ROOT=/Volumes/AI_LAB/ai_lab/workspace 
+AUTORESEARCH_UPSTREAM_WATCH_MAX_COMMITS=5 diff --git a/configs/agents/mock.yaml b/configs/agents/mock.yaml index 9e447bef..a9a481ea 100644 --- a/configs/agents/mock.yaml +++ b/configs/agents/mock.yaml @@ -14,7 +14,7 @@ "network": "disabled", "network_allowlist": [], "tool_allowlist": ["read", "write", "bash"], - "allowed_paths": ["src/**", "tests/**", "docs/**"], + "allowed_paths": ["src/**", "tests/**", "scripts/**", "docs/**", "apps/**"], "forbidden_paths": [ ".git/**", "logs/**", @@ -22,7 +22,7 @@ "memory/**" ], "max_changed_files": 5, - "max_patch_lines": 200, + "max_patch_lines": 2000, "allow_binary_changes": false, "cleanup_on_success": true, "retain_workspace_on_failure": true diff --git a/configs/agents/openhands.yaml b/configs/agents/openhands.yaml index be07785a..7aa5684a 100644 --- a/configs/agents/openhands.yaml +++ b/configs/agents/openhands.yaml @@ -16,7 +16,7 @@ "network": "disabled", "network_allowlist": [], "tool_allowlist": ["read", "write", "bash"], - "allowed_paths": ["src/**", "tests/**"], + "allowed_paths": ["src/**", "tests/**", "scripts/**", "apps/**"], "forbidden_paths": [ ".git/**", "logs/**", @@ -24,7 +24,7 @@ "memory/**" ], "max_changed_files": 20, - "max_patch_lines": 500, + "max_patch_lines": 2000, "allow_binary_changes": false, "cleanup_on_success": true, "retain_workspace_on_failure": true diff --git a/configs/runtime/day.yaml b/configs/runtime/day.yaml new file mode 100644 index 00000000..017be6cb --- /dev/null +++ b/configs/runtime/day.yaml @@ -0,0 +1,10 @@ +preferred_lane: local +max_workers: 1 +max_concurrency: 1 +allow_exploration: false +allow_patch: true +allow_draft_pr: false +require_high_risk_approval: true +step_budget: 8 +token_budget: 20000 +timeout_sec: 900 diff --git a/configs/runtime/night.yaml b/configs/runtime/night.yaml new file mode 100644 index 00000000..60f8d979 --- /dev/null +++ b/configs/runtime/night.yaml @@ -0,0 +1,10 @@ +preferred_lane: remote +max_workers: 1 +max_concurrency: 2 +allow_exploration: true 
+allow_patch: true +allow_draft_pr: true +require_high_risk_approval: true +step_budget: 16 +token_budget: 80000 +timeout_sec: 3600 diff --git a/data/sessions.db-shm b/data/sessions.db-shm new file mode 100644 index 00000000..fe9ac284 Binary files /dev/null and b/data/sessions.db-shm differ diff --git a/data/sessions.db-wal b/data/sessions.db-wal new file mode 100644 index 00000000..e69de29b diff --git a/deployment/systemd/aas-api.service b/deployment/systemd/aas-api.service new file mode 100644 index 00000000..4578dba5 --- /dev/null +++ b/deployment/systemd/aas-api.service @@ -0,0 +1,15 @@ +[Unit] +Description=Autonomous Agent Stack API +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/opt/autonomous-agent-stack +EnvironmentFile=-/opt/autonomous-agent-stack/.env.linux +ExecStart=/opt/autonomous-agent-stack/.venv/bin/python -m uvicorn autoresearch.api.main:app --host 0.0.0.0 --port 8000 +Restart=on-failure +RestartSec=5 + +[Install] +WantedBy=multi-user.target diff --git a/deployment/systemd/aas-housekeeper-mode-day.service b/deployment/systemd/aas-housekeeper-mode-day.service new file mode 100644 index 00000000..75fd836e --- /dev/null +++ b/deployment/systemd/aas-housekeeper-mode-day.service @@ -0,0 +1,9 @@ +[Unit] +Description=Switch housekeeper to day_safe +After=aas-api.service + +[Service] +Type=oneshot +WorkingDirectory=/opt/autonomous-agent-stack +EnvironmentFile=-/opt/autonomous-agent-stack/.env.linux +ExecStart=/opt/autonomous-agent-stack/scripts/housekeeper_api.sh mode-day diff --git a/deployment/systemd/aas-housekeeper-mode-day.timer b/deployment/systemd/aas-housekeeper-mode-day.timer new file mode 100644 index 00000000..281dcf46 --- /dev/null +++ b/deployment/systemd/aas-housekeeper-mode-day.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run housekeeper day mode switch at 09:00 + +[Timer] +OnCalendar=*-*-* 09:00:00 +Persistent=true + +[Install] +WantedBy=timers.target diff --git 
a/deployment/systemd/aas-housekeeper-mode-night.service b/deployment/systemd/aas-housekeeper-mode-night.service new file mode 100644 index 00000000..43e4c255 --- /dev/null +++ b/deployment/systemd/aas-housekeeper-mode-night.service @@ -0,0 +1,9 @@ +[Unit] +Description=Switch housekeeper to night_readonly_explore +After=aas-api.service + +[Service] +Type=oneshot +WorkingDirectory=/opt/autonomous-agent-stack +EnvironmentFile=-/opt/autonomous-agent-stack/.env.linux +ExecStart=/opt/autonomous-agent-stack/scripts/housekeeper_api.sh mode-night diff --git a/deployment/systemd/aas-housekeeper-mode-night.timer b/deployment/systemd/aas-housekeeper-mode-night.timer new file mode 100644 index 00000000..174380bc --- /dev/null +++ b/deployment/systemd/aas-housekeeper-mode-night.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run housekeeper night mode switch at 23:00 + +[Timer] +OnCalendar=*-*-* 23:00:00 +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/deployment/systemd/aas-housekeeper-morning-summary.service b/deployment/systemd/aas-housekeeper-morning-summary.service new file mode 100644 index 00000000..b74ea0ec --- /dev/null +++ b/deployment/systemd/aas-housekeeper-morning-summary.service @@ -0,0 +1,9 @@ +[Unit] +Description=Send housekeeper morning summary +After=aas-api.service + +[Service] +Type=oneshot +WorkingDirectory=/opt/autonomous-agent-stack +EnvironmentFile=-/opt/autonomous-agent-stack/.env.linux +ExecStart=/opt/autonomous-agent-stack/scripts/housekeeper_api.sh morning-summary diff --git a/deployment/systemd/aas-housekeeper-morning-summary.timer b/deployment/systemd/aas-housekeeper-morning-summary.timer new file mode 100644 index 00000000..031a5c9a --- /dev/null +++ b/deployment/systemd/aas-housekeeper-morning-summary.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run housekeeper morning summary at 08:30 + +[Timer] +OnCalendar=*-*-* 08:30:00 +Persistent=true + +[Install] +WantedBy=timers.target diff --git 
a/deployment/systemd/aas-housekeeper-night-explore.service b/deployment/systemd/aas-housekeeper-night-explore.service new file mode 100644 index 00000000..24a673fa --- /dev/null +++ b/deployment/systemd/aas-housekeeper-night-explore.service @@ -0,0 +1,9 @@ +[Unit] +Description=Run one housekeeper night explore tick +After=aas-api.service + +[Service] +Type=oneshot +WorkingDirectory=/opt/autonomous-agent-stack +EnvironmentFile=-/opt/autonomous-agent-stack/.env.linux +ExecStart=/opt/autonomous-agent-stack/scripts/housekeeper_api.sh night-tick diff --git a/deployment/systemd/aas-housekeeper-night-explore.timer b/deployment/systemd/aas-housekeeper-night-explore.timer new file mode 100644 index 00000000..64b018f6 --- /dev/null +++ b/deployment/systemd/aas-housekeeper-night-explore.timer @@ -0,0 +1,9 @@ +[Unit] +Description=Run housekeeper night explore every 2 hours overnight + +[Timer] +OnCalendar=*-*-* 23,1,3,5,7:15:00 +Persistent=true + +[Install] +WantedBy=timers.target diff --git a/docs/architecture.md b/docs/architecture.md index 5110adb9..ee9c4ed2 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -1,301 +1,312 @@ -# 🏗️ 架构总览 +# Architecture -> **完整堆栈视图**:6 部分核心架构深度解析 +Compatibility mirror for legacy documentation links. ---- +Canonical source: [`../ARCHITECTURE.md`](../ARCHITECTURE.md) -## 📊 架构堆栈 +This file intentionally mirrors the current architecture in a docs-relative location because older docs, reports, and completeness checks still expect `docs/architecture.md` to exist. Keep this file aligned with the root `ARCHITECTURE.md`. 
-``` -┌─────────────────────────────────────────────────────────────┐ -│ 自主智能体融合架构 - 完整堆栈视图 │ -└─────────────────────────────────────────────────────────────┘ - │ - ├─ Part 1: MetaClaw 自演化(最顶层) - │ ├─ 快循环:技能驱动即时适应(毫秒级) - │ ├─ 慢循环:机会主义策略优化(小时级) - │ └─ MAML 版本化数据隔离(D_sup vs D_qry) - │ - ├─ Part 2: Autoresearch API-first(科学准则层) - │ ├─ Karpathy 循环三大原语(可编辑资产/标量度量/时间盒) - │ ├─ 5 大 RESTful API(Generator/Executor/Evaluator/Synthesis/Loop) - │ └─ 三种优化拓扑(超参数/架构/提示词) - │ - ├─ Part 3: Deer-flow 编排与隔离(物理熔炉层) - │ ├─ Lead Agent + Sub-agents 并发编排(最大 3 并行) - │ ├─ 三级沙盒隔离(L1/L2/L3) - │ └─ 动态上下文工程(9+ 中间件) - │ - ├─ Part 4: InfoQuest/MCP 深度耦合(知识引擎层) - │ ├─ Web Search + Link Reader 双核引擎 - │ ├─ MCP 远端服务映射(动态发现) - │ └─ 双 API 架构(LangGraph 2024 + Gateway 8001) - │ - ├─ Part 5: Claude Code 终端集成(传输层) - │ ├─ 四维执行模式(Flash/Standard/Pro/Ultra) - │ ├─ MCP 传输层矩阵(stdio/SSE/HTTP Streamable) - │ └─ SSE 流式挂起异常分析(59.2-138.6s 停滞) - │ - └─ Part 6: OpenClaw 持久化架构(记忆神经中枢层) - ├─ SOUL.md + MEMORY.md + Daily Logs - ├─ 记忆刷新机制(Token 上限拦截) - └─ AppleDouble 污染防治(P0 级) +## Current System Summary + +`autonomous-agent-stack` is currently a bounded control plane for autonomous repository changes, not an unconstrained self-editing agent. + +The stable path is: + +1. plan a bounded repository improvement, +2. choose a runtime mode and lane, +3. execute it inside isolation, +4. validate the patch, +5. re-check promotion conditions, +6. emit either a patch artifact or a Draft PR. + +That means the system is optimized for controlled mutation, not unrestricted autonomy. + +## Canonical Mainline + +```mermaid +flowchart TD + A["Repo Scan
AutoResearch Planner"] --> B["Worker Contract
strict OpenHands or AEP job"] + B --> C["Runtime Mode Selector
day / night"] + C --> D["Dispatch Contract
dispatch_run + adapter"] + D --> E["Isolated Workspace
baseline + workspace + artifacts"] + E --> F["Validation Gate
tests + path policy + artifact filtering"] + F --> G["Git Promotion Gate
writer lease + approval + draft PR checks"] + G --> H["Patch Artifact"] + G --> I["Draft PR"] ``` ---- +The architectural principle is simple: -## Part 1: MetaClaw 自演化机制 +- planning may select work, +- workers may edit in isolation, +- promotion may upgrade the result, +- but no single layer owns all three powers. -### 核心价值 -**持续自演化 + 双循环学习** +## Control-Plane Hardening Additions -### 技术架构 +The current docs set also assumes: -#### 双循环适应机制 +- a typed `dispatch_run` record wraps execution, +- a fake remote adapter can simulate future Linux orchestration while Linux is offline, +- failure classes map to typed recovery actions, +- and runtime mode config chooses the preferred lane before dispatch. -| 循环 | 机制 | 时间尺度 | 价值 | -|------|------|---------|------| -| **快速循环** | 技能驱动即时适应 | 毫秒级 | 重试率 -24.8% | -| **慢速循环** | 机会主义策略优化 | 小时级 | 鲁棒性 +18.3% | +## Zero-Trust Rules -#### 关键技术 +### Brain and Hand Separation -**1. 技能进化器(Skill Evolver)** -- 失败轨迹因果分析 -- 自然语言空间合成新技能($\Delta S$) -- 即时注入到系统提示词 +OpenHands and other workers are execution hands, not the control plane. -**2. OMLS 调度器(Opportunistic Meta-Learning Scheduler)** -- 监控用户不活跃窗口(睡眠/会议时段) -- 云端触发 LoRA 微调 -- 异步合并优化权重 +The control plane lives in the repository code: -**3. MAML 版本化数据隔离** -``` -支持集(Support Data, D_sup_g) → 技能进化器 -查询集(Query Data, D_qry) → RL 策略优化 -``` +- planner services, +- execution contracts, +- validation logic, +- promotion gates, +- approval flows, +- writer leases. -### 关键指标 -- **准确率提升**:21.4% → 40.6%(+89.7%) -- **阶段重试率下降**:10.5% → 7.9%(-24.8%) -- **综合鲁棒性提升**:+18.3% +### Patch-Only Default ---- +Autonomous edits should default to patch-only mode. -## Part 2: Autoresearch API-first 设计 +The OpenHands worker prompt built by `src/autoresearch/core/services/openhands_worker.py` explicitly forbids direct git mutation commands such as commit, push, merge, rebase, reset, and checkout. The worker is expected to produce the smallest possible patch inside `allowed_paths`. 
-### 核心价值 -**标准化研究循环 + 可扩展架构** +### Deny-Wins Policy Merge -### Karpathy 循环三大原语 +The AEP layer merges policy with deny-wins behavior: -| 原语 | 约束 | 工程价值 | -|------|------|---------| -| **可编辑资产** | 单一/极少数文件 | 可解释性 + git diff 审查 | -| **标量度量** | 自动化计算 + 无歧义 | 消除人类判断依赖 | -| **时间盒约束** | 固定物理时间 | 实验成本一致性 | +- forbidden paths widen, +- allowed paths narrow, +- stricter network mode wins, +- smaller mutation limits win. -### 5 大 RESTful API 接口 +This prevents a single request from widening safety boundaries beyond the manifest defaults. -| API | 职责 | 技术边界 | 当前状态 | -|-----|------|---------|---------| -| **Generator API** | 合成候选变异方案 | LLM 重度依赖 | ⏳ 待实现 | -| **Executor API** | 沙盒执行代码/训练 | 计算密集型 | ⏳ 待实现 | -| **Evaluator API** | 提取标量度量 | 确定性逻辑 | ✅ **已落地** | -| **Synthesis API** | 结构化记录实验 | I/O 密集型 | ⏳ 待实现 | -| **Loop Control API** | 顶层状态机 | 异步调度 | ⏳ 待实现 | +### Single Writer Rule -### 三种优化拓扑 +`WriterLeaseService` is the repository's single-writer lock for dangerous mutable transitions. -| 拓扑 | 目标 | 场景 | -|------|------|------| -| **超参数搜索** | 系数/权重调整 | Precision@12 + MRR 优化 | -| **架构变异** | 网络层/优化器 | AutoML + 创造性探索 | -| **提示词优化** | 系统指令改进 | AI 优化 AI | +It is used in the current codebase for: -### 关键指标 -- **最小闭环已打通**:Evaluator API ✅ -- **从 Demo → 生产级**:SQLite 持久化 + evaluator_command override +- git promotion finalization, +- managed skill promotion, +- approval-linked mutation paths, +- and any place where two concurrent writers would create ambiguous state. ---- +If a lease is unavailable, the system should block rather than guess. -## Part 3: Deer-flow 并发隔离 +### Runtime Artifacts Never Promote -### 核心价值 -**多智能体并发 + 三级沙盒隔离** +The promotion path rejects runtime/control artifacts from source promotion. The active deny prefixes include: -### 架构核心 +- `logs/` +- `.masfactory_runtime/` +- `memory/` +- `.git/` -#### Lead Agent + Sub-agents 编排 +This rule exists in both the AEP patch filtering logic and the git promotion gate. 
-**并发熔断策略**: -- 最大并行子智能体数量:3 个 -- 绝对执行超时阈值:15 分钟 -- 上下文绝对隔离(视界限制) +### Clean Base Requirement -#### 三级沙盒隔离矩阵 +Two current operations enforce a clean checkout: -| 级别 | 架构定位 | 实现机制 | 适用场景 | -|------|---------|---------|---------| -| **L1** | 本地原生执行 | 宿主机 OS | 极简验证(风险最高) | -| **L2** | Docker AIO 容器 | 虚拟化路径 | 生产级标准(推荐) | -| **L3** | Kubernetes 集群 | 独立 Pod | 企业级高并发 | +- OpenHands CLI execution in `OpenHandsControlledBackendService` +- Draft PR upgrade in `GitPromotionGateService` -#### 动态上下文工程(9+ 中间件) +This prevents unrelated local changes from being mixed into agent output. -| 中间件 | 职责 | 技术实现 | -|--------|------|---------| -| **MemoryMiddleware** | 数据防抖 + 持久化 | 异步队列 + LLM 写入 | -| **SummarizationMiddleware** | 上下文压缩 | Token 监控 + 卸载到文件 | -| **DanglingToolCallMiddleware** | 异常悬挂处理 | 占位符注入 | +## Physical and Sandbox Topology -### 关键指标 -- **会话间零污染**:✅ -- **并发熔断**:3 并行 + 15 分钟超时 +The physical environment is part of the architecture, not just an ops footnote. ---- +### Host Layout -## Part 4: InfoQuest/MCP 深度耦合 +- host: MacBook Air M1 +- runtime: Colima / Docker +- repository path: `/Volumes/AI_LAB/Github/autonomous-agent-stack` +- ai-lab writable roots: + - `/Volumes/AI_LAB/ai_lab/workspace` + - `/Volumes/AI_LAB/ai_lab/logs` + - `/Volumes/AI_LAB/ai_lab/.cache` -### 核心价值 -**企业级知识获取 + 双核引擎** +`ai_lab.env` binds the current environment to those external disk paths and points Docker to the Colima socket. Capacity, cleanup, and mount behavior all assume that external-disk layout. -### 双核引擎 +### Mount Behavior -| 引擎 | 核心能力 | 工程价值 | -|------|---------|---------| -| **Web Search** | 域名限定 + 时间窗口 | 屏蔽互联网噪音 + 提升时效性 | -| **Link Reader** | 反爬突破 + 结构化重组 | LLM 友好格式 + Token 优化 | +`scripts/launch_ai_lab.sh` builds a layered mount strategy: -### MCP 远端服务映射 +1. host source checkout remains the baseline, +2. the selected host root is mounted into the container at `/workspace` as read-only, +3. 
when OpenHands controlled execution needs a writable surface, an extra writable mount is attached at `/opt/workspace`, +4. controlled execution still snapshots into its own per-run baseline/workspace/artifact directories before validation or promotion. -**配置端点**:`https://mcp.infoquest.bytepluses.com/mcp` +In practical handoff language: -**动态发现**:运行时注册企业级工具 +`Mac host source -> Colima -> ai-lab writable roots on /Volumes/AI_LAB -> isolated workspace -> isolated promotion worktree` -### 双 API 架构 +### Execution Isolation -| API | 端口 | 职责 | 技术边界 | -|-----|------|------|---------| -| **LangGraph API** | 2024 | 智能体线程流转 | 状态追踪 + 流式返回 | -| **Gateway API** | 8001 | 网关控制 | 模型路由 + Artifacts 管理 + MCP 凭证注入 | +`OpenHandsControlledBackendService` creates: -### 关键指标 -- **Token 优化**:✅ -- **高并发稳定性**:✅ +- `baseline/` +- `workspace/` +- `artifacts/` ---- +under a per-run root. The main repo checkout is copied, not edited in place. -## Part 5: Claude Code 终端集成 +### Promotion Isolation -### 核心价值 -**开发者心流 + 四维执行模式** +`GitPromotionGateService` and `GitPromotionService` create git worktrees under salted `/tmp` paths derived from the repo root hash. That salt is there to stop same-named repos from colliding. -### 四维执行模式 +Examples of the current patterns: -| 模式 | 能力 | 适用场景 | 延迟 | -|------|------|---------|------| -| **Flash** | 关闭深层思考 + 子智能体 | 快速问答 | < 1s | -| **Standard** | 标准 LLM 推理 | 代码重构 | 1-10s | -| **Pro** | 强制规划模式(is_plan_mode) | 项目搭建 | 10-60s | -| **Ultra** | Lead Agent + 子智能体矩阵 | 深度研究 + Karpathy 循环 | 分钟级 | +- `/tmp/<repo-name>-<repo-hash>/promotion-worktrees/` +- `/tmp/repo-<repo-hash>/promotions/<promotion-id>/worktree` -### MCP 传输层矩阵 +This separation matters because promotion is intentionally a second isolation hop, not a continuation of execution isolation.
-| 传输层 | 延迟 | 适用场景 | 稳定性 | -|--------|------|---------|--------| -| **stdio** | < 10ms | 本地开发 | ⭐⭐⭐⭐⭐ | -| **SSE** | 100-500ms | 远程服务器 | ⭐⭐(风险高) | -| **HTTP Streamable** | 50-200ms | 新规范 | ⭐⭐⭐⭐ | +## Trust State Machines -### SSE 流式挂起异常 +### Managed Skill Ladder -**实际风险**: -- 🔴 停滞时间:59.2s - 138.6s -- 🔴 失败模式:ECONNRESET + 5 分钟超时 +Managed skills advance through: -**工程解决方案**: -- ✅ 自动重连(Auto-resume) -- ✅ 心跳包保活(Keep-alive) -- ✅ HTTP Streamable 协议迁移 +`pending -> quarantined -> cold_validated -> promoted` ---- +Interpretation: -## Part 6: OpenClaw 持久化架构 +- `pending`: request has been accepted but not yet trusted +- `quarantined`: copied into holding +- `cold_validated`: static and contract checks passed +- `promoted`: copied into the active skill root -### 核心价值 -**透明状态管理 + AppleDouble 污染防治** +Promotion to active runtime is guarded by a writer lease. The system is intentionally biased toward rejection or stalling over unsafe activation. -### 三层状态管理 +### Patch Promotion Ladder -| 层级 | 文件 | 职责 | 生命周期 | -|------|------|------|---------| -| **核心骨架** | SOUL.md | 身份人格 + 操作边界 | 静态元数据 | -| **动态大脑** | MEMORY.md | 用户偏好 + 项目原则 | 持续生长 | -| **短期草稿** | Daily Logs | 运行时发现 + 遥测 | 仅追加模式 | +Patch promotion begins with a patch artifact and then computes a preflight report. -### 记忆刷新机制 +Patch-level checks include: -**流程**: -1. Token 上限拦截 -2. LLM 提取提示词 -3. 写入 MEMORY.md -4. 清空短期记忆 +- patch exists, +- forbidden paths are untouched, +- runtime artifacts are excluded, +- changed file count limit, +- patch line limit, +- no binary changes unless explicitly allowed, +- no direct write to the base branch, +- writer lease is available. -### AppleDouble 污染防治(P0 级) +Draft PR adds stricter checks: -**物理成因**: -- macOS 双叉架构(数据分叉 + 资源分叉) -- 跨文件系统(macOS → FAT32/exFAT/SMB) -- 自动拆分(script.py → ._script.py) +- remote is healthy, +- base repo is clean, +- credentials are available, +- target base branch exists, +- approval is granted. 
-**致命破坏**: -- 🔴 compileall 崩溃(语法解析错误) -- 🔴 虚假失败判定(False Negative) -- 🔴 正确改进回滚 +If Draft PR cannot be safely upgraded but patch checks pass, the system degrades to patch mode. -**防御方案**: -```bash -# 1. Mac 原生工具合并 -dot_clean -m /path/to/workspace +## Controlled Execution Loop -# 2. Linux 沙盒递归粉碎 -find /workspace -type f -name '._*' -delete -find /workspace \( -name .DS_Store -o -name .apdisk \) -type f -delete -rm -rf /workspace/{.Trashes,.Spotlight-V100,.fseventsd} -``` +### Planner ---- +`AutoResearchPlannerService` is the current active-seeking layer. -## 🎯 架构价值总结 +It scans the repo for bounded, patch-friendly work. The current heuristics are intentionally simple and auditable: -### 层级协同 +- high-signal backlog markers such as `FIXME`, `BUG`, `HACK`, `XXX`, `TODO` +- source hotspots without a direct regression test -``` -MetaClaw(最顶层) - ↓ 提供自演化能力 -Autoresearch(科学准则层) - ↓ 提供标准化研究循环 -Deer-flow(物理熔炉层) - ↓ 提供并发隔离执行 -InfoQuest/MCP(知识引擎层) - ↓ 提供企业级知识获取 -Claude Code(传输层) - ↓ 提供开发者心流 -OpenClaw(记忆神经中枢层) - ↓ 提供透明状态管理 -``` +The planner emits three downstream-ready contracts: + +- `OpenHandsWorkerJobSpec` +- `ControlledExecutionRequest` +- AEP `JobSpec` + +This means downstream execution does not need to reinterpret a vague natural-language task. The contract is explicit from the start. + +### Worker Contract + +`OpenHandsWorkerService` turns the selected plan into a strict worker prompt: + +- modify only allowed paths, +- never touch forbidden paths, +- do not perform git branching or commit actions, +- keep the patch minimal, +- leave promotion to the gate. + +### Controlled Backend + +`OpenHandsControlledBackendService` is the narrowest end-to-end path: + +- snapshot repo, +- run backend, +- collect changed files, +- write patch artifact, +- detect scope violations, +- run validation command, +- hand result to promotion gate only if policy and validation pass. + +If the repo root is dirty and the backend is OpenHands CLI, execution is blocked before it starts. 
+ +### AEP Runner + +`AgentExecutionRunner` provides a contract-first execution path using: + +`JobSpec -> driver adapter -> DriverResult -> validation -> promotion patch -> decision` + +Both execution paths converge on the same architectural principle: promotion is downstream of validation and never worker-owned. + +## Persistent State and Artifacts + +The control plane stores typed metadata in SQLite repositories. This includes: + +- approvals, +- managed skill installs, +- AutoResearch plans, +- execution runs, +- evaluations, +- capability snapshots, +- other API-visible state. + +Per-run artifacts live on disk under runtime directories and include: + +- specs, +- policies, +- logs, +- validation artifacts, +- patch files, +- summary JSON, +- event streams. + +These runtime artifacts are intentionally excluded from promotion. + +## Canonical Files to Read During Handoff + +Start here when reloading context: + +- `ARCHITECTURE.md` +- `memory/SOP/MASFactory_Strict_Execution_v1.md` +- `src/autoresearch/core/services/autoresearch_planner.py` +- `src/autoresearch/core/services/openhands_worker.py` +- `src/autoresearch/core/services/openhands_controlled_backend.py` +- `src/autoresearch/executions/runner.py` +- `src/autoresearch/core/services/git_promotion_gate.py` +- `src/autoresearch/core/services/managed_skill_registry.py` +- `src/autoresearch/core/services/writer_lease.py` +- `scripts/launch_ai_lab.sh` -### 关键指标 +## Red Lines -| 指标 | 数值 | -|------|------| -| **架构层级** | 6 层 | -| **核心技术** | 25+ 个 | -| **P0 任务** | 3 个(全部完成 ✅) | -| **关键指标** | 准确率 +89.7%,鲁棒性 +18.3% | +The current architecture is specifically designed to prevent: ---- +- direct pushes to `main` by workers, +- direct activation of untrusted skill bundles, +- concurrent mutation of shared promotion state, +- runtime artifacts leaking into source patches, +- uncontrolled widening of worker scope, +- dirty local changes being mistaken for clean autonomous output. 
-**构建无需人类干预的超级智能体网络** 🚀 +Those are not future features to "unlock". They are intentional safety boundaries. diff --git a/docs/cc-switch-usage.md b/docs/cc-switch-usage.md new file mode 100644 index 00000000..1e581d48 --- /dev/null +++ b/docs/cc-switch-usage.md @@ -0,0 +1,166 @@ +# cc-switch Usage Guide + +这份说明只回答一个问题: + +`cc-switch` 在这个仓库里该放哪,不该放哪。 + +## 一句话结论 + +`cc-switch` 适合放在本地开发工作台,不适合接管仓库的执行主链路。 + +更直接一点: + +- 适合:给人用 +- 不适合:给 AEP runner 当底盘 + +## 为什么 + +这个仓库的主价值已经不是“切换哪个 CLI 更方便”,而是把 agent 执行收进统一协议。 + +当前主链路已经包含这些关键环节: + +1. `make agent-run` 进入 `scripts/agent_run.py` +2. 组装 `JobSpec`、`ValidatorSpec`、fallback 策略 +3. 交给 `AgentExecutionRunner` +4. 在受控 workspace 中执行 adapter +5. 跑 validator +6. 生成 promotion patch +7. 决定 promote / reject / human_review + +对 `openhands` 来说,这条链还进一步拆成: + +- `configs/agents/openhands.yaml` + 定义 process adapter、默认 policy、allowed paths、patch/file 限额 +- `drivers/openhands_adapter.sh` + 读取 AEP 环境变量,执行 OpenHands,并标准化为 `driver_result.json` +- `scripts/openhands_start.sh` + 负责 `host` / `ai-lab` runtime、workspace、settings、audit、启动参数 + +所以这里的核心诉求是: + +- 可审计 +- 可约束 +- 可验证 +- 可回放 +- 可 promotion + +这和“切换哪个 CLI/provider 更顺手”不是一层东西。 + +## 适合怎么用 + +推荐把 `cc-switch` 放在 Mac 控制面,作为开发者工作台。 + +典型用途: + +- 手工切换 `Codex`、`OpenClaw`、`Claude Code` 等 CLI +- 做 prompt 试验 +- 对比不同 provider 的回答或代码风格 +- 人工复现某个 agent 行为 +- 在正式进入 AEP 前先做低成本探索 + +这个定位下,`cc-switch` 是提效工具,不是执行协议的一部分。 + +## 不适合怎么用 + +不建议让 `cc-switch` 直接接管下面这些环节: + +- `make agent-run` +- `make openhands-controlled` +- `AgentExecutionRunner` +- `drivers/openhands_adapter.sh` +- `scripts/openhands_start.sh` +- validation gate +- promotion gate +- Linux `OPENHANDS_RUNTIME=host` 执行链 + +原因很简单: + +- 这些环节需要稳定的输入输出契约 +- 需要固定的审计和 artifact 产出 +- 需要可重复的 policy 约束 +- 需要在失败时进入 fallback / human review + +`cc-switch` 更像开发者侧的“入口切换器”,而不是执行侧的“受控协议层”。 + +## 推荐拓扑 + +最稳的做法是: + +- Mac:控制面 + 工作台 +- Linux:执行面 + +对应分工: + +- Mac + - Telegram + - 审批 + - 面板 + - 本地 CLI 切换(`cc-switch`) +- Linux + - OpenHands + - pytest + - patch 生成 + - 
promotion 前验证 + +这和当前仓库推荐的 Linux remote worker 方向是一致的。 + +## 推荐工作流 + +### 工作流 1:人工探索,再进入受控执行 + +1. 在 Mac 上用 `cc-switch` 选一个你要试的 CLI/provider +2. 手工验证 prompt、任务拆解、输出风格 +3. 确认任务契约后,回到仓库主入口: + +```bash +make agent-run AEP_AGENT=openhands AEP_TASK="Create apps/demo/lead_capture.py with tests." +``` + +### 工作流 2:Mac 调试,Linux 执行 + +1. Mac 上用 `cc-switch` 做人工调试 +2. Linux 保持真实执行面: + +```bash +OPENHANDS_RUNTIME=host make doctor-linux +OPENHANDS_RUNTIME=host make start +make agent-run AEP_AGENT=openhands AEP_TASK="Create apps/demo/lead_capture.py with tests." +``` + +### 工作流 3:只做旁路工具,不改主链 + +如果想给团队加一点便利,建议只加旁路入口,例如: + +- `scripts/dev/with_cc_switch.sh` +- `make cli-shell` +- `make cli-check` + +但这些入口应满足 4 条规则: + +1. 给人用,不给 runner 用 +2. 不参与 promotion +3. 不作为 CI 依赖 +4. 不替换现有 adapter / launcher + +## 风险提醒 + +如果把 `cc-switch` 误放到执行主链里,常见风险是: + +- 执行输入输出不再稳定 +- provider 切换把审计边界打散 +- fallback 语义变模糊 +- promotion 结果更难复现 +- 线上故障时不容易定位是 runner 问题还是工作台问题 + +## 推荐边界 + +可以接,而且值得接。 + +但推荐边界很明确: + +- `cc-switch` 负责开发者体验 +- AEP / OpenHands controlled backend 负责受控执行 + +一句话说完: + +`cc-switch` 可以做工作台,别做底盘。 diff --git a/docs/deployment-status.md b/docs/deployment-status.md new file mode 100644 index 00000000..878fe32f --- /dev/null +++ b/docs/deployment-status.md @@ -0,0 +1,37 @@ +# Deployment Status + +## As Of March 31, 2026 + +- Mac control plane: online +- Local AEP / OpenHands execution lane: online +- Linux remote execution lane: offline +- Real SSH / supervisor / systemd remote integration: not connected in this branch + +## What This Branch Does + +`feat/control-plane-hardening` hardens the control plane before Linux comes back: + +- fixed remote-run contract +- JSON schemas for task and summary payloads +- fake remote adapter for offline orchestration testing +- centralized failure taxonomy +- day/night runtime mode config layer +- lifecycle / failure / deployment docs + +## What This Branch Explicitly Does Not Do + +- no live SSH dispatch to Linux +- no always-on Linux supervisor +- no 
systemd or cron remote scheduler +- no claim that remote Draft PR flow is production-ready + +## Practical Reading + +Right now the repository behaves like this: + +- preferred lane can still be configured as `remote` +- if remote is unavailable, control-plane selection falls back to `local` +- the fallback is recorded in `dispatch_run` +- fake remote artifacts are written under the normal run root so future Linux worker wiring can reuse the same shape + +When the Linux lane is restored, the next step is to replace the fake adapter with a real remote adapter, not redesign the protocol from scratch. diff --git a/docs/failure-modes.md b/docs/failure-modes.md new file mode 100644 index 00000000..0dee6416 --- /dev/null +++ b/docs/failure-modes.md @@ -0,0 +1,76 @@ +# Failure Modes + +这份文档定义控制面统一使用的 failure taxonomy。无论以后任务跑在 Mac 还是 Linux,都先按这套分类落账。 + +## Canonical Failure Classes + +### `planner_stalled` + +- Meaning: 控制面调度本身卡住,轮询没有拿到终态。 +- Default action: `require_human_review` +- Typical signal: dispatch polling exhausted before terminal status. + +### `executor_stalled` + +- Meaning: 执行面启动了,但没有持续进展。 +- Default action: `retry` +- Typical signal: AEP `stalled_no_progress` or fake remote `stalled`. + +### `tool_timeout` + +- Meaning: 执行超时。 +- Default action: `retry` +- Typical signal: AEP `timed_out` or fake remote `timed_out`. + +### `model_fallback` + +- Meaning: 主要执行链退化到了 mock / fallback model,但结果仍可用。 +- Default action: `downgrade_to_draft` +- Typical signal: fallback agent succeeded and produced a valid patch. + +### `assertion_failed_after_fallback` + +- Meaning: fallback model 产出了结果,但验证仍然失败。 +- Default action: `require_human_review` +- Typical signal: fallback agent is `mock` and validator stays red. 
+ +### `env_missing` + +- Meaning: 运行环境缺依赖、命令、路径或运行时前置条件。 +- Default action: `abort` +- Typical signal: `EnvironmentCheckFailed: ...` + +### `workspace_dirty` + +- Meaning: 基线工作区不干净,不允许继续 promotion 或受控执行。 +- Default action: `abort` +- Typical signal: `repository worktree is not clean` or `clean git checkout`. + +### `transient_network` + +- Meaning: 远端连接瞬断或暂时性网络故障。 +- Default action: `retry` +- Typical signal: `ssh: connection reset by peer`, `connection refused`, `network is unreachable`. + +### `unknown` + +- Meaning: 当前证据不足以更精确分类。 +- Default action: `quarantine` +- Typical signal: terminal failure without a stronger classifier hit. + +## Action Semantics + +- `retry`: 控制面可安全重试,不自动放大权限。 +- `abort`: 当前环境不满足前置条件,先停。 +- `require_human_review`: 需要人判断要不要继续。 +- `downgrade_to_draft`: 允许保留结果,但不要当成高置信执行面成功。 +- `quarantine`: 先隔离结果和状态,再人工看。 + +## Current Implementation Sources + +- AEP / OpenHands runner outcomes +- Fake remote adapter terminal states +- Validation report failures +- Clean-worktree / environment preflight errors + +这套 taxonomy 是控制面协议的一部分,不是某个 adapter 私有约定。 diff --git a/docs/linux-remote-worker.md b/docs/linux-remote-worker.md new file mode 100644 index 00000000..7bbbc018 --- /dev/null +++ b/docs/linux-remote-worker.md @@ -0,0 +1,291 @@ +# Linux Remote Worker Guide + +这份指南的目标很单纯:把一台 Linux 机器尽快变成这套仓库的稳定执行节点。 + +> 状态说明(2026-03-31): +> 当前 live Linux lane 仍然是 offline。 +> `feat/control-plane-hardening` 先把协议、状态机、回退逻辑和测试固定下来;真实远端接入还没在这条分支里启用。 + +先看这 3 份控制面文档,再看本指南: + +- [Run Lifecycle](./run-lifecycle.md) +- [Failure Modes](./failure-modes.md) +- [Deployment Status](./deployment-status.md) + +当前最推荐的拓扑不是“Linux 完全复制 Mac/Colima”,而是: + +- Mac: 控制面 + - Telegram + - 审批 + - Review + - Memory +- Linux: 执行面 + - OpenHands + - pytest + - patch 生成 + - promotion 前验证 + +## 为什么 Linux 先走 `host` + +这个仓库在 Mac 上对 `ai-lab + Colima + 外置盘` 做了大量加固,但 Linux 节点的最佳起步路径更简单: + +- 避开 Colima / Mac socket 差异 +- 先验证真实 OpenHands 能写代码、跑测试、产 patch +- 等业务链稳定后,再考虑把 Linux 执行面容器化 + +所以第一阶段建议固定: + 
+```bash +export OPENHANDS_RUNTIME=host +``` + +等 Linux lane 恢复后,优先对齐的不是 shell 脚本细节,而是控制面契约: + +- `src/autoresearch/shared/remote_run_contract.py` +- `schemas/task_run.schema.json` +- `schemas/run_summary.schema.json` + +## 最小准备 + +要求: + +- Python `3.11+` +- `git` +- `curl` +- 建议安装 `gh` +- 建议安装 `tmux` + +初始化: + +```bash +git clone <repo-url> +cd autonomous-agent-stack + +python3.11 -m venv .venv +source .venv/bin/activate + +make setup +OPENHANDS_RUNTIME=host make doctor-linux +``` + +`make doctor-linux` 会额外检查几件对 Linux 节点最关键的事: + +- 当前是否真在 Linux 上 +- `OPENHANDS_RUNTIME` 是否设成 `host` +- `DOCKER_HOST` 是否错误继承了 Mac / Colima 路径 +- `.masfactory_runtime`、`artifacts`、`logs` 是否可写 +- `gh` 和 `tmux` 是否可用 + +## 推荐环境变量 + +最低限度: + +```bash +export OPENHANDS_RUNTIME=host +export AUTORESEARCH_API_HOST=0.0.0.0 +export AUTORESEARCH_API_PORT=8001 +``` + +如果 Linux 节点上已经准备好了独立 OpenHands CLI,可再补: + +```bash +export OPENHANDS_LOCAL_BIN=/absolute/path/to/openhands +``` + +如果由 Linux 节点直接访问模型: + +```bash +export LLM_MODEL=openai/glm-5 +export LLM_API_KEY=... +export LLM_BASE_URL=... +``` + +## 启动方式 + +API: + +```bash +OPENHANDS_RUNTIME=host make start +``` + +真实 OpenHands worker 冒烟: + +```bash +OPENHANDS_RUNTIME=host make openhands OH_TASK="Create apps/demo/lead_capture.py with tests." +``` + +AEP runner: + +```bash +OPENHANDS_RUNTIME=host make agent-run \ + AEP_AGENT=openhands \ + AEP_TASK="Create apps/demo/lead_capture.py with tests." 
+``` + +promotion: + +```bash +PYTHONPATH=src .venv/bin/python scripts/promote_run.py \ + --run-id <run_id> \ + --push \ + --open-draft-pr +``` + +## Linux 远端怎么用最值钱 + +最实用的用法不是“把 Linux 也当控制台”,而是明确分工。 + +### 模式 1: Mac 控制面 + Linux 执行面 + +最推荐。 + +- Mac 继续收 Telegram +- Mac 做审批和看板 +- Linux 专门负责跑 OpenHands / pytest / promotion 验证 + +### 模式 2: Linux 纯 worker 箱 + +适合长任务。 + +- SSH 到 Linux +- 用 `tmux` 起会话 +- 在 tmux 里跑真实 OpenHands 和全量测试 + +示例: + +```bash +tmux new -s aas-worker +cd /path/to/autonomous-agent-stack +source .venv/bin/activate +OPENHANDS_RUNTIME=host make start +``` + +### 模式 3: Linux 作为 promotion / CI 预演机 + +适合把“能不能出 PR”这件事放到更干净的机器上验证。 + +- Linux 跑 `pytest` +- Linux 跑 `scripts/promote_run.py` +- Mac 只看结果和审批 + +## 最佳实践 + +### 1. 先 `host`,后容器 + +Linux 第一阶段先固定: + +```bash +export OPENHANDS_RUNTIME=host +``` + +先证明这 4 件事都稳定,再考虑容器化: + +- OpenHands 能写入业务目录 +- pytest 能跑通 +- patch 能产出 +- promotion 能完成 + +### 2. 把 Linux 当执行机,不当第二控制台 + +最优分工是: + +- Mac 收 Telegram 和审批 +- Linux 跑重任务和长测试 + +这样能把“控制面”故障和“执行面”故障拆开。 + +### 3. 永远先跑 `make doctor-linux` + +每次新节点上线、重装 Python、迁移目录、换用户后,先跑: + +```bash +OPENHANDS_RUNTIME=host make doctor-linux +``` + +不要先跑长任务,再回头找 `DOCKER_HOST` 或目录权限问题。 + +### 4. 用 `tmux` 托管长任务 + +不要把真实 OpenHands 长跑直接挂在脆弱 SSH 会话上。 + +最小建议: + +```bash +tmux new -s aas-worker +``` + +### 5. 把 promotion 放到更干净的机器上 + +如果 Mac 本地工作区经常是脏的,Linux 节点更适合承担: + +- 全量 pytest +- promotion gate +- draft PR 创建 + +这样更容易避免因为本地临时改动导致降级成 patch-only。 + +### 6. 不要把 Mac 的运行态变量原样 rsync 过去 + +尤其是: + +- `DOCKER_HOST` +- `COLIMA_*` +- `/Users/...` +- `/Volumes/...` + +Linux 节点要有自己的本地路径和本地执行假设。 + +### 7. 业务任务优先落 `apps/` + +Linux 节点最适合跑边界清晰的业务任务,例如: + +- `apps/<app_name>/...` +- `tests/apps/...` + +不要一上来拿它做模糊的大范围框架重构。 + +## 常见坑 + +### 1. 继承了 Mac 的 `DOCKER_HOST` + +现象: + +- `make doctor-linux` 提示 `DOCKER_HOST` 指向 `/Users/.../.colima/...` + +处理: + +```bash +unset DOCKER_HOST +``` + +### 2. 
运行目录不可写 + +现象: + +- `.masfactory_runtime` +- `artifacts` +- `logs` + +处理: + +```bash +mkdir -p .masfactory_runtime artifacts logs +chmod -R u+rwX .masfactory_runtime artifacts logs +``` + +### 3. 想一上来复制 `ai-lab` + +不建议。 + +Linux 远端第一阶段先用 `host` runtime 打通: + +- Manager +- Worker +- Validator +- Promotion + +把这条链跑稳后,再考虑容器化隔离。 + +## 一句话建议 + +Linux 节点现在最有价值的角色是“干净、稳定、能长跑的执行机”,不是去复刻 Mac 的 Colima 环境。 diff --git a/docs/openhands-cli-integration.md b/docs/openhands-cli-integration.md index 7e6b0a69..d0966de4 100644 --- a/docs/openhands-cli-integration.md +++ b/docs/openhands-cli-integration.md @@ -19,6 +19,32 @@ For dry-run behavior while validating local wiring: OPENHANDS_DRY_RUN=1 make agent-run AEP_AGENT=openhands AEP_TASK="Create src/demo_math.py with add(a,b)." ``` +For a real non-interactive CLI path, the launcher now defaults to OpenHands headless mode and sources `ai_lab.env` before dispatch: + +```bash +OPENHANDS_SANDBOX_PROVIDER=process \ +make openhands OH_TASK="Scan /opt/workspace/src and add the smallest passing regression." +``` + +Equivalent command template: + +```bash +RUNTIME=process \ +SANDBOX_VOLUMES=/actual/workspace:/workspace:rw \ +openhands --exp --headless -t "your task" +``` + +Notes: + +- `scripts/openhands_start.sh` prefers a dedicated host-side CLI at `./.masfactory_runtime/tools/openhands-cli-py312/bin/openhands` when present, and bootstraps `agent_settings.json` from `LLM_MODEL` / `LLM_API_KEY` / `LLM_BASE_URL`. +- `sandbox/ai-lab/Dockerfile` now pins the container-side CLI to the same validated `OpenHands CLI 1.5.0`, so the host and `ai-lab` runtime do not silently diverge. +- The launcher now `cd`s into the target worktree before invoking the CLI, so OpenHands sees the real isolated workspace as its current working directory. +- The local `OpenHands CLI 1.5.0` smoke checks confirmed that `--exp --headless` auto-exits cleanly for pipeline use, while plain `--headless` completes the task but can remain attached to the prompt. 
+- The same local smoke checks confirmed `--headless` and `-t`, but not `--json`, so JSON mode is opt-in only for CLI builds that actually expose that flag. +- `ai-lab` runtime intentionally defaults to `openhands` inside the container, instead of reusing a host-only binary path. +- If your session cannot access the configured Docker/Colima socket, the launcher first tries a safe Colima fallback: repo-managed external store when configured, otherwise the current user's own `~/.colima/` socket. The current-user fallback also adds `/Volumes/AI_LAB` as a Colima mount when that external workspace root exists; on shared machines, using a dedicated profile such as `COLIMA_PROFILE=ai-lab` is the lowest-risk path. +- The process provider is operationally useful but weaker than the full container sandbox, so it should be treated as an explicit fallback rather than the end-state isolation model. + ## Runtime Layout Each run writes to: diff --git a/docs/pr-review-hardening.md b/docs/pr-review-hardening.md index 89675eb9..7c3af466 100644 --- a/docs/pr-review-hardening.md +++ b/docs/pr-review-hardening.md @@ -55,3 +55,75 @@ This repository now uses a two-layer review setup: - `CODEOWNERS` is in `.github/CODEOWNERS` and should be kept in sync with maintainers. - Repository-specific review policy is in `.agents/skills/custom-codereview-guide.md`. - If merge queue is enabled, keep both `CI` and `Quality Gates` workflows listening on `merge_group`. + +## Trial Rubric + +Before expanding reviewer scope or adding a second AI reviewer, run a small trial on 5-10 low-risk PRs and score the current OpenHands reviewer with the same rubric each time. + +Track these four metrics for every reviewed PR: + +1. False positive rate +- Definition: comments that are technically incorrect, already addressed by the diff, or not actionable in context. +- Target for trial: keep this low enough that humans do not start ignoring the bot by default. + +2. 
False negative rate +- Definition: important review findings later caught by humans, CI, or post-merge fixes that the reviewer missed. +- Target for trial: identify whether the bot is missing recurring classes of issues. + +3. Useful comment rate +- Definition: percentage of comments that directly lead to a code change, clarification, or an accepted follow-up task. +- Target for trial: favor signal over volume; a smaller number of useful comments is better than many low-value comments. + +4. Average repair rounds +- Definition: how many human or bot follow-up rounds are needed before the PR reaches an acceptable state. +- Target for trial: measure whether the reviewer is reducing or increasing iteration cost. + +## Trial Execution + +Use this lightweight process for each PR in the trial window: + +1. Mark the PR with `review-this` and let OpenHands produce comment-only feedback. +2. Record the raw bot comments in the PR timeline or an external tracker. +3. After human review completes, classify each bot comment as one of: +- `useful` +- `false_positive` +- `non_actionable` +- `duplicate` +4. If humans, CI, or post-merge fixes catch a material issue the bot missed, log one `false_negative` entry for that issue class. +5. Record how many total repair rounds were needed before merge or close. + +## Feedback Loop + +Keep the feedback loop simple and explicit: + +1. Add a short reviewer summary to each trial PR: +- `Bot comments: N` +- `Useful: N` +- `False positives: N` +- `False negatives discovered later: N` +- `Repair rounds: N` + +2. If a bot comment is low-value, mark it in the PR discussion with a short reason: +- `incorrect` +- `duplicate` +- `out_of_scope` +- `not_actionable` + +3. 
At the end of the 5-10 PR trial, summarize: +- recurring false-positive patterns +- recurring false-negative patterns +- whether useful comment rate is high enough to justify wider rollout +- whether average repair rounds went down, stayed flat, or increased + +## Rollout Gate + +Do not add a second reviewer or make the AI reviewer more prominent until the trial says it is helping. + +A wider rollout is justified only if all of the following hold: + +1. Humans still trust the comments enough to read them. +2. Useful comment rate is meaningfully higher than false-positive rate. +3. False negatives are not clustering around one obvious blind spot. +4. Average repair rounds are flat or improving. + +If the trial fails these checks, keep the current reviewer advisory-only and tune prompts, trigger rules, or review scope before expanding. diff --git a/docs/run-lifecycle.md b/docs/run-lifecycle.md new file mode 100644 index 00000000..58cedc66 --- /dev/null +++ b/docs/run-lifecycle.md @@ -0,0 +1,61 @@ +# Run Lifecycle + +这份文档描述的是离线 hardening 之后的控制面生命周期,不要求真实 Linux worker 在线。 + +## Canonical Lifecycle + +```mermaid +flowchart TD + A["Planner Selects Candidate"] --> B["request_dispatch()
dispatch_run.status = queued"] + B --> C["Remote Adapter Dispatch
queued -> running"] + C --> D["Terminal Result"] + D --> E["succeeded"] + D --> F["failed"] + D --> G["stalled"] + D --> H["timed_out"] + E --> I["fetch_summary()
persist dispatch_run + optional legacy run_summary"] + F --> I + G --> I + H --> I + I --> J["derive legacy dispatch_status"] + J --> K["DISPATCHED or FAILED"] +``` + +## Planner State + +- `AutoResearchPlanRead.dispatch_run` is the new control-plane record. +- `AutoResearchPlanRead.run_summary` stays as the legacy AEP/OpenHands summary when local execution exists. +- `AutoResearchPlanRead.dispatch_status` remains the compatibility field exposed to current panel/API consumers. + +## Remote Status Machine + +- `queued`: planner has accepted the run and reserved a lane. +- `running`: adapter has started orchestration and may emit heartbeat data. +- `succeeded`: execution reached a terminal success state. +- `failed`: execution terminated without a usable result. +- `stalled`: orchestration stopped making progress. +- `timed_out`: execution exceeded its allowed runtime budget. + +## Compatibility Mapping + +- `dispatch_run.status in {queued, running}` -> legacy `dispatch_status = dispatching` +- `dispatch_run.status = succeeded` and summary is promotable -> legacy `dispatch_status = dispatched` +- `dispatch_run.status in {failed, stalled, timed_out}` -> legacy `dispatch_status = failed` + +If a local/AEP run happened underneath the adapter, `dispatch_run` is the outer control-plane envelope and `run_summary` remains the inner execution result. + +## Artifact Layout + +The fake remote adapter writes control-plane artifacts under the normal run root: + +```text +.masfactory_runtime/runs/<run_id>/ + remote_control/ + task_spec.json + record.json + events.ndjson + heartbeat.json + summary.json +``` + +When the lane falls back to a local AEP run, the existing AEP files stay in the same run root and `remote_control/` becomes the wrapper contract around them. 
diff --git a/drivers/mock_adapter.sh b/drivers/mock_adapter.sh index fef08a75..a419e0a7 100755 --- a/drivers/mock_adapter.sh +++ b/drivers/mock_adapter.sh @@ -16,10 +16,13 @@ require_env "AEP_RESULT_PATH" PY_BIN="${PYTHON_BIN:-python3}" "${PY_BIN}" - "${AEP_JOB_SPEC}" "${AEP_WORKSPACE}" "${AEP_RESULT_PATH}" <<'PY' +import fnmatch import json import re +import shlex import sys from pathlib import Path +from typing import Optional job_path = Path(sys.argv[1]) workspace = Path(sys.argv[2]) @@ -29,16 +32,171 @@ payload = json.loads(job_path.read_text(encoding="utf-8")) task = str(payload.get("task") or "mock task").strip() run_id = str(payload.get("run_id") or "mock-run").strip() agent_id = str(payload.get("agent_id") or "mock").strip() +policy = payload.get("policy") or {} +allowed_paths = [ + str(item).strip() + for item in policy.get("allowed_paths") or [] + if isinstance(item, str) and str(item).strip() +] +validators = payload.get("validators") or [] +validator_command = "" +for spec in validators: + if isinstance(spec, dict) and str(spec.get("kind") or "") == "command": + validator_command = str(spec.get("command") or "").strip() + if validator_command: + break slug = re.sub(r"[^a-z0-9]+", "_", task.lower()).strip("_") or "task" -target = workspace / "src" / "mock_agent_output.py" -target.parent.mkdir(parents=True, exist_ok=True) -content = ( - '"""Generated by the mock AEP adapter."""\n\n' - "def run() -> dict[str, str]:\n" - f' return {{"run_id": "{run_id}", "task": "{slug}", "status": "mocked"}}\n' -) -target.write_text(content, encoding="utf-8") + + +def matches_any(path: str, patterns: list[str]) -> bool: + normalized = path.replace("\\", "/") + return any(fnmatch.fnmatch(normalized, pattern) for pattern in patterns) + + +def infer_validator_target() -> Optional[Path]: + if not validator_command: + return None + for token in reversed(shlex.split(validator_command)): + candidate = token.strip().strip("'\"").replace("\\", "/") + if not candidate or 
candidate.startswith("-"): + continue + if candidate in {"python", "python3", "pytest", "py_compile"}: + continue + if not any(marker in candidate for marker in ("/", ".")): + continue + if matches_any(candidate, allowed_paths): + return workspace / candidate + return None + + +def infer_primary_source_target() -> Optional[Path]: + exact_paths: list[str] = [] + for item in allowed_paths: + if any(char in item for char in "*?["): + continue + normalized_item = item.replace("\\", "/") + if "/tests/" in f"/{normalized_item}/": + continue + exact_paths.append(item) + for candidate in exact_paths: + path = Path(candidate) + if path.name.startswith("test_") and "tests" in path.parts: + continue + return workspace / candidate + + for pattern in allowed_paths: + prefix = pattern.split("*", 1)[0].split("?", 1)[0].split("[", 1)[0].rstrip("/") + if not prefix: + continue + prefix_path = Path(prefix) + if prefix_path.name.startswith("test_") and "tests" in prefix_path.parts: + continue + if "." in prefix_path.name: + return workspace / prefix + if "apps" in prefix_path.parts: + return workspace / prefix / "lead_capture.py" + return workspace / prefix / "mock_agent_output.py" + return None + + +def select_target() -> Path: + inferred = infer_validator_target() + if inferred is not None: + return inferred + + exact_paths = [item for item in allowed_paths if not any(char in item for char in "*?[")] + for candidate in exact_paths: + if candidate and candidate in validator_command: + return workspace / candidate + for candidate in exact_paths: + path = Path(candidate) + if path.name.startswith("test_") and "tests" in path.parts: + return workspace / candidate + if exact_paths: + return workspace / exact_paths[0] + for pattern in allowed_paths: + prefix = pattern.split("*", 1)[0].split("?", 1)[0].split("[", 1)[0].rstrip("/") + if not prefix: + continue + if "." 
in Path(prefix).name: + return workspace / prefix + return workspace / prefix / "mock_agent_output.py" + return workspace / "src" / "mock_agent_output.py" + + +def build_source_content(target_path: Path) -> str: + relative = target_path.relative_to(workspace) + if relative.name == "lead_capture.py" and "apps" in relative.parts: + return ( + '"""Generated by the mock AEP adapter."""\n\n' + "import re\n\n" + "PHONE_PATTERN = re.compile(r'^1[3-9]\\\\d{9}$')\n\n" + "class PhoneValidator:\n" + " @staticmethod\n" + " def validate(phone: str) -> bool:\n" + " normalized = str(phone or '').replace(' ', '').replace('-', '')\n" + " return bool(PHONE_PATTERN.fullmatch(normalized))\n\n" + "def capture_lead(phone: str) -> dict[str, object]:\n" + " if not PhoneValidator.validate(phone):\n" + " return {'success': False, 'error': 'invalid_phone_format'}\n" + " normalized = str(phone).replace(' ', '').replace('-', '')\n" + " return {'success': True, 'phone_masked': normalized[:3] + '****' + normalized[-4:]}\n" + ) + return ( + '"""Generated by the mock AEP adapter."""\n\n' + "def run() -> dict[str, str]:\n" + f' return {{"run_id": "{run_id}", "task": "{slug}", "status": "mocked"}}\n' + ) + + +def build_test_content(target_path: Path, source_target: Path) -> str: + relative_source = source_target.relative_to(workspace) + module_name = ".".join(relative_source.with_suffix("").parts) + if relative_source.name == "lead_capture.py": + return ( + '"""Generated by the mock AEP adapter."""\n\n' + f"from {module_name} import PhoneValidator, capture_lead\n\n" + "def test_phone_validator_accepts_normalized_mobile() -> None:\n" + " assert PhoneValidator.validate('13812345678') is True\n" + " assert PhoneValidator.validate('138 1234 5678') is True\n" + " assert PhoneValidator.validate('12812345678') is False\n\n" + "def test_capture_lead_masks_phone() -> None:\n" + " result = capture_lead('13812345678')\n" + " assert result['success'] is True\n" + " assert 
result['phone_masked'].startswith('138')\n" + " assert '12345678' not in result['phone_masked']\n" + ) + return ( + '"""Generated by the mock AEP adapter."""\n\n' + "def test_mock_autoresearch_candidate() -> None:\n" + " assert True\n" + ) + + +target = select_target() +source_target = infer_primary_source_target() or target +changed_paths: list[str] = [] + +if ( + target.name.startswith("test_") + and "tests" in target.relative_to(workspace).parts + and "apps" in source_target.relative_to(workspace).parts +): + source_target.parent.mkdir(parents=True, exist_ok=True) + source_target.write_text(build_source_content(source_target), encoding="utf-8") + changed_paths.append(source_target.relative_to(workspace).as_posix()) + + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(build_test_content(target, source_target), encoding="utf-8") + changed_paths.append(target.relative_to(workspace).as_posix()) +else: + target.parent.mkdir(parents=True, exist_ok=True) + if target.name.startswith("test_") and "tests" in target.relative_to(workspace).parts: + target.write_text(build_test_content(target, source_target), encoding="utf-8") + else: + target.write_text(build_source_content(target), encoding="utf-8") + changed_paths.append(target.relative_to(workspace).as_posix()) result = { "protocol_version": "aep/v0", @@ -47,7 +205,7 @@ result = { "attempt": 1, "status": "succeeded", "summary": "mock adapter produced a synthetic patch", - "changed_paths": ["src/mock_agent_output.py"], + "changed_paths": changed_paths, "output_artifacts": [], "metrics": { "duration_ms": 0, diff --git a/drivers/openhands_adapter.sh b/drivers/openhands_adapter.sh index 2fee219b..45bef7e7 100755 --- a/drivers/openhands_adapter.sh +++ b/drivers/openhands_adapter.sh @@ -79,7 +79,7 @@ PROMPT="${TASK} Execution contract: - Single task only. - Do not commit, push, or edit git settings. -- Only edit files under /opt/workspace. +- Only edit files inside the provided workspace root. 
- Return concise summary and changed files." mkdir -p "${AEP_ARTIFACT_DIR}" diff --git a/memory/2026-03-28.md b/memory/2026-03-28.md index c31f9f75..8effd13e 100644 --- a/memory/2026-03-28.md +++ b/memory/2026-03-28.md @@ -1,5 +1,10 @@ # 2026-03-28 +- 对 `/Users/iCloud_GZ/Downloads/deep-research-report.md` 做了按 GitHub 已 push 状态(以 `origin/main` 为准)的复核修订: + - 改正了“`requirements.txt` 缺少 `litellm/redis`”这一已过时判断 + - 改正了“当前主线 Telegram webhook 未校验 secret header”这一过度外推,明确该问题只适用于 legacy `src/gateway/telegram_webhook.py` + - 保留并强化了仍成立的问题:Python 版本基线不一致、CI 覆盖偏窄、缺统一 metrics/tracing、缺主服务官方 Docker/K8s/Helm 交付 + - 阶段二-B继续推进:补了三层记忆最小 contract、OpenClaw memory service,以及 Telegram `/memory` 读写长期记忆。 - 新增 provider contract 与 registry,统一承载 Apple Calendar、GitHub、OpenClaw skills、MCP 这几类能力源。 - 新增 `/api/v1/capabilities/*` 只读能力 API,已覆盖 provider 列表、skill catalog/detail、MCP tools、calendar read、github search。 @@ -30,3 +35,79 @@ - Telegram 决策动作直接写回统一 approval store,并写入 `resolved_via=telegram_command` 元数据,不再是只读查询。 - help 文案和 approval detail 都补了决策用法,减少用户在 bot 内来回试错。 - 相关 targeted regression 已多轮通过,当前最新结果为 `62 passed`,`624 tests collected`。 +- AutoResearch 规划员最小闭环已落地: + - 新增 repo scanner,会从 `src/` / `scripts/` / `tests/` 里识别 TODO/FIXME/HACK/XXX 热点与缺失直连测试的大文件。 + - 规划结果现在会直接产出 `OpenHandsWorkerJobSpec`、`ControlledExecutionRequest` 与 AEP `JobSpec`,默认保持 patch-only / approval-gated promotion 兼容。 + - 新增 `/api/v1/autoresearch/plans` create/list/get API,便于后续 Telegram / panel / admin 面直接消费同一份规划产物。 + - 本轮 targeted regression:`PATH=\"$PWD/.venv/bin:$PATH\" .venv/bin/pytest -q tests/test_autoresearch_planner.py tests/test_openhands_worker.py tests/test_openhands_controlled_backend.py tests/test_git_promotion_gate.py` → `24 passed`。 +- 架构文档债已清理: + - 新增根目录 `ARCHITECTURE.md`,把当前真实主线收口为 `AutoResearch Planner -> OpenHands patch-only -> validation -> Git Promotion Gate -> patch/Draft PR`。 + - 重写 `docs/architecture.md` 为兼容镜像,避免旧文档链接和 `tests/test_completeness.py` 继续指向过时架构稿。 + - 新建 
`memory/SOP/MASFactory_Strict_Execution_v1.md`,把 patch-only、WriterLease、Quarantine -> Cold Validated -> Promoted、clean repo / Draft PR preflight 等执行红线压成短清单。 + - `README.md` 已从“优先读取不存在 SOP”的误导描述改成真实指向:`ARCHITECTURE.md` 总图 + `memory/SOP/MASFactory_Strict_Execution_v1.md` 执行清单。 + - 本轮文档校验:`PATH=\"$PWD/.venv/bin:$PATH\" .venv/bin/pytest -q tests/test_completeness.py -k 'test_file_structure or test_documentation'` → `2 passed`,仅有测试文件自身遗留的 `PytestReturnNotNoneWarning`。 +- AutoResearch 可视化与调度闭环已接上: + - `autoresearch_plans.py` 在新 plan 落库后会尝试发 Telegram 提醒;如果当前环境没有 HTTPS panel / Mini App 地址,则自动退化为 text-only 通知,不让无效 `web_app` 链接把消息发送搞挂。 + - panel state 已新增 `pending_autoresearch_plans`,内联控制面板也补了 `AutoResearch Plans` 区块与 `Dispatch to OpenHands` 动作。 + - panel dispatch 现在会写统一审计日志,并异步触发 planner -> OpenHands worker 执行链,完成后再回推一条 Telegram dispatch 结果通知。 + - 为此扩展了 panel audit contract,新增 `dispatch` / `autoresearch_plan` 事件类型。 + - 本轮回归:`PATH=\"$PWD/.venv/bin:$PATH\" .venv/bin/pytest -q tests/test_autoresearch_planner.py tests/test_panel_security.py tests/test_openhands_worker.py tests/test_openhands_controlled_backend.py tests/test_git_promotion_gate.py` → `34 passed`。 + - 真实烟雾测试已手工触发:向 allowlisted Telegram UID 发送了一条 AutoResearch 新规划提醒,API 返回 `notification_sent=true`;当前生成的 candidate 是 `scripts/check_prompt_hygiene.py` 上的 TODO hotspot。 + - 真实环境仍有一个配置缺口:`AUTORESEARCH_PANEL_BASE_URL` / Mini App URL 还没指向可公开访问的 HTTPS 地址,所以 Telegram 侧目前只能可靠发送 text-only 提醒,暂时不给 Mini App 按钮。 +- Panel action link 继续收口: + - `PanelAccessService` 新增 tokenized `build_action_url(...)`,Planner 和 managed-skill admin 现在共用同一套“带 action query + 短效 panel token”的 URL 构造逻辑。 + - Telegram 按钮现在按能力分级:有 `mini_app_url` 时发 `web_app`,只有 HTTPS panel URL 时退化为普通 `url` 按钮,再没有就退成 text-only。 + - `.env.example`、`ai_lab.env.example`、`README.md` 都补上了 `AUTORESEARCH_TELEGRAM_BOT_TOKEN`、`AUTORESEARCH_TELEGRAM_ALLOWED_UIDS`、`AUTORESEARCH_PANEL_JWT_SECRET`、`AUTORESEARCH_PANEL_BASE_URL`、`AUTORESEARCH_TELEGRAM_MINI_APP_URL` 的最小配置说明。 + - 
本轮扩展回归:`PATH=\"$PWD/.venv/bin:$PATH\" .venv/bin/pytest -q tests/test_autoresearch_planner.py tests/test_panel_security.py tests/test_admin_managed_skills.py tests/test_openhands_worker.py tests/test_openhands_controlled_backend.py tests/test_git_promotion_gate.py` → `41 passed`。 +- Planner -> Panel -> Worker live 闭环继续加固: + - 严格链现在允许受控 `scripts/**` 写入:`HARD_POLICY`、`configs/agents/openhands.yaml`、`configs/agents/mock.yaml` 已对齐,避免 planner 扫到 `scripts/` 热点却在执行层被 allowed_paths 错杀。 + - `OpenHandsWorkerService` 会把 `pytest ...` 验证命令收口成 `sys.executable -m pytest ...`,修掉 daemon / background task 环境下 bare `pytest` 找不到的问题。 + - `drivers/mock_adapter.sh` 已改成尊重 `allowed_paths` 与 validator 目标;当目标是缺失测试文件时,会优先生成一个最小 passing pytest 文件,不再写死 `src/mock_agent_output.py` 造成越界。 + - Telegram 通知链新增发送降级:先尝试 `web_app` 按钮,若 Telegram 拒绝当前 Mini App 域名,则自动回退到普通 HTTPS `url` 按钮,再失败才退为 text-only;admin managed-skill promotion 与 AutoResearch plan 提醒都用同一策略。 + - 本轮综合回归:`PATH=\"$PWD/.venv/bin:$PATH\" .venv/bin/pytest -q tests/test_agent_policy_merge.py tests/test_openhands_worker.py tests/test_mock_adapter.py tests/test_autoresearch_planner.py tests/test_admin_managed_skills.py tests/test_panel_security.py tests/test_openhands_worker_strict_chain.py tests/test_openhands_controlled_backend.py tests/test_git_promotion_gate.py` → `50 passed`。 + - 真实烟雾验证: + - 新 plan `plan_6533fec668ab` 创建时返回 `notification_sent=true`,说明 Telegram 在 `web_app` 被拒后已成功自动降级发送。 + - 公网 Panel token dispatch 后,plan 最终进入 `dispatch_status=dispatched` / `run_final_status=ready_for_promotion` / `promotion_success=true`,patch 输出在 `.masfactory_runtime/runs/plan_6533fec668ab-72bf9fe893fb/artifacts/promotion.patch`。 + - 当前真实执行 backend 仍未接上可用 OpenHands CLI;live 跑通依赖两次 `openhands` contract_error 后自动回退到 `mock` agent,第 3 次尝试成功产出受控 patch。下一步若要从“可控演示闭环”升级为“真实自动修复闭环”,需要补齐 `OPENHANDS_CMD` / runtime 实装。 +- 真实 OpenHands CLI 接入今天有了实锤进展: + - 使用独立 Python 3.12 工具 venv 成功安装 `OpenHands CLI 1.5.0` 到 `.masfactory_runtime/tools/openhands-cli-py312/`,本机 
`python3.14` 不满足其版本要求。 + - `scripts/openhands_start.sh` 现已支持: + - 自动加载 `ai_lab.env` + - `host` 模式下自动生成 `agent_settings.json` + - 启动前 `cd` 到目标 worktree + - `ai-lab` 模式默认调用容器内 `openhands`,不再错误复用宿主机 binary 路径 + - `OPENHANDS_HOME_DIR` 可透传到 `launch_ai_lab.sh` / compose + - 实跑验证得出的关键事实: + - `OpenHands CLI 1.5.0` 支持 `-t` / `--headless` / `--exp` + - 该版本不支持 `--json`,所以 launcher 默认已改为不带 `--json` + - plain `--headless` 会完成任务但停在提示符;`--exp --headless` 会自动输出 conversation summary 并退出,适合作为 pipeline worker + - 本轮 launcher / controlled backend targeted regression:`PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_openhands_launcher.py tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py tests/test_openhands_controlled_backend.py` → `18 passed`。 + - 真实 host smoke: + - 直接 launcher smoke 已在 `.masfactory_runtime/smokes/openhands-host-smoke.UvoXT6/` 成功写入 `smoke_result.txt`,内容为 `READY_FOR_PROMOTION` + - `--exp --headless` smoke 已在 `.masfactory_runtime/smokes/openhands-exp-smoke.BT0rLs/` 成功自动创建 `smoke_exp.txt` 且进程自行退出 + - shell 版 controlled backend 也补了两个真实缺口: + - `scripts/openhands_controlled_backend.sh` 的 rsync snapshot 现在排除 `.masfactory_runtime` 与 `logs`,避免复制运行产物导致递归膨胀 + - promotion diff 现改为 `filtered baseline -> isolated workspace`,不再把排除目录误算成整批删除 + - 真实 controlled smoke 已自动收口成功: + - `run_id=real-smoke-222341` + - 任务是创建 `src/openhands_real_smoke.py` 并定义 `VALUE = 'READY_FOR_PROMOTION'` + - 输出 `promotion_ready=true` + - patch 在 `.masfactory_runtime/smokes/real-smoke-222341/artifacts/promotion.patch` + - 仍存在的外部 blocker: + - `ai-lab` 容器链尚未打通,不是仓库代码问题,而是当前用户 `iCloud_GZ` 无法访问 `DOCKER_HOST=unix:///Users/shoushu/.colima/default/docker.sock`;socket 权限属于 `shoushu`,导致 `docker info` 在该 Colima 路径下不可用 +- patch 归一化缺口也已补平: + - `scripts/openhands_controlled_backend.sh` 现在会在 `git diff --no-index` 后做 header/path 清洗,把 shell 版 controlled backend 的 patch 统一改写为 `diff --git a/... 
b/...`、`--- a/...`、`+++ b/...` 的相对路径格式 + - 最新 smoke `run_id=real-smoke-223336` 仍为 `promotion_ready=true`,且产物 patch 头已是 `diff --git a/src/openhands_real_smoke.py b/src/openhands_real_smoke.py` + - 截至当前,剩余唯一明确 blocker 是 Colima / Docker socket 权限问题,repo 内路径、launcher、headless auto-exit、controlled backend baseline/diff、patch normalization 都已打通 +- Colima socket 这条“最后一公里”今天也补上了安全回退: + - `scripts/launch_ai_lab.sh` 现在不会把所有 Colima fallback 都笼统叫成 `repo-managed`;若外置盘 Colima store 可用,则日志明确为 `repo-managed Colima is ready`,否则会回退到当前用户自己的 `~/.colima/` 并输出 `current-user Colima is ready` + - 新增 `tests/test_launch_ai_lab.py::test_launch_ai_lab_can_fallback_to_current_user_colima`,覆盖“无外置 store 时直接启动当前用户 Colima profile”的分支 + - live 验证:`AUTO_OPEN_DOCKER=0 bash ./scripts/launch_ai_lab.sh status` 在当前机器上先识别到不可访问的 `/Users/shoushu/.colima/default/docker.sock`,随后安全回退到 `unix:///Users/iCloud_GZ/.colima/default/docker.sock` 并成功返回 status;全程未放宽 socket 权限 +- `ai-lab` 容器链继续往前打,今天又暴露并补掉了两层更深的集成问题: + - 之前 compose 直接把 Colima socket 当单文件 bind mount 到 `/var/run/docker.sock`,在当前用户 socket 路径上会触发 `mkdir ...docker.sock: operation not supported`;现已改成“挂父目录 + 容器内用目录中的 socket 路径”,对应修改在 `scripts/launch_ai_lab.sh` 和 `sandbox/ai-lab/docker-compose.yml` + - `sandbox/ai-lab/Dockerfile` 已锁定到与宿主机一致的 `OpenHands CLI 1.5.0`,并移除了其实没被使用的 `software-properties-common`,避免容器冷启动漂到未验证新版本,也减少一大串 `dbus/packagekit/systemd` 依赖 + - live 验证继续推进: + - 当前用户默认 `~/.colima/default` profile 虽能起 Docker,但看不到 `/Volumes/AI_LAB` 的真实内容,容器里对应 bind mount 是空目录 + - 因此 current-user fallback 现已在 `colima start` 时显式追加 `--mount /Volumes/AI_LAB:w` + - 另外用 `COLIMA_PROFILE=ai-lab` 安全拉起了独立 profile,成功得到 `unix:///Users/iCloud_GZ/.colima/ai-lab/docker.sock`,避免去改现有 default profile + - 在这个独立 profile 下,`ai-lab` 路径已经越过 socket 权限与 compose socket mount 错误,进入真实镜像冷启动;当前剩余成本主要是该 profile 的首次镜像下载/构建时间 diff --git a/memory/2026-03-29.md b/memory/2026-03-29.md new file mode 100644 index 00000000..c38b8401 --- /dev/null +++ b/memory/2026-03-29.md @@ -0,0 +1,381 @@ +## OpenClaw 
upstream sync probe + +- Added `scripts/sync_openclaw_upstream.sh` to shallow-clone the latest OpenClaw upstream into `/Volumes/AI_LAB/ai_lab/workspace`. +- Executed `bash scripts/sync_openclaw_upstream.sh` successfully. +- Clone landed at `/Volumes/AI_LAB/ai_lab/workspace/openclaw-upstream.MGRUmT`. +- Latest upstream commit in the shallow clone was `b23ed7530b82d56bcc80548a61b5daeab3e0cd50` (`chore: ignore local tmp workspace`, 2026-03-29 00:51:03 +0000). +- Because the clone is `depth=1`, local history inspection is intentionally limited. Used GitHub commit metadata to inspect recent upstream movement. +- Recent upstream commits were mostly channel/runtime fixes: + - `acca306` `fix: polish LINE status snapshot checks (#45701)` + - `03941e2` `fix(line): use configured field in collectStatusIssues instead of raw token` + - `5ebccf5` `test: harden zalo webhook lifecycle tests` + - `9e1b524` `fix: break mattermost runtime cycle` +- Initial judgment: upstream activity looks concentrated on LINE/Zalo/Mattermost integration robustness, not obviously on our current highest-priority stack areas (`AutoResearch`, approval surface, OpenHands execution chain). +- Follow-up option: tighten the clone command with `--single-branch --no-tags --filter=blob:none` while keeping `depth=1`, and optionally bake remote commit metadata lookup into the script for better trend analysis. + +## Upstream watcher hardening + +- Added `src/autoresearch/core/services/upstream_watcher.py` as an isolated upstream reconnaissance service for OpenClaw. +- Planner contract now supports `include_upstream_watch`, persists `upstream_watch` results, and can send low-noise Telegram skip reports when upstream changes are judged irrelevant to core infra. +- `src/autoresearch/api/routers/autoresearch_plans.py` now sends a dedicated skip notification: + - `🛡️ 已完成上游巡检,最新变更(... 
修复)与核心基建无关,已自动拦截跳过。` +- `scripts/sync_openclaw_upstream.sh` now: + - fetches enough history to inspect recent commits properly, + - prints `origin/main` recent commits/touched files, + - defaults to cleaning `/Volumes/AI_LAB/ai_lab/workspace/openclaw-upstream.*` after analysis. +- Added tests: + - `tests/test_upstream_watcher.py` + - `tests/test_sync_openclaw_upstream.py` + - expanded `tests/test_autoresearch_planner.py` +- Validation: + - `12 passed` for planner/upstream/script tests + - `2 passed` for completeness docs checks, with the existing `PytestReturnNotNoneWarning` in `tests/test_completeness.py` +- Live smoke: + - Ran `OPENCLAW_SYNC_MAX_COMMITS=3 bash ./scripts/sync_openclaw_upstream.sh` + - Latest upstream head observed during smoke: `f9b1079 build: cut 2026.3.28 stable` + - Script printed recent commits/files and then cleaned the temp clone successfully; `find /Volumes/AI_LAB/ai_lab/workspace -maxdepth 1 -type d -name 'openclaw-upstream.*'` returned nothing. + +## Manager Agent bootstrap + +- Added `src/autoresearch/agents/manager_agent.py` as the first business-layer “manager” agent. +- Added `src/autoresearch/shared/manager_agent_contract.py` for dispatch request/result models. +- Added `src/autoresearch/api/routers/manager_agent.py` and mounted it in `src/autoresearch/api/main.py`. +- New API: + - `POST /api/v1/agents/manager/dispatch` + - `GET /api/v1/agents/manager/dispatches` + - `GET /api/v1/agents/manager/dispatches/{dispatch_id}` +- Behavior: + - Accepts vague founder prompts. + - Routes them into bounded intents (`game_prototype`, `telegram_surface`, `approval_surface`, `worker_execution`, fallback `generic_product`). + - Builds `OpenHandsWorkerJobSpec`, controlled request, and AEP `JobSpec`. + - Can background-dispatch through the existing runner. +- Validation: + - `11 passed` for `tests/test_manager_agent.py tests/test_autoresearch_planner.py` + - completeness docs checks still `2 passed` with the pre-existing `PytestReturnNotNoneWarning`. 
+- Lightweight real smoke on current repo: + - Prompt: `我想做个小游戏,先在现有 panel 里做一个最小可玩的版本。` + - Result: + - `intent=game_prototype` + - allowed paths included `panel/**`, `src/autoresearch/api/routers/panel.py`, `src/autoresearch/api/routers/openclaw.py`, `tests/test_panel_security.py` + - `test_command=pytest -q tests/test_panel_security.py` + +## Manager Agent promoted to Project Manager + +- Upgraded `src/autoresearch/shared/manager_agent_contract.py` to include: + - `ManagerExecutionPlanRead` + - `ManagerPlanTaskRead` + - `ManagerPlanStrategy` + - `ManagerTaskStage` +- Reworked `src/autoresearch/agents/manager_agent.py` so complex prompts now generate a DAG-style execution plan instead of a single worker contract. +- Complex prompts now decompose into ordered stages: + - `backend` + - `tests` + - `frontend` +- `POST /api/v1/agents/manager/dispatch` now returns the full decomposition tree through `execution_plan.tasks`, while preserving the first task at top level for compatibility. +- `execute_dispatch()` now runs plan tasks sequentially and stops on the first failed stage, persisting per-task status and `run_summary`. 
+- Added/updated validation in `tests/test_manager_agent.py`: + - simple prompt stays `single_task` + - complex dashboard prompt becomes `task_dag` + - API background execution completes all three plan stages +- Validation: + - `3 passed` for `tests/test_manager_agent.py` + - completeness docs checks still `2 passed` with the pre-existing `PytestReturnNotNoneWarning` +- Live decomposition smoke on current repo: + - Prompt: `在 Admin Panel 里加一个带图表的实时服务器资源监控大屏。` + - Result: `strategy=task_dag` + - Tasks: + - `backend`: `src/autoresearch/api/routers/admin.py`, `src/autoresearch/core/services/**` + - `tests`: `tests/test_panel_security.py`, `tests/test_admin_managed_skills.py` + - `frontend`: `src/autoresearch/api/routers/panel.py`, `panel/**` + +## Agent Audit Trail module + +- Added `src/autoresearch/core/services/agent_audit_trail.py` to aggregate recent worker activity from: + - manager DAG task runs + - autoresearch planner dispatch runs + - Claude/OpenClaw agent scheduler runs + - runtime artifacts under `.masfactory_runtime/**` and `logs/audit/openhands/jobs/**` +- Wired new admin API `GET /api/v1/admin/audit-trail` through `get_agent_audit_trail_service()`. +- Extended the inline Admin Panel page in `src/autoresearch/api/routers/admin.py` with an `Agent Audit Trail` table that shows recent run status, duration, scope, and changed paths. 
+- Added admin integration coverage in `tests/test_admin_backend.py`: + - bearer token guard now covers `/api/v1/admin/audit-trail` + - admin HTML check now asserts the new panel section and endpoint wiring + - seeded planner + manager + claude + runtime artifacts to verify the aggregated snapshot +- Validation: + - `9 passed` for `tests/test_admin_backend.py` + - `14 passed` for `tests/test_manager_agent.py tests/test_autoresearch_planner.py tests/test_completeness.py -k 'test_file_structure or test_documentation or test_manager_agent or test_autoresearch'` + - only the existing `PytestReturnNotNoneWarning` remains from `tests/test_completeness.py` + +## Agent Audit Trail drill-down + filtering + +- Upgraded the admin audit surface from a flat table into a filterable + inspectable control plane. +- `GET /api/v1/admin/audit-trail` now accepts: + - `status_filter=success|failed|pending|running|review` + - `agent_role=manager|planner|worker` +- Added `GET /api/v1/admin/audit-trail/{entry_id}` to return a rich detail payload with: + - full input prompt + - serialized `JobSpec` + - serialized worker spec / controlled request + - patch text preview + - error reason / traceback + - raw merged source record +- Admin inline page now includes: + - top-level status + role filters + - per-row `查看` drill-down action + - inline detail pane for input context, patch diff, and failure details +- `tests/test_admin_backend.py` now verifies: + - filtered failed worker view + - detail endpoint response + - patch diff visibility + - failed runtime traceback visibility +- Validation: + - `9 passed` for `tests/test_admin_backend.py` + - `14 passed` for `tests/test_manager_agent.py tests/test_autoresearch_planner.py tests/test_completeness.py -k 'test_file_structure or test_documentation or test_manager_agent or test_autoresearch'` + - only the existing `PytestReturnNotNoneWarning` remains from `tests/test_completeness.py` + +## Telegram `/task issue ...` -> Manager Agent -> GitHub reply 
approval + +- Added `src/autoresearch/core/services/github_issue_service.py` as a thin `gh issue` wrapper. + - Supports issue refs in three forms: + - full URL + - `owner/repo#123` + - `#123` resolved against local `origin` + - Can read issue title/body/comments and post a comment back through `gh issue comment` +- Wired the Telegram gateway to a new explicit command: + - `/task <需求>` + - `/task issue <issue-ref> [补充说明]` +- Routing change in `src/autoresearch/api/routers/gateway_telegram.py`: + - ordinary Telegram text still goes to the Claude/OpenClaw chat path + - explicit `/task` now routes into `ManagerAgentService` + - issue tasks fetch GitHub issue context first, then synthesize a bounded manager prompt +- Execution flow for `/task issue ...` is now: + - Telegram command accepted + - manager dispatch created and executed in background + - Telegram receives queued/result messages + - if the task originated from a GitHub issue, a pending approval is created to comment back externally + - `/approve <approval-id> approve` will post the prepared summary back to the issue +- Help text now advertises `/task` and `/task issue ...` +- Added test coverage: + - `tests/test_github_issue_service.py` + - `tests/test_gateway_telegram.py` + - help output includes `/task` + - `/task issue ...` dispatches manager + creates pending GitHub reply approval + - `/approve ... approve` posts the prepared GitHub comment +- Validation: + - `25 passed` for `tests/test_github_issue_service.py tests/test_gateway_telegram.py` + - `34 passed` for `tests/test_manager_agent.py tests/test_completeness.py tests/test_github_issue_service.py tests/test_gateway_telegram.py` + - only the existing `PytestReturnNotNoneWarning` remains from `tests/test_completeness.py` + +## Telegram runtime notes + +- `gh auth status` is healthy on this machine with `repo` scope, so GitHub issue read/comment prerequisites are present.
+- Local Telegram webhook self-check against API `:8000` is healthy for: + - `/api/v1/gateway/telegram/webhook` + - `getMe` + - allowlisted UID routing + - `/status` +- Fixed poller config drift: + - `migration/openclaw/scripts/start-telegram-poller.sh` now defaults to API port `8000` + - it now loads env from repo `.env` / `.env.local` as well as `migration/openclaw/.env.local` + - it exports a consistent local webhook URL + - `migration/openclaw/scripts/telegram_poller_bridge.py` now accepts both `AUTORESEARCH_TELEGRAM_BOT_TOKEN` and `TELEGRAM_BOT_TOKEN` +- Live runtime observation: + - poller log now shows the correct bot `@ccclife_agent_bot` and successfully forwarded at least one real update with `accepted=True` + - however the daemon PID recorded by `start-telegram-poller.sh` still goes stale under the current shell/tool environment, so long-lived background persistence is not fully trustworthy yet even though the webhook path itself is working +- Current repo has no open GitHub issues (`gh issue list -L 5 --json number,title,url,state` returned `[]`), so a real `/task issue #123` smoke was not executed against production issue data today. + +## Live runtime reality check after GPT-style critique + +- Inspected the currently running API on `http://127.0.0.1:8000`. +- Live server does serve Telegram/OpenClaw routes, but `GET /api/v1/agents/manager/dispatches` returned `404`, so the running process is not yet exposing the newest manager-agent route set from the current checkout. +- Live Telegram session evidence showed only ordinary Claude/OpenClaw chat traffic: + - `self-check ping` + - `没发现待审批事项,是不是哪里不对` +- No live manager dispatches or GitHub-issue-driven approvals were present in the running process at inspection time. 
+- This means the "real commercial chaos test" has not actually started yet: + - no live `/task issue ...` execution + - no live manager DAG run + - no draft PR produced from the new Telegram issue path +- Practical implication: + - code for `/task issue ...` is implemented and tested in-repo + - but the currently running API/poller stack must be restarted onto the new code before a real end-to-end chaos review can happen + +## Industrial chaos run: live Telegram `/task issue` trial + +- Restarted the live API on `127.0.0.1:8000` and re-verified: + - `/api/v1/agents/manager/dispatch` + - `/api/v1/admin/audit-trail` + - `/api/v1/gateway/telegram/webhook` +- Created a real GitHub issue for the pressure test: + - `#12 Chaos Run: 玛露遮瑕膏落地页商业化压力测试` +- First live trigger: + - `/task issue #12 优先做最小可运行版本,并保留业务风险说明` + - `dispatch_id=mgrdispatch_d028ca08b98f` +- Findings from the first dispatch: + - Manager routing is live, but intent selection is wrong for this kind of business request. + - The landing-page ask was misclassified as `worker_execution`, so the generated scope was infra-only: + - `src/autoresearch/core/services/openhands_worker.py` + - `src/autoresearch/core/services/autoresearch_planner.py` + - related infra tests + - Execution reached the worker layer and produced a full failure record: + - `summary.json` final status: `human_review` + - mock fallback eventually produced a synthetic patch + - validation then failed because the patch broke imports and `pytest` could not import `OpenHandsWorkerService` + - Host-runtime blocker before fallback: + - the run inherited an inaccessible `DOCKER_HOST` from local `ai_lab.env` + - worker logs showed the host socket path belonged to another local user and could not be used + - Control surface behaved correctly: + - the run did not write back to GitHub automatically + - instead it created a pending GitHub reply approval: + - `apr_9f2c756274cb` +- Local runtime repair after the first dispatch: + - Updated ignored local 
`ai_lab.env` to point at the current-user `ai-lab` Colima socket + - Added `COLIMA_PROFILE=ai-lab` + - Hardened `scripts/launch_ai_lab.sh` readiness detection: + - switched from `docker info` to `docker version --format '{{.Server.Version}}'` + - reason: `docker ps` / `docker version` were responsive here, while `docker info` could hang and make Docker look unavailable + - Verified the fix with: + - `AUTO_OPEN_DOCKER=0 bash ./scripts/launch_ai_lab.sh status` + - result: `current-user Colima is ready via unix:///Users/iCloud_GZ/.colima/ai-lab/docker.sock` +- Second live trigger: + - `/task issue #12 runtime 修复后重试,继续保留业务风险说明` + - `dispatch_id=mgrdispatch_8e2c9fe5073a` +- Findings from the second dispatch: + - The same intent-routing bug remained; request was still classified as `worker_execution` + - Runtime progressed deeper than the first attempt: + - it reached `docker-compose ... run --rm ai-lab ...` + - it no longer failed immediately on the stale socket + - At the last inspection, this retry had not yet produced `summary.json` / `driver_result.json` + - No compose container was visible yet from the host, so the remaining blocker appears to be in the compose/container launch phase rather than the original host-socket bootstrap +- Practical conclusion from the live trial: + - Telegram `/task issue ...` is genuinely wired into manager dispatch creation + - external GitHub write-back is correctly gated behind approval + - the most important business-layer gap is intent routing quality + - the most important runtime gap after the socket fix is the compose/container execution phase for real OpenHands runs + +## Live follow-up: manager routing fix + ai-lab state dir fix + +- Fixed manager intent selection for business landing-page requests: + - added `product_landing_page` intent to `src/autoresearch/agents/manager_agent.py` + - keyword coverage now includes `落地页 / landing page / 预约 / 留资 / 品牌 / 美妆 / marketing` + +## Strict-view watchdog hardening + live retry + +- 
Hardened `src/autoresearch/executions/runner.py` with a no-progress watchdog on top of the earlier strict-view workspace lock: + - the runner now treats meaningful progress as either: + - actual file changes inside `allowed_paths`, or + - state files appearing/updating under `.openhands-state/**` + - if neither happens for a bounded window, the adapter is terminated early instead of waiting for the full OpenHands timeout. +- Added a new watchdog timeout helper: + - default stall window is derived from total timeout and currently clamps to `<= 180s` + - for the OpenHands worker path this now lands at roughly `105s`, because the worker default timeout was tightened to `420s` +- Tightened `OpenHandsWorkerService.build_agent_job_spec()` so OpenHands jobs now explicitly set: + - `policy.timeout_sec = 420` + - this replaces the previous implicit `900s` manifest default for manager-generated OpenHands work +- Added/updated tests: + - `tests/test_openhands_worker.py` + - `tests/test_openhands_worker_strict_chain.py` + - regression now includes: + - strict shadow-workspace denial for out-of-scope writes + - fast-fail on broken Python syntax + - stall watchdog abort for an adapter that only sleeps and never touches the workspace +- Validation: + - `26 passed` for: + - `tests/test_openhands_worker.py` + - `tests/test_openhands_worker_strict_chain.py` + - `tests/test_openhands_launcher.py` + - `tests/test_launch_ai_lab.py` + - `tests/test_openhands_controlled_backend.py` + +## Live retry after watchdog rollout + +- Restarted the live API on `127.0.0.1:8000` against the current checkout using: + - `PYTHONPATH="$PWD/src" .venv/bin/python -m uvicorn autoresearch.api.main:app --host 127.0.0.1 --port 8000` +- Live manager retry executed directly against issue `#12`: + - `dispatch_id=mgrdispatch_5e3965853709` +- Result: + - intent routing is now correct: + - `selected_intent = product_landing_page` + - first backend task: + - `task_id = mgrdispatch_5e3965853709-backend` + - 
`allowed_paths = src/autoresearch/api/routers/openclaw.py, src/autoresearch/api/**, src/autoresearch/core/services/**` + - `policy.timeout_sec = 420` +- Runtime behavior: + - `attempt 1 (openhands)` ended as `timed_out` + - `attempt 2 (openhands)` ended as `timed_out` + - both failures happened before any scoped workspace change was observed, which is exactly the new watchdog target + - runner then continued into `attempt 3 (mock)` per existing fallback policy +- Final dispatch outcome: + - manager status: `failed` + - backend `run_summary.final_status = human_review` + - final driver result came from mock fallback: + - `agent_id = mock` + - `status = succeeded` + - `changed_paths = ["src/autoresearch/api/routers/openclaw.py"]` + - validation still failed: + - `worker.test_command` failed while importing `autoresearch.api.main` + - no promotion / draft PR was produced +- Practical conclusion: + - strict-view + watchdog materially improved the real chain: + - OpenHands no longer burns ~10 minutes spinning without touching the workspace + - instead it now gets cut off after about 105 seconds of zero progress + - the next blocker is no longer “infinite idle OpenHands” + - the next blocker is quality after fallback and, more broadly, getting real OpenHands to produce first scoped edits before the watchdog window expires + - issue-style prompts like `#12 Chaos Run: 玛露遮瑕膏落地页商业化压力测试` now route to `product_landing_page` instead of `worker_execution` + - new manager regression coverage added in `tests/test_manager_agent.py` +- Repaired the ai-lab OpenHands state-directory mount: + - `scripts/openhands_start.sh` no longer places `OPENHANDS_PERSISTENCE_DIR` under read-only `/workspace/...` + - ai-lab runtime now writes persistence under writable `/opt/workspace/.openhands-state/` + - launcher regression coverage added in `tests/test_openhands_launcher.py` +- Validation after these fixes: + - `tests/test_manager_agent.py` -> `4 passed` + - `tests/test_launch_ai_lab.py` -> `3 
passed` + - `tests/test_openhands_launcher.py` -> `6 passed` + - `tests/test_github_issue_service.py tests/test_gateway_telegram.py` -> `25 passed` +- Third live trigger after the fixes: + - `/task issue #12 ai-lab state 写目录修复后重试,继续保留业务风险说明` + - `dispatch_id=mgrdispatch_c16616a138d3` +- New live outcome: + - manager routing was corrected in production: + - selected intent: `product_landing_page` + - DAG stages: `backend -> tests -> frontend` + - backend scope now targeted: + - `src/autoresearch/api/routers/openclaw.py` + - `src/autoresearch/api/**` + - `src/autoresearch/core/services/**` + - test scope now targeted: + - `tests/test_panel_security.py` + - `tests/test_admin_backend.py` + - frontend scope now targeted: + - `src/autoresearch/api/routers/panel.py` + - `panel/**` + - the previous read-only `state` crash was gone + - a real `ai-lab` container launched and OpenHands (GLM-5) started working instead of falling back immediately + - container logs showed: + - `Initializing agent...` + - `✓ Agent initialized with model: openai/glm-5` + - then continuous `Agent is working` + - OpenHands conversation traces confirmed real repo exploration and edits, including a write to `src/autoresearch/shared/models.py` +- Final verdict for the third run: + - the run was manually stopped after it kept running for ~633s without clean completion + - after stop, AEP captured a real driver result (not mock): + - `agent_id=openhands` + - `status=failed` -> promoted to `policy_blocked` + - changed paths: + - `src/autoresearch/api/dependencies.py` + - `src/autoresearch/api/main.py` + - `src/autoresearch/api/routers/landing_pages.py` + - `src/autoresearch/core/services/landing_page_leads.py` + - `src/autoresearch/shared/models.py` + - because `src/autoresearch/shared/models.py` was outside `allowed_paths`, built-in path validation blocked promotion: + - `builtin.allowed_paths = failed` + - dispatch final status: `blocked` + - the synthetic backend it drafted was directionally sensible 
(new landing-page router + lead service), but the concrete code still failed imports: + - `ImportError: cannot import name 'ModelRepository' from 'autoresearch.shared.store'` +- Updated diagnosis after the third run: + - business routing is no longer the top blocker + - ai-lab persistence mount is no longer the top blocker + - the new top blockers are: + - real OpenHands can spend a long time exploring before converging + - runtime scope control is still post-hoc validation, not proactive filesystem isolation + - when OpenHands does generate code, it may edit out-of-scope files and still fail basic import-level validation diff --git a/memory/2026-03-30.md b/memory/2026-03-30.md new file mode 100644 index 00000000..51da7509 --- /dev/null +++ b/memory/2026-03-30.md @@ -0,0 +1,621 @@ +## 2026-03-30 + +- Implemented three roadmap hardening items around real OpenHands stall handling instead of expanding scope: + - split runner no-progress kills into `stalled_no_progress` instead of overloading `timed_out` + - added first-progress metrics to `DriverMetrics`: `first_progress_ms`, `first_scoped_write_ms`, `first_state_heartbeat_ms` + - skipped redundant `retry` fallback steps after a zero-progress stall, falling through directly to fallback agent / human review +- Updated audit trail aggregation and admin detail surface to expose the new timing metrics and raw stalled status. 
+- Added regression coverage for: + - stalled watchdog semantics + - first state/write progress timing capture + - skipping duplicate real-agent retries after stall + - audit trail serialization of first-progress metrics +- Targeted regression suite passed: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_openhands_worker_strict_chain.py tests/test_agent_fallbacks.py tests/test_admin_backend.py tests/test_agent_runner_outcomes.py` + - result: `18 passed in 33.62s` +- Current project state is still best described as an engineering-validation agent stack, not production: + - failure modes are now more diagnosable and cheaper + - biggest remaining product blockers are still intent routing accuracy, runtime pre-flight consistency, and a true business-task closed loop +- Follow-up hardening completed in the same session: + - tightened Manager intent routing so direct business prompts like "玛露 6g 遮瑕膏落地页 + 浅色品牌 UI + 留资接口" route to `product_landing_page` instead of infra-oriented `worker_execution` + - added runner-side OpenHands environment preflight for the real `drivers/openhands_adapter.sh` path (non-dry-run only), returning `EnvironmentCheckFailed: ...` before dispatching into a dirty ai-lab runtime + - kept strict-view implementation as-is because it already existed in both AEP runner and controlled backend; validated via existing strict-chain tests rather than rewriting it +- Additional verification passed: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_manager_agent.py tests/test_agent_runner_outcomes.py tests/test_openhands_worker_strict_chain.py tests/test_agent_fallbacks.py tests/test_admin_backend.py` + - result: `24 passed in 33.44s` +- Lightweight live checks after the code changes: + - `AUTO_OPEN_DOCKER=0 COLIMA_PROFILE=ai-lab bash ./scripts/launch_ai_lab.sh status` -> `environment ready` + - direct Manager instantiation in current repo now routes the Malu landing-page prompt to `product_landing_page` with a `task_dag` 
backend/tests/frontend plan +- Still not claimed as solved: + - no full live chaos rerun yet for the Malu business task after these exact changes + - current repo remains an engineering-validation stack, not a proven commercial delivery system +- Full live chaos rerun completed afterward via the real local Telegram webhook path: + - started fresh uvicorn on `127.0.0.1:8000` with the latest `manager_agent.py` and `runner.py` + - injected `/task 给我做一个玛露 6g 罐装遮瑕膏单页。要求:优雅浅色背景,专业去工厂化文案,提供留资接口和手机号正则校验,不得在日志里打印明文手机号。` + - webhook acknowledged with session `oc_7118a29c1476` and manager dispatch `mgrdispatch_845484bfd4a3` +- Live run outcome for `mgrdispatch_845484bfd4a3`: + - routing gate passed: Manager selected `product_landing_page` and created a `backend -> tests -> frontend` DAG + - environment gate passed: runner preflight succeeded and real OpenHands launched in `ai-lab` + - real backend attempt (`mgrdispatch_845484bfd4a3-backend`, attempt 1, `openhands`) ended as `stalled_no_progress` + - redundant real retry was skipped as designed (`fallback_skipped` with reason `stalled_no_progress`) + - fallback attempt 2 (`mock`) succeeded in producing a synthetic patch, but validation failed and the manager stopped at `human_review` +- Important artifacts from the live run: + - backend events show `attempt_started -> attempt_completed(stalled_no_progress) -> fallback_skipped(retry) -> attempt_started(mock) -> attempt_completed(mock succeeded)` + - backend promotion patch existed at `.masfactory_runtime/runs/mgrdispatch_845484bfd4a3-backend/artifacts/promotion.patch` + - strict allowed-path validation passed for the fallback patch (`src/autoresearch/api/routers/openclaw.py` only), so there was no out-of-scope file change in the accepted artifact + - however, the business scope is still not truly aligned with a dedicated `apps/malu`-style surface; the backend task was still constrained to internal API/service paths under `src/autoresearch/...` + - validation failed on 
`worker.test_command` during import/collection, stopping the DAG before tests/frontend stages and preventing promotion / PR creation +- Best current reading after the live chaos run: + - the new routing and preflight hardening did their jobs + - the stack no longer died on stale runtime configuration or on the old business-vs-infra routing mistake + - the main blocker has moved to business-task scope modeling plus real OpenHands first-write reliability, with mock fallback quality still below PR-ready +- Follow-up scope hardening after reviewing the live chaos artifacts: + - `product_landing_page` no longer defaults to `src/autoresearch/api/routers/openclaw.py`, `src/autoresearch/api/**`, or `panel/**` + - Manager now maps this intent to an isolated business surface rooted at `apps/{surface_slug}/**` with matching `tests/apps/test_{surface_slug}_landing_page.py` + - for the Malu prompt, the derived surface is `apps/malu` + - stage scopes now resolve to: + - backend: `apps/malu/**` + - tests: `tests/apps/test_malu_landing_page.py` + - frontend: `apps/malu/**` + - backend validation command for this intent now compiles `apps/{surface_slug}/lead_capture.py` instead of forcing framework tests before the app surface exists +- Runner / policy changes needed to make the new business surface physically real: + - added `apps/**` to both the default `ExecutionPolicy.allowed_paths` and the global `HARD_POLICY` whitelist so `apps/malu/**` is not intersected away during policy merge + - fixed shadow-workspace writable-target resolution so a missing scoped target does not collapse to the workspace root + - added a writable path-chain helper so the runner can pre-create `apps/malu/` in the shadow workspace without unlocking unrelated repo paths +- Post-mortem note on the original `mgrdispatch_845484bfd4a3` stall: + - `stdout.log` showed only the prompt payload plus the OpenHands spinner + - there were no visible shell commands such as `mkdir`, `ls`, `cat`, or any permission errors 
before the watchdog kill + - best current reading is model-layer no-progress stall, not a filesystem denial during that particular run +- Regression coverage added for the new scope model and shadow-workspace behavior: + - manager landing-page routing now asserts `apps/malu/**` and `tests/apps/test_malu_landing_page.py` + - policy merge now asserts isolated `apps/**` scopes survive hard-policy intersection + - shadow workspace now asserts a new `apps/malu/lead_capture.py` file can be created while `src/forbidden.py` still throws `PermissionError` + - targeted suite passed: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_agent_policy_merge.py tests/test_manager_agent.py tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py` + - result: `20 passed in 21.10s` +- Live chaos rerun performed immediately after the scope / policy fix: + - fresh uvicorn started on `127.0.0.1:8000` from the updated code + - injected the Malu task again through the local Telegram webhook path + - accepted dispatch id: `mgrdispatch_b48ce7969cbb` +- Live rerun facts for `mgrdispatch_b48ce7969cbb`: + - routing gate passed with the new business surface model: + - `selected_intent = product_landing_page` + - `surface_root = apps/malu` + +## PR triage snapshot (excluding #13) + +- Reviewed open PRs `#15 #14 #11 #10 #7` in `srxly888-creator/autonomous-agent-stack`. 
+- `#14` and `#15` are duplicate PRs: + - both change only `scripts/quick_status.py` + - both point to the same head SHA `7bd0e227349137a00c7433ce5f0e59839d015e32` + - `#15` title/content mismatch (`玛露遮瑕膏落地页` title but quick-status script diff), so it should likely be closed in favor of `#14` +- `#10`, `#11`, `#14`, and `#15` have no review comments and are blocked only by small lint failures: + - `src/autoresearch/executions/runner.py`: unused `last_patch_filtered_paths` + - `tests/test_openhands_controlled_backend.py`: unused `import os` on branches that still include it (`#10`, `#14`, `#15`) +- `#11` is the only non-duplicate feature PR that looks worth salvaging immediately; its visible CI blocker is just the `runner.py` unused variable. +- `#7` is effectively obsolete: + - head branch `codex/pr-review-hardening` was already merged to `main` as PR `#8` + - current `#7` still targets old base `codex/openhands-controlled-backend` + - fixing its 360 lint errors is lower value than closing it + +## Live rerun after apps-scope + mock-fallback fixes + +- Verified current working-tree fixes for the two previously reported blockers: + - `configs/agents/openhands.yaml` now includes `apps/**` in `policy_defaults.allowed_paths` + - `drivers/mock_adapter.sh` now uses `Optional[Path]` and validator-target inference for globbed `apps/**` scopes +- Focused regression suites passed: + - `25 passed` for `tests/test_agent_policy_merge.py tests/test_mock_adapter.py tests/test_manager_agent.py tests/test_openhands_worker.py tests/test_openhands_launcher.py` + - `31 passed` for `tests/test_openhands_launcher.py tests/test_mock_adapter.py tests/test_openhands_worker.py tests/test_manager_agent.py tests/test_agent_policy_merge.py tests/test_openhands_worker_strict_chain.py` +- Fresh local webhook rerun executed on uvicorn `127.0.0.1:8010` with: + - `/task 给我做一个玛露 6g 罐装遮瑕膏单页。要求:优雅浅色背景,专业去工厂化文案,提供留资接口和手机号正则校验,不得在日志里打印明文手机号。` + - accepted dispatch id: `mgrdispatch_f544f97e49bf` +- What 
the rerun proved: + - the old policy-empty blocker is gone: + - `effective_policy.json` shows `manifest_default.allowed_paths` includes `apps/**` + - `merged.allowed_paths` resolves to `["apps/malu/**"]` + - the old mock fallback syntax blocker is gone: + - fallback produced `driver_result.status = succeeded` + - `changed_paths = ["apps/malu/lead_capture.py"]` + - generated `workspace/apps/malu/lead_capture.py` compiles successfully with `python3 -m py_compile` +- New blocker exposed by the rerun: + - real OpenHands attempt 1 never reached execution because environment preflight failed with: + - `EnvironmentCheckFailed: mkdir: /Users/ai_lab/logs/openhands-home: Permission denied` + - `events.ndjson` shows: + - `attempt_blocked` for `openhands` + - retry skipped due to `environment_preflight_failed` + - fallback attempt 2 (`mock`) started and produced the synthetic app file +- Current best reading: + - the two requested fixes are effective + - the next real blocker is runtime home/log directory writability for the ai-lab OpenHands environment, not policy merge and not mock Python syntax + - backend scope = `apps/malu/**` + - tests scope = `tests/apps/test_malu_landing_page.py` + - frontend scope = `apps/malu/**` + - real OpenHands no longer stalled on misaligned scope; instead it failed fast on a concrete runtime permission bug: + - attempt 1 stderr: `mkdir: cannot create directory ‘/opt/workspace/.openhands-state’: Permission denied` + - attempt 2 repeated the same failure + - backend summary stopped at `human_review` + - manager status = `failed` + - run_final_status = `human_review` + - backend driver status after fallback = `contract_error` +- Additional new blocker exposed by the fallback path: + - mock attempt 3 also failed under the now-isolated app-only scope + - stderr shows `PermissionError` while trying to write `/workspace/src/mock_agent_output.py` + - validation also reports `/bin/sh: python: command not found` for the backend test command because the bare
`python` executable is unavailable in that execution path +- Best current reading after the rerun: + - the business-vs-infra routing problem is fixed for this prompt + - the system still does not complete the business task + - the next concrete blockers are: + - OpenHands persistence path permissions under `/opt/workspace/.openhands-state` + - mock fallback target selection still assuming `src/...` instead of respecting `apps/malu/**` + - bare `python` in manager-generated backend validation should be normalized to the interpreter path used elsewhere +- Follow-up runtime repair for those three blockers: + - moved ai-lab OpenHands persistence for run-as-user mode from the strict-view-mounted workspace path to `${OPENHANDS_RUN_AS_HOME}/state/` in `scripts/openhands_start.sh`, while pre-creating and chmod-opening the state directory before launching `runuser` + - normalized bare `python` / `python3` worker test commands to `sys.executable` inside `OpenHandsWorkerService`, and updated the manager landing-page backend validator to emit `{sys.executable} -m py_compile apps/{slug}/lead_capture.py` + - fixed mock fallback target selection to infer validator-targeted files even when `allowed_paths` is globbed (for example `apps/malu/**`), instead of collapsing back to `src/mock_agent_output.py` + - extended mock default policy to allow `apps/**` so fallback path selection is not intersected away +- Added regression coverage for the runtime repair: + - launcher dry-run now expects the ai-lab persistence path under `/tmp/openhands-home/state/` + - manager landing-page routing test now expects the backend compile command to use the active interpreter + - worker tests now assert both `pytest` and `python -m py_compile ...` normalize to `sys.executable` + - mock adapter tests now assert a globbed `apps/malu/**` scope writes to `apps/malu/lead_capture.py` +- Verification: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_openhands_launcher.py 
tests/test_mock_adapter.py tests/test_openhands_worker.py tests/test_manager_agent.py tests/test_agent_policy_merge.py tests/test_openhands_worker_strict_chain.py` + - result: `30 passed in 23.65s` +- Not yet re-run after these exact fixes: + - no fresh live webhook / Telegram chaos run yet + - no new Draft PR claim yet +- Fresh live chaos rerun executed after the runtime fixes: + - restarted uvicorn on `127.0.0.1:8000` from the latest checkout + - verified `/healthz` and live manager route availability + - injected a real Telegram webhook payload with: + - `玛露 6g 罐装遮瑕膏` + - `挑战游泳级别持妆 / 不脱妆 / 不用调色 / 遮瑕力强` + - `优雅浅色背景` + - `专业去工厂化文案` + - `留资接口 + 手机号正则校验` + - `日志禁打明文手机号` + - accepted dispatch id: `mgrdispatch_cf3305a6b50e` +- What the rerun proved: + - routing is still correct at runtime: + - `intent = product_landing_page` + - `allowed_paths = ["apps/malu/**"]` + - suggested backend entry = `apps/malu/lead_capture.py` + - the previous runtime blockers are genuinely gone from this path: + - no `/opt/workspace/.openhands-state` permission error + - no `/bin/sh: python: command not found` + - validator command in the live prompt uses the active interpreter path +- Final outcome of `mgrdispatch_cf3305a6b50e-backend`: + - real OpenHands attempt 1 still ended as `stalled_no_progress` + - retry was skipped + - fallback attempt 2 (`mock`) failed before producing any change + - manager stopped at `human_review` + - no Draft PR was created +- New concrete blockers exposed by this rerun: + 1. `configs/agents/openhands.yaml` (or equivalent OpenHands manifest defaults) still does not allow `apps/**` + - `effective_policy.json` shows: + - manifest default allowed paths: `src/**`, `tests/**`, `scripts/**` + - job allowed paths: `apps/malu/**` + - merged allowed paths: `[]` + - this means the real OpenHands business-surface task is still being intersected down to an empty writable scope before execution + 2. 
mock fallback crashed under the adapter runtime itself + - `stderr.log` shows: + - `TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'` + - current reading: the inline mock adapter script uses a `Path | None` annotation on a Python runtime that does not support that syntax + 3. backend validator failed because no file was ever created + - `summary.json` check `worker.test_command` failed with: + - `[Errno 2] No such file or directory: 'apps/malu/lead_capture.py'` +- Best reading after this rerun: + - the three earlier runtime blockers were real and are now out of the way + - the stack is still not at business-task closed loop + - the next two code fixes should be: + - add `apps/**` to the OpenHands manifest default allowed paths so merged policy is not emptied + - make the mock adapter inline script syntax compatible with the Python interpreter used in fallback execution +- Applied the two follow-up fixes: + - added `apps/**` to `configs/agents/openhands.yaml` manifest defaults so business-surface jobs do not intersect down to an empty writable scope + - downgraded the inline mock adapter typing from `Path | None` to `Optional[Path]` for the fallback runtime +- Verification after those two fixes: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_agent_policy_merge.py tests/test_mock_adapter.py tests/test_openhands_launcher.py tests/test_openhands_worker.py tests/test_manager_agent.py tests/test_openhands_worker_strict_chain.py` + - result: `31 passed in 23.66s` +- Fresh live webhook rerun with the same Malu payload: + - restarted the live API on `127.0.0.1:8000` + - accepted dispatch id: `mgrdispatch_bdef30eb128c` +- What this rerun proved: + - backend routing stayed correct at runtime: + - intent = `product_landing_page` + - allowed_paths = `apps/malu/**` + - real OpenHands finally wrote into the business surface before stalling: + - first real patch created `apps/malu/lead_capture.py` + - second real retry still ended as 
`stalled_no_progress` + - fallback no longer crashed: + - attempt 3 (`mock`) completed with `status = succeeded` + - validation passed + - promotion succeeded in `patch` mode +- Final backend outcome for `mgrdispatch_bdef30eb128c-backend`: + - `final_status = ready_for_promotion` + - `promotion_success = true` + - produced patch: + - `.masfactory_runtime/runs/mgrdispatch_bdef30eb128c-backend/artifacts/promotion.patch` + - changed files: + - `apps/malu/lead_capture.py` + - `apps/malu/test_lead_capture.py` +- Important limitation from the same run: + - the winning patch came from the `mock` fallback, not from a stable real OpenHands completion + - the patch quality is weak: + - `apps/malu/lead_capture.py` in the final patch is a synthetic `run()` stub, not the richer real OpenHands implementation seen in attempt 1 stdout + - `apps/malu/test_lead_capture.py` assumes symbols that do not exist in that stub, yet the only validator for the backend stage was `py_compile apps/malu/lead_capture.py` +- Dispatch-level result: + - manager dispatch ended `failed` + - summary: `Manager plan stopped on mgrdispatch_bdef30eb128c-tests with blocked.` + - backend task completed + - tests task failed with `final_status = blocked` + - frontend task never ran +- Draft PR still was not created: + - promotion preflight downgraded to `patch` because: + - base repo was dirty + - approval was not granted +- Quality gate upgrade for landing-page backend tasks: + - changed manager product-surface backend validator from `py_compile apps/{slug}/lead_capture.py` to `pytest -q tests/apps/test_{slug}_landing_page.py` + - widened backend task scope just enough to allow the paired surface test file: + - backend allowed paths now include both `apps/{slug}/**` and `tests/apps/test_{slug}_landing_page.py` + - hardened the mock adapter so a pytest-targeted business surface no longer wins by only writing a trivial `assert True` test file: + - for `apps/** + tests/apps/test_*.py` it now synthesizes both a 
source file and its paired test file +- Verification for the gate upgrade: + - `PATH="$PWD/.venv/bin:$PATH" .venv/bin/pytest -q tests/test_manager_agent.py tests/test_mock_adapter.py tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py` + - result: `21 passed in 21.31s` +- Fresh live webhook rerun after the stricter pytest validator: + - dispatch id: `mgrdispatch_f1e5928d423d` + - backend run id: `mgrdispatch_f1e5928d423d-backend` +- What the rerun proved: + - manager emitted the stricter runtime contract correctly: + - allowed paths: `apps/malu/**`, `tests/apps/test_malu_landing_page.py` + - validator: `/Volumes/AI_LAB/Github/autonomous-agent-stack/.venv/bin/python -m pytest -q tests/apps/test_malu_landing_page.py` + - real OpenHands attempt 1 did much better than previous runs: + - it created: + - `apps/malu/__init__.py` + - `apps/malu/lead_capture.py` + - `tests/apps/test_malu_landing_page.py` + - `apps/malu/README.md` + - validator output in `summary.json`: + - `19 passed in 0.06s` + - despite passing pytest, the run still ended blocked at the policy layer: + - `driver_result.status = policy_blocked` + - `final_status = blocked` +- Exact blockers exposed by the stricter run: + 1. Out-of-scope runtime artifacts from the pytest execution: + - `.pytest_cache/**` + - `tests/apps/__pycache__/test_malu_landing_page.cpython-314.pyc` + 2. Extra business-surface file outside the narrow current contract: + - `apps/malu/README.md` + 3. 
Patch size exceeded the current manager limit: + - `patch_lines=581 limit=500` +- Dispatch-level outcome: + - dispatch status: `failed` + - summary: `Manager plan stopped on mgrdispatch_f1e5928d423d-backend with blocked.` + - tests/frontend stages never ran because backend was blocked first +- Best current reading after the stricter validator: + - real OpenHands can now generate both the backend code and a paired test suite that passes pytest inside the business surface + - the next bottleneck is no longer “logic missing” but “policy too narrow for the artifacts produced by a real pytest-backed business task” + - likely next fixes: + - filter pytest cache / `__pycache__` artifacts out of changed paths before policy validation + - decide whether `apps/{slug}/README.md` should be disallowed or silently pruned + - either tighten the prompt to discourage docs/test bloat or raise the patch limit for this task class + +## Clean-repo live rerun after benign-artifact filtering + +- Landed and locally committed the latest policy/runtime fixes on `codex/openhands-worker-strict`: + - commit: `8ebffa1 Harden business-surface promotion validation` +- This batch included: + - raising default/hard/manifest `max_patch_lines` to `2000` + - filtering benign test artifacts from policy/gate evaluation: + - `.pytest_cache/**` + - `**/__pycache__/**` + - `apps/*/README.md`-style business docs + - new regression coverage for runner/gate benign-artifact filtering +- Verification before the rerun: + - `21 passed in 7.08s` for `tests/test_agent_policy_merge.py tests/test_agent_runner_outcomes.py tests/test_git_promotion_gate.py` + - `21 passed in 23.29s` for `tests/test_manager_agent.py tests/test_mock_adapter.py tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py` +- Ran a clean-repo direct manager dispatch with explicit draft-PR approval: + - dispatch id: `mgrdispatch_d89f7b973d91` + - request used: + - `pipeline_target=draft_pr` + - `approval_granted=true` + - 
prompt = the full Malu `6g` concealer landing-page ask with lead capture + phone regex + masked logging +- What the live run proved: + - the repo-clean / approval-pregranted path is real; dispatch was accepted and backend entered live OpenHands + - real OpenHands did not immediately stall; it wrote substantial business-surface code into: + - `apps/malu/__init__.py` + - `apps/malu/api_router.py` + - `apps/malu/lead_capture.py` + - `tests/apps/test_malu_landing_page.py` + - it also produced `artifacts/promotion.patch` for the backend run +- Final backend outcome for `mgrdispatch_d89f7b973d91-backend`: + - `summary.final_status = blocked` + - `driver_result.status = policy_blocked` + - promotion never ran (`promotion_preflight = null`, `promotion = null`) +- The two concrete blockers exposed by this clean-repo rerun were both real code/scope issues, not environment: + 1. `builtin.allowed_paths` failed because the backend task still did not allow `tests/apps/__init__.py` + - changed paths included: + - `apps/malu/__init__.py` + - `apps/malu/api_router.py` + - `apps/malu/lead_capture.py` + - `tests/apps/__init__.py` + - `tests/apps/test_malu_landing_page.py` + - benign `__pycache__` output + 2. `worker.test_command` failed on generated FastAPI typing: + - `apps/malu/api_router.py` used `Dict[str, any]` + - FastAPI raised `Invalid args for response field!` +- Important interpretation: + - the old blockers are genuinely gone here: + - no repo-dirty downgrade + - no approval missing + - no `.pytest_cache` / `__pycache__` policy false positive + - no `python: command not found` + - the system is now blocked on business-code quality and a remaining scope omission (`tests/apps/__init__.py`), which is much closer to the real target problem. 
+ +## Scope fix + retry-feedback hardening + live webhook rerun + +- Tightened `product_landing_page` scope again so the business surface now explicitly includes: + - `apps/{slug}/**` + - `tests/apps/__init__.py` + - `tests/apps/test_{slug}_landing_page.py` +- Updated `ManagerAgentService` DAG bucketing so: + - backend tasks allow `tests/apps/__init__.py` in addition to the surface test file + - tests tasks also allow `tests/apps/__init__.py` +- Added regression coverage in `tests/test_manager_agent.py` to assert the Malu landing-page prompt now yields: + - backend allowed paths = `["apps/malu/**", "tests/apps/__init__.py", "tests/apps/test_malu_landing_page.py"]` + - tests allowed paths = `["tests/apps/__init__.py", "tests/apps/test_malu_landing_page.py"]` +- Hardened runner retry behavior so a real retry attempt can receive raw validator failures from the previous attempt: + - `run_job()` now rewrites `job.json` per attempt + - retry attempts append a `Retry feedback from the previous attempt...` block to `JobSpec.task` + - the injected block includes failed validator IDs/details and prior changed paths +- Added regression coverage in `tests/test_agent_runner_outcomes.py` proving a retry attempt can read the literal validator error text (`Invalid args for response field!`) from the updated `job.json`. 
+- Additional watchdog/heartbeat work was also committed: + - `3c6efe9 Treat adapter stdout as heartbeat signal` + - `ede643c Cover post-write stdout heartbeat stalling` +- Verification after the new scope + retry feedback changes: + - `12 passed in 10.89s` for `tests/test_manager_agent.py tests/test_agent_runner_outcomes.py` + - `18 passed in 29.33s` for `tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py tests/test_agent_policy_merge.py` + - `34 passed in 40.44s` for `tests/test_mock_adapter.py tests/test_openhands_worker.py tests/test_openhands_worker_strict_chain.py tests/test_manager_agent.py tests/test_agent_runner_outcomes.py tests/test_agent_policy_merge.py` +- New commits on `codex/openhands-worker-strict`: + - `085dfd0 Tighten landing-page scope and retry feedback` + - `3c6efe9 Treat adapter stdout as heartbeat signal` + - `ede643c Cover post-write stdout heartbeat stalling` +- Fresh live webhook rerun executed from a clean repo: + - restarted uvicorn on `127.0.0.1:8000` + - injected a real Telegram-style `/task` webhook payload with the Malu `6g` concealer brief + - dispatch id: `mgrdispatch_e44f9d77827f` +- What this rerun proved immediately: + - Manager runtime routing stayed correct: + - `selected_intent = product_landing_page` + - backend worker scope = `apps/malu/**`, `tests/apps/__init__.py`, `tests/apps/test_malu_landing_page.py` + - because the request came through Telegram `/task`, runtime metadata still had: + - `pipeline_target = draft_pr` + - `approval_granted = false` + - meaning this path still cannot directly create a Draft PR even if it later succeeds; it would need a separate approval-aware entrypoint or direct manager dispatch +- Live rerun failure mode shifted again: + - backend attempt 1 did not fail on scope, permissions, or validator setup + - stderr showed repeated GLM-side `RateLimitError` and one `InternalServerError` + - no business files were written into `apps/malu/` during the first attempt window +- 
The backend run eventually produced: + - `driver_result.json` with: + - `status = timed_out` + - `summary = adapter timed out after 420s` + - `first_state_heartbeat_ms = 3037` + - `first_scoped_write_ms = 417123` + - but `summary.json` was still missing at last inspection, so the manager dispatch remained externally visible as: + - `status = queued` + - backend task status = `running` +- Event log for `.masfactory_runtime/runs/mgrdispatch_e44f9d77827f-backend/events.ndjson` advanced to: + - `attempt_started` (attempt 1, `openhands`) + - `attempt_completed` with `driver_status = timed_out` + - `attempt_started` (attempt 2, `openhands`) +- This means the retry-feedback loop is now active in live execution as well: + - the backend runner did progress from attempt 1 into a second real OpenHands retry + - a separate inspected run `mgrdispatch_7896c9c92740-backend/job.json` confirmed the retry prompt literally contains: + - `Retry feedback from the previous attempt...` + - failed validator details such as `Invalid args for response field!` / `no tests ran...` +- New operational blocker exposed by this live rerun: + - the real OpenHands path can now be dominated by model-side rate limiting / internal service errors, and the manager control-plane status can lag behind the backend runner state + - after manual cleanup of lingering attempt-1 child processes, `events.ndjson` advanced, but `summary.json` still did not materialize immediately +- Best current reading at end of session: + - scope is now correct for business-surface landing-page tasks + - raw validator feedback is now wired into real retry attempts + - the next blocker is no longer prompt scope or Python typing alone, but live runtime robustness under GLM rate limiting plus a summary/control-plane lag after timeout transitions + +## 2026-03-30 - Runner stall refinement and latest live rerun + +- Tightened `src/autoresearch/executions/runner.py` around OpenHands progress detection: + - ai-lab env override 
sanitization remains in place + - stdout/stderr growth is now treated as a bounded warmup heartbeat instead of an unconditional forever-heartbeat + - after the first scoped write, logs no longer keep the adapter alive by themselves + - fast-fail probe behavior was preserved while refactoring the stall branch +- Fixed an additional mock fallback issue in `drivers/mock_adapter.sh`: + - removed a backslash-containing f-string expression that caused `SyntaxError: f-string expression part cannot include a backslash` +- Added/updated regression coverage in `tests/test_openhands_worker_strict_chain.py` and `tests/test_agent_runner_outcomes.py` for: + - stdout heartbeat before first write + - spinner-only heartbeat after first scoped write must stall + - log-only heartbeat with no scoped/state progress must be bounded + - ai-lab env override/preflight behavior +- Verification on local test suite: + - `16 passed` for focused runner/strict-chain tests + - `42 passed` for broader runner/launcher/mock/worker/manager/policy suite +- Live rerun on fresh uvicorn instance (`127.0.0.1:8012`) with webhook dispatch `mgrdispatch_d03f165f2216` showed: + - no `.openhands-state` files + - no writes under `apps/malu/` + - no generated `tests/apps/test_malu_landing_page.py` + - stderr repeatedly showed GLM-side `RateLimitError` + - `events.ndjson` still only had `attempt_started` + - `summary.json` still missing after adapter elapsed time exceeded 4 minutes +- Important interpretation: + - the old manifest/permission and mock-Python blockers are not what is stopping the current live run + - the current visible blocker is real model-side rate limiting, and there may still be a live-path discrepancy because the host-side runner did not yet surface a terminal `stalled_no_progress` / timeout summary despite zero scoped/state progress + +## 2026-03-30 - Timeout summary closure + Telegram approval-aware dispatch + +- Hardened `src/autoresearch/executions/runner.py` so `run_job()` now always 
persists `summary.json` in a final `finally` path. + - success path now builds `final_summary` and exits through a single summary writer + - unexpected runner exceptions are converted into a terminal `contract_error` summary instead of leaving the run without `summary.json` + - added `runner_exception` event emission for post-mortem visibility +- Added Telegram approval-aware dispatch in `src/autoresearch/api/routers/gateway_telegram.py`. + - `/task --approve <需求>` is now parsed explicitly + - only Telegram `owner` / `partner` identities can promote that flag into `approval_granted=True` + - non-admin senders can still use `/task --approve ...`, but it is downgraded to the normal approval flow and a warning message is sent + - help/usage text now documents `/task --approve <需求>` +- Added regression coverage: + - `tests/test_agent_runner_outcomes.py` + - verifies an unexpected runner crash still writes `summary.json` and records a `runner_exception` event + - `tests/test_gateway_telegram.py` + - verifies owner `/task --approve ...` propagates `approval_granted=True` + - verifies non-admin `/task --approve ...` cannot self-escalate +- Verification: + - `32 passed in 11.19s` for `tests/test_agent_runner_outcomes.py tests/test_gateway_telegram.py` + - `15 passed in 48.38s` for `tests/test_openhands_worker_strict_chain.py tests/test_agent_fallbacks.py tests/test_manager_agent.py` + - `PYTHONDONTWRITEBYTECODE=1 .venv/bin/python -m py_compile ...` passed for the touched files +- Scope note: + - this round intentionally did not rerun the full live Malu chaos flow + - retry semantics for `timed_out` were left unchanged; the fix here is summary/control-plane closure on terminal exceptions plus Telegram approval propagation + +## 2026-03-30 - Live `--approve` chaos rerun on clean repo + +- Prepared a clean repo for promotion preflight by: + - committing the runner + Telegram approval changes (`beb6047 Close runner summaries and add Telegram task approval`) + - stashing local 
note noise with `git stash push -u -m "codex/pre-live-chaos-20260330-104208" -- memory/2026-03-30.md .codex-worktrees` + - locally ignoring `.codex-worktrees/` through `.git/info/exclude` so nested worktrees do not keep the main repo dirty +- Restarted live uvicorn on `127.0.0.1:8000` and injected a real Telegram webhook payload: + - `/task --approve 为美妆品牌玛露开发一款 6g 罐装遮瑕膏单页。核心卖点:挑战游泳级别持妆、不脱妆、不用调色、遮瑕力强。要求:UI 采用优雅浅色背景,文案语气专业去工厂化。必须带留资接口及手机号正则校验,日志禁打明文手机号。` + - accepted dispatch id: `mgrdispatch_689935d29705` +- What the live rerun proved: + - Telegram approval propagation is real in runtime: + - dispatch metadata shows `approval_granted=true` + - approval source is `telegram_task_flag` + - pipeline target remains `draft_pr` + - routing is still correct: + - `selected_intent = product_landing_page` + - backend allowed paths are `apps/malu/**`, `tests/apps/__init__.py`, and `tests/apps/test_malu_landing_page.py` + - the run now reaches a terminal dispatch summary instead of leaving the manager stuck in `running` +- Final backend outcome for `mgrdispatch_689935d29705-backend`: + - attempt 1 (`openhands`) -> `timed_out` + - attempt 2 (`openhands`) -> `timed_out` + - attempt 3 (`mock`) -> `succeeded` + - final backend status -> `human_review` + - manager dispatch status -> `failed` + - manager summary -> `Manager plan stopped on mgrdispatch_689935d29705-backend with human_review.` +- Produced artifact: + - `.masfactory_runtime/runs/mgrdispatch_689935d29705-backend/artifacts/promotion.patch` + - patch touched: + - `apps/malu/lead_capture.py` + - `tests/apps/test_malu_landing_page.py` +- Why it still did not create a Draft PR: + - validation failed on `worker.test_command` + - failing assertions from `tests/apps/test_malu_landing_page.py`: + - `PhoneValidator.validate('13812345678') is True` + - `capture_lead('13812345678')['success'] is True` + - mock fallback produced a patch that looked plausible but still failed the business test +- Best reading after this rerun: + - 
`--approve` is now truly wired through Telegram -> Manager -> backend task metadata + - summary/control-plane closure works for this live path + - the current blocker is no longer environment, routing, or approval propagation; it is code quality under the real business validator + +## 2026-03-30 - Live `--approve` rerun with real OpenHands first draft but hung control-plane closure + +- Committed the current memory log before rerunning: + - `3ed0990 docs(memory): record Telegram --approve chaos run and validator failure` +- Restarted local uvicorn on `127.0.0.1:8000` and injected a fresh real Telegram webhook payload: + - `/task --approve 为美妆品牌玛露开发 6g 罐装遮瑕膏单页。核心卖点:游泳级持妆、不脱妆。要求:优雅浅色背景,去工厂化文案。必须带留资接口(带手机号正则校验),禁止日志明文。` + - accepted dispatch id: `mgrdispatch_d78d6737cab9` +- Runtime facts for `mgrdispatch_d78d6737cab9`: + - Manager selected `product_landing_page` + - backend allowed paths were correctly scoped to: + - `apps/malu/**` + - `tests/apps/__init__.py` + - `tests/apps/test_malu_landing_page.py` + - backend metadata carried `approval_granted=true` with `approval_source=telegram_task_flag` +- Real OpenHands actually wrote first-draft business files under the isolated business surface: + - `.masfactory_runtime/runs/mgrdispatch_d78d6737cab9-backend/workspace/apps/malu/__init__.py` + - `.masfactory_runtime/runs/mgrdispatch_d78d6737cab9-backend/workspace/apps/malu/lead_capture.py` + - `.masfactory_runtime/runs/mgrdispatch_d78d6737cab9-backend/workspace/tests/apps/__init__.py` + - `.masfactory_runtime/runs/mgrdispatch_d78d6737cab9-backend/workspace/tests/apps/test_malu_landing_page.py` +- Important code-quality observation: + - unlike the previous mock fallback bug, real OpenHands wrote the phone regex correctly this time: + - `re.compile(r'^1[3-9]\d{9}$')` + - the first draft is materially better than the mock-generated stub and includes real validation, hashing, masking, and test cases +- New blocker exposed by this rerun: + - control-plane closure failed again on 
the live path + - `events.ndjson` only recorded `attempt_started` + - neither `driver_result.json` nor `summary.json` was written + - `stdout.log` kept printing only `Agent is working` + - `stderr.log` stopped after container creation and never surfaced a terminal reason +- Operational action taken: + - after the run exceeded the expected window and still had no terminal artifacts, the live adapter / ai-lab / uvicorn processes were manually killed to avoid leaving a zombie run behind +- Best reading: + - approval propagation, routing, strict business scoping, and real OpenHands first writes are all working + - the latest blocker is not the old bad regex and not mock fallback; it is a remaining live-path hang where a long-running real OpenHands attempt can still fail to emit terminal `driver_result.json` / `summary.json` + +## 2026-03-30 - Linux remote worker prep + +- Added a Linux-specific doctor profile in `scripts/doctor.py`. +- New `--profile linux-remote` checks: + - warns when `OPENHANDS_RUNTIME` is not `host` + - warns when `DOCKER_HOST` still points at Mac/Colima paths + - verifies `.masfactory_runtime`, `artifacts`, and `logs` are writable + - warns when `gh` or `tmux` are missing +- Added `make doctor-linux` to `Makefile`. +- Added `docs/linux-remote-worker.md` to document the recommended topology: + - Mac as control plane + - Linux as execution plane + - Linux starts with `OPENHANDS_RUNTIME=host` +- Updated `README.md` and `.env.example` so Linux bring-up points at `make doctor-linux` and `OPENHANDS_RUNTIME=host` instead of defaulting people into the Mac/Colima path. +- Validation: + - `tests/test_doctor_linux.py` -> `3 passed` + - `make doctor-linux` -> `READY` on current host, with expected warnings because this machine is Darwin and `OPENHANDS_RUNTIME` was not set + +## 2026-03-30 - Runner hang / zombie hardening + +- Tightened `src/autoresearch/executions/runner.py` so `stdout` / `stderr` noise no longer counts as runtime heartbeat at all. 
+- Effective progress is now limited to: + - scoped writes under `allowed_paths` + - `.openhands-state` file changes +- Strengthened process termination: + - capture adapter process group id after `start_new_session=True` + - send `SIGTERM` to the whole group first + - escalate to `SIGKILL` + - fall back to direct `process.kill()` if the group still does not exit +- This keeps the existing single-exit `summary.json` closure path intact while removing the old “spinner log keeps the run alive” loophole. +- Added / updated regression coverage in `tests/test_openhands_worker_strict_chain.py`: + - `Agent is working` spinner noise does not reset stall timers + - varying stdout heartbeat logs do not keep a pre-write adapter alive + - state-file heartbeats can keep the adapter alive before a real scoped write + - after a real write, pure log spam no longer prevents `stalled_no_progress` + - a spawned child process that keeps printing invalid noise is killed with the adapter process group, and `summary.json` still lands +- Validation: + - `tests/test_openhands_worker_strict_chain.py -k 'stall_watchdog or heartbeat or agent_working_noise or process_group'` -> `4 passed` + - `tests/test_agent_runner_outcomes.py` -> `8 passed` + +## 2026-03-30 - Live watchdog verification after runner hardening + +- Restarted local uvicorn on `127.0.0.1:8000` with the `885d94b` runner fix live and injected the same Telegram `--approve` Malu prompt again. +- Accepted dispatch id: `mgrdispatch_4d1c3a0edf64`. +- The Manager/control-plane routing was correct: + - `selected_intent=product_landing_page` + - backend scope: + - `apps/malu/**` + - `tests/apps/__init__.py` + - `tests/apps/test_malu_landing_page.py` + - `approval_granted=true` from `telegram_task_flag` +- Real OpenHands attempt 1 no longer kept the run alive with repeated `Agent is working` spinner noise. 
+- Terminal event sequence for `.masfactory_runtime/runs/mgrdispatch_4d1c3a0edf64-backend/events.ndjson`: + - `attempt_started` attempt 1 `openhands` + - `attempt_completed` attempt 1 `openhands` -> `stalled_no_progress` + - `fallback_skipped` retry skipped because the real attempt made zero valid progress + - `attempt_started` attempt 2 `mock` + - `attempt_completed` attempt 2 `mock` -> `succeeded` +- Crucial verification result: + - `.masfactory_runtime/runs/mgrdispatch_4d1c3a0edf64-backend/summary.json` was written + - Manager dispatch closed cleanly as `failed` + - summary text: `Manager plan stopped on mgrdispatch_4d1c3a0edf64-backend with human_review.` +- This confirms the runner hardening worked on the real live path: + - spinner/stdout noise no longer masks a stalled run + - the control plane gets a terminal artifact instead of a zombie `running` dispatch +- Remaining blocker from this run is business-code quality only: + - mock fallback still produced the bad phone validator regex + - `worker.test_command` failed in `tests/apps/test_malu_landing_page.py` diff --git a/memory/SOP/MASFactory_Strict_Execution_v1.md b/memory/SOP/MASFactory_Strict_Execution_v1.md new file mode 100644 index 00000000..7d199714 --- /dev/null +++ b/memory/SOP/MASFactory_Strict_Execution_v1.md @@ -0,0 +1,78 @@ +# MASFactory Strict Execution v1 + +This SOP is the short operational checklist for bounded autonomous work in this repository. + +The canonical system picture lives in `/Volumes/AI_LAB/Github/autonomous-agent-stack/ARCHITECTURE.md`. If this SOP and the architecture doc ever disagree, follow `ARCHITECTURE.md` and update this file. + +## Purpose + +Keep autonomous execution useful without letting workers bypass the control plane. + +## Mandatory Defaults + +- Default output mode is patch-only. +- Workers may edit only explicitly allowed repo-relative paths. 
+- Forbidden paths always include `.git/`, `logs/`, `.masfactory_runtime/`, `memory/`, and secret material such as `*.key` or `*.pem`. +- Git branch creation, commit, push, merge, rebase, reset, and checkout are not worker actions. +- Promotion is owned by the promotion gate, never by the worker. + +## Preflight + +Before running OpenHands or any AEP worker: + +1. Confirm the task is bounded and can be expressed as a small patch. +2. Provide `allowed_paths`, `forbidden_paths`, and at least one validation command. +3. Prefer adding or updating a direct regression test alongside the source edit. +4. If the execution path is `OpenHandsControlledBackendService`, require a clean repo root first. + +## Execution Rules + +- Work inside isolated workspaces only. +- Treat the main repo checkout as the baseline, not the mutable execution target. +- Never widen scope from inside the worker prompt. +- If a required file is outside scope, fail and request a new contract instead of reaching around the boundary. + +## Promotion Rules + +- The worker emits a patch candidate. +- Validation runs before promotion. +- `GitPromotionGateService` re-checks scope, runtime artifacts, binary changes, changed file count, patch size, writer lease, and Draft PR prerequisites. +- Draft PR mode requires explicit approval plus remote and credential checks. +- If Draft PR preconditions fail but patch checks pass, degrade to patch mode instead of escalating. + +## Single Writer Rule + +Mutable control-plane actions must hold a `WriterLease`. + +This applies to: + +- git promotion finalization, +- skill promotion from `cold_validated` to `promoted`, +- and any future path that upgrades shared mutable state. + +If the lease is unavailable, block the action. Do not race another writer. 
+ +## Managed Skill Trust Ladder + +Managed skills follow: + +`pending -> quarantined -> cold_validated -> promoted` + +Meaning: + +- `quarantined`: copied out of the untrusted source into holding +- `cold_validated`: static and contract checks passed +- `promoted`: copied into the active runtime root + +Do not skip `quarantined` or `cold_validated`. + +## Physical Runtime Reminder + +Current stable environment: + +- repo checkout on `/Volumes/AI_LAB/Github/autonomous-agent-stack` +- ai-lab writable roots on `/Volumes/AI_LAB/ai_lab` +- Docker runtime through Colima +- isolated execution and promotion worktrees rooted outside the main checkout + +The architecture depends on that separation. Do not silently collapse everything back into one writable repo. diff --git a/migration/openclaw/scripts/start-telegram-poller.sh b/migration/openclaw/scripts/start-telegram-poller.sh index b1bc74ca..f98bbfc2 100644 --- a/migration/openclaw/scripts/start-telegram-poller.sh +++ b/migration/openclaw/scripts/start-telegram-poller.sh @@ -5,7 +5,6 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" PROJECT_ROOT="$(cd "${ROOT_DIR}/../.." && pwd)" PID_FILE="${ROOT_DIR}/logs/telegram-poller.pid" LOG_FILE="${ROOT_DIR}/logs/telegram-poller.log" -ENV_FILE="${ROOT_DIR}/.env.local" SCRIPT="${ROOT_DIR}/scripts/telegram_poller_bridge.py" mkdir -p "${ROOT_DIR}/logs" @@ -24,22 +23,37 @@ if [[ ! 
-x "${PROJECT_ROOT}/.venv/bin/python" ]]; then exit 1 fi -if [[ -f "${ENV_FILE}" ]]; then - set -a - source "${ENV_FILE}" - set +a +for ENV_FILE in "${PROJECT_ROOT}/.env" "${PROJECT_ROOT}/.env.local" "${ROOT_DIR}/.env.local"; do + if [[ -f "${ENV_FILE}" ]]; then + set -a + source "${ENV_FILE}" + set +a + fi +done + +if [[ -n "${AUTORESEARCH_TELEGRAM_BOT_TOKEN:-}" && -z "${TELEGRAM_BOT_TOKEN:-}" ]]; then + export TELEGRAM_BOT_TOKEN="${AUTORESEARCH_TELEGRAM_BOT_TOKEN}" +fi +if [[ -n "${TELEGRAM_BOT_TOKEN:-}" && -z "${AUTORESEARCH_TELEGRAM_BOT_TOKEN:-}" ]]; then + export AUTORESEARCH_TELEGRAM_BOT_TOKEN="${TELEGRAM_BOT_TOKEN}" fi API_HOST="${AUTORESEARCH_API_HOST:-127.0.0.1}" -API_PORT="${AUTORESEARCH_API_PORT:-8001}" +API_PORT="${AUTORESEARCH_API_PORT:-8000}" if ! curl -fsS "http://${API_HOST}:${API_PORT}/health" >/dev/null 2>&1; then echo "api not healthy on http://${API_HOST}:${API_PORT}/health" echo "run: make start (or start API daemon with the same port)" exit 1 fi +export TELEGRAM_BRIDGE_LOCAL_WEBHOOK_URL="${TELEGRAM_BRIDGE_LOCAL_WEBHOOK_URL:-http://${API_HOST}:${API_PORT}/api/v1/gateway/telegram/webhook}" + cd "${PROJECT_ROOT}" -nohup "${PROJECT_ROOT}/.venv/bin/python" "${SCRIPT}" >> "${LOG_FILE}" 2>&1 & +if command -v setsid >/dev/null 2>&1; then + setsid "${PROJECT_ROOT}/.venv/bin/python" "${SCRIPT}" > "${LOG_FILE}" 2>&1 & +else + nohup "${PROJECT_ROOT}/.venv/bin/python" "${SCRIPT}" > "${LOG_FILE}" 2>&1 & +fi PID=$! 
echo "${PID}" > "${PID_FILE}" diff --git a/migration/openclaw/scripts/telegram_poller_bridge.py b/migration/openclaw/scripts/telegram_poller_bridge.py index 26e9a3fd..c2ebbab0 100644 --- a/migration/openclaw/scripts/telegram_poller_bridge.py +++ b/migration/openclaw/scripts/telegram_poller_bridge.py @@ -11,9 +11,14 @@ ROOT_DIR = Path(__file__).resolve().parents[1] +PROJECT_ROOT = ROOT_DIR.parent.parent LOG_DIR = ROOT_DIR / "logs" OFFSET_FILE = LOG_DIR / "telegram-poller.offset" -ENV_FILE = ROOT_DIR / ".env.local" +ENV_FILES = ( + PROJECT_ROOT / ".env", + PROJECT_ROOT / ".env.local", + ROOT_DIR / ".env.local", +) OPENCLAW_CONFIG = Path("/Users/iCloud_GZ/.openclaw/openclaw.json") @@ -127,13 +132,21 @@ def forward_update(update: dict[str, Any], webhook_url: str, secret_token: str | def main() -> int: - load_env_file(ENV_FILE) + for env_file in ENV_FILES: + load_env_file(env_file) - bot_token = normalize_token(os.getenv("TELEGRAM_BOT_TOKEN")) + bot_token = normalize_token( + os.getenv("AUTORESEARCH_TELEGRAM_BOT_TOKEN") + or os.getenv("TELEGRAM_BOT_TOKEN") + ) if bot_token is None: bot_token = normalize_token(read_openclaw_bot_token()) + if bot_token and not os.getenv("TELEGRAM_BOT_TOKEN"): + os.environ["TELEGRAM_BOT_TOKEN"] = bot_token + if bot_token and not os.getenv("AUTORESEARCH_TELEGRAM_BOT_TOKEN"): + os.environ["AUTORESEARCH_TELEGRAM_BOT_TOKEN"] = bot_token if not bot_token: - log("[poller] missing TELEGRAM_BOT_TOKEN and cannot read ~/.openclaw/openclaw.json") + log("[poller] missing TELEGRAM_BOT_TOKEN/AUTORESEARCH_TELEGRAM_BOT_TOKEN and cannot read ~/.openclaw/openclaw.json") return 2 webhook_url = os.getenv( diff --git a/pyproject.toml b/pyproject.toml index ba81aad8..82a87f28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ name = "autonomous-agent-stack" version = "0.1.0" description = "Modern multi-agent orchestration framework powered by MASFactory" -requires-python = ">=3.10" +requires-python = ">=3.11" [build-system] requires = 
["hatchling"] @@ -10,11 +10,11 @@ build-backend = "hatchling.build" [tool.ruff] line-length = 100 -target-version = "py310" +target-version = "py311" [tool.black] line-length = 100 -target-version = ["py310"] +target-version = ["py311"] [tool.pytest.ini_options] asyncio_mode = "auto" diff --git a/sandbox/ai-lab/Dockerfile b/sandbox/ai-lab/Dockerfile index e3e799b1..c6f74f23 100644 --- a/sandbox/ai-lab/Dockerfile +++ b/sandbox/ai-lab/Dockerfile @@ -1,5 +1,7 @@ FROM python:3.12-bookworm +ARG OPENHANDS_CLI_SPEC=openhands==1.5.0 + ENV PATH="/root/.local/bin:/usr/local/bin:${PATH}" COPY requirements.txt /tmp/requirements.txt @@ -10,7 +12,6 @@ RUN apt-get update && apt-get install -y \ curl \ git \ jq \ - software-properties-common \ && rm -rf /var/lib/apt/lists/* # 2. 安装 GitHub CLI (gh) —— 龙虾的灵魂扳手 @@ -20,10 +21,10 @@ RUN curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | d && apt-get update && apt-get install -y gh \ && rm -rf /var/lib/apt/lists/* -# 3. 安装 uv 和 OpenHands CLI,供外层 launch_ai_lab.sh 直接调用 +# 3. 
安装 uv 和锁定版 OpenHands CLI,保持与宿主机已验证版本一致 RUN curl --retry 5 --retry-all-errors -LsSf https://astral.sh/uv/install.sh | sh \ && ln -sf /root/.local/bin/uv /usr/local/bin/uv \ - && /usr/local/bin/uv tool install openhands --python 3.12 \ + && /usr/local/bin/uv tool install "${OPENHANDS_CLI_SPEC}" --python 3.12 \ && ln -sf /root/.local/bin/openhands /usr/local/bin/openhands \ && ln -sf /root/.local/bin/openhands-acp /usr/local/bin/openhands-acp diff --git a/sandbox/ai-lab/docker-compose.yml b/sandbox/ai-lab/docker-compose.yml index 01bdc76e..92cb479b 100644 --- a/sandbox/ai-lab/docker-compose.yml +++ b/sandbox/ai-lab/docker-compose.yml @@ -11,10 +11,12 @@ services: - ../../logs:/workspace/logs:rw - ${WORKSPACE_DIR}/worktrees:/opt/worktrees:rw - ${CACHE_DIR}:/root/.cache:rw - - ${LOG_DIR}/openhands-home:/root/.openhands:rw + - ${OPENHANDS_HOME_DIR:-${LOG_DIR}/openhands-home}:/root/.openhands:rw + - ${DOCKER_HOST_SOCKET_DIR:-/var/run}:${DOCKER_HOST_MOUNT_DIR:-/var/run/host-docker}:rw tmpfs: - /tmp:size=2g,mode=1777 environment: + - DOCKER_HOST=${DOCKER_HOST_IN_CONTAINER:-unix:///var/run/host-docker/docker.sock} - HOME=/root - PYTHONDONTWRITEBYTECODE=1 - PYTHONUNBUFFERED=1 diff --git a/schemas/run_summary.schema.json b/schemas/run_summary.schema.json new file mode 100644 index 00000000..5ad893ee --- /dev/null +++ b/schemas/run_summary.schema.json @@ -0,0 +1,845 @@ +{ + "$defs": { + "ArtifactRef": { + "additionalProperties": false, + "properties": { + "kind": { + "enum": [ + "log", + "report", + "plan", + "patch", + "compliance", + "custom" + ], + "title": "Kind", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "sha256": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Sha256" + }, + "uri": { + "title": "Uri", + "type": "string" + } + }, + "required": [ + "name", + "kind", + "uri" + ], + "title": "ArtifactRef", + "type": "object" + }, + "DispatchLane": { + "enum": [ + "local", + "remote" 
+ ], + "title": "DispatchLane", + "type": "string" + }, + "DriverMetrics": { + "additionalProperties": false, + "properties": { + "commands": { + "default": 0, + "title": "Commands", + "type": "integer" + }, + "completion_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Completion Tokens" + }, + "duration_ms": { + "default": 0, + "title": "Duration Ms", + "type": "integer" + }, + "first_progress_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "First Progress Ms" + }, + "first_scoped_write_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "First Scoped Write Ms" + }, + "first_state_heartbeat_ms": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "First State Heartbeat Ms" + }, + "prompt_tokens": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Prompt Tokens" + }, + "steps": { + "default": 0, + "title": "Steps", + "type": "integer" + } + }, + "title": "DriverMetrics", + "type": "object" + }, + "DriverResult": { + "additionalProperties": false, + "properties": { + "agent_id": { + "title": "Agent Id", + "type": "string" + }, + "attempt": { + "default": 1, + "title": "Attempt", + "type": "integer" + }, + "changed_paths": { + "items": { + "type": "string" + }, + "title": "Changed Paths", + "type": "array" + }, + "error": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Error" + }, + "metrics": { + "$ref": "#/$defs/DriverMetrics" + }, + "output_artifacts": { + "items": { + "$ref": "#/$defs/ArtifactRef" + }, + "title": "Output Artifacts", + "type": "array" + }, + "protocol_version": { + "const": "aep/v0", + "default": "aep/v0", + "title": "Protocol Version", + "type": "string" + }, + "recommended_action": { + "default": 
"human_review", + "enum": [ + "promote", + "retry", + "fallback", + "human_review", + "reject" + ], + "title": "Recommended Action", + "type": "string" + }, + "run_id": { + "title": "Run Id", + "type": "string" + }, + "status": { + "enum": [ + "succeeded", + "partial", + "failed", + "timed_out", + "stalled_no_progress", + "policy_blocked", + "contract_error" + ], + "title": "Status", + "type": "string" + }, + "summary": { + "title": "Summary", + "type": "string" + } + }, + "required": [ + "run_id", + "agent_id", + "status", + "summary" + ], + "title": "DriverResult", + "type": "object" + }, + "FailureClass": { + "enum": [ + "planner_stalled", + "executor_stalled", + "tool_timeout", + "model_fallback", + "assertion_failed_after_fallback", + "env_missing", + "workspace_dirty", + "transient_network", + "unknown" + ], + "title": "FailureClass", + "type": "string" + }, + "GitPromotionMode": { + "enum": [ + "patch", + "draft_pr" + ], + "title": "GitPromotionMode", + "type": "string" + }, + "GitRemoteProbe": { + "additionalProperties": false, + "properties": { + "base_branch_exists": { + "default": false, + "title": "Base Branch Exists", + "type": "boolean" + }, + "credentials_available": { + "default": false, + "title": "Credentials Available", + "type": "boolean" + }, + "healthy": { + "default": false, + "title": "Healthy", + "type": "boolean" + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reason" + }, + "remote_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Remote Name" + }, + "remote_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Remote Url" + } + }, + "title": "GitRemoteProbe", + "type": "object" + }, + "PromotionDiffStats": { + "additionalProperties": false, + "properties": { + "deletions": { + "default": 0, + "title": "Deletions", + "type": "integer" + }, + 
"files_changed": { + "default": 0, + "title": "Files Changed", + "type": "integer" + }, + "insertions": { + "default": 0, + "title": "Insertions", + "type": "integer" + }, + "patch_lines": { + "default": 0, + "title": "Patch Lines", + "type": "integer" + } + }, + "title": "PromotionDiffStats", + "type": "object" + }, + "PromotionGateCheck": { + "additionalProperties": false, + "properties": { + "detail": { + "default": "", + "title": "Detail", + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "passed": { + "title": "Passed", + "type": "boolean" + } + }, + "required": [ + "id", + "passed" + ], + "title": "PromotionGateCheck", + "type": "object" + }, + "PromotionPreflight": { + "additionalProperties": false, + "properties": { + "allowed": { + "default": false, + "title": "Allowed", + "type": "boolean" + }, + "checks": { + "items": { + "$ref": "#/$defs/PromotionGateCheck" + }, + "title": "Checks", + "type": "array" + }, + "effective_mode": { + "anyOf": [ + { + "$ref": "#/$defs/GitPromotionMode" + }, + { + "type": "null" + } + ], + "default": null + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reason" + }, + "remote_probe": { + "$ref": "#/$defs/GitRemoteProbe" + }, + "requested_mode": { + "$ref": "#/$defs/GitPromotionMode" + }, + "run_id": { + "title": "Run Id", + "type": "string" + } + }, + "required": [ + "run_id", + "requested_mode" + ], + "title": "PromotionPreflight", + "type": "object" + }, + "PromotionResult": { + "additionalProperties": false, + "properties": { + "base_ref": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Base Ref" + }, + "branch_name": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Branch Name" + }, + "changed_files": { + "items": { + "type": "string" + }, + "title": "Changed Files", + "type": "array" + }, + "checks": { + "items": { 
+ "$ref": "#/$defs/PromotionGateCheck" + }, + "title": "Checks", + "type": "array" + }, + "commit_sha": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Commit Sha" + }, + "created_at": { + "format": "date-time", + "title": "Created At", + "type": "string" + }, + "diff_stats": { + "$ref": "#/$defs/PromotionDiffStats" + }, + "finalized_by": { + "default": "aggregator", + "title": "Finalized By", + "type": "string" + }, + "metadata": { + "additionalProperties": true, + "title": "Metadata", + "type": "object" + }, + "mode": { + "anyOf": [ + { + "$ref": "#/$defs/GitPromotionMode" + }, + { + "type": "null" + } + ], + "default": null + }, + "patch_uri": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Patch Uri" + }, + "pr_url": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Pr Url" + }, + "reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Reason" + }, + "run_id": { + "title": "Run Id", + "type": "string" + }, + "success": { + "default": false, + "title": "Success", + "type": "boolean" + }, + "target_base_branch": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Target Base Branch" + }, + "updated_at": { + "format": "date-time", + "title": "Updated At", + "type": "string" + } + }, + "required": [ + "run_id", + "created_at", + "updated_at" + ], + "title": "PromotionResult", + "type": "object" + }, + "RecoveryAction": { + "enum": [ + "retry", + "abort", + "require_human_review", + "downgrade_to_draft", + "quarantine" + ], + "title": "RecoveryAction", + "type": "string" + }, + "RemoteRunStatus": { + "enum": [ + "queued", + "running", + "succeeded", + "failed", + "stalled", + "timed_out" + ], + "title": "RemoteRunStatus", + "type": "string" + }, + "RunSummary": { + "additionalProperties": false, 
+ "properties": { + "driver_result": { + "$ref": "#/$defs/DriverResult" + }, + "final_status": { + "enum": [ + "ready_for_promotion", + "blocked", + "failed", + "promoted", + "human_review" + ], + "title": "Final Status", + "type": "string" + }, + "promotion": { + "anyOf": [ + { + "$ref": "#/$defs/PromotionResult" + }, + { + "type": "null" + } + ], + "default": null + }, + "promotion_patch_uri": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Promotion Patch Uri" + }, + "promotion_preflight": { + "anyOf": [ + { + "$ref": "#/$defs/PromotionPreflight" + }, + { + "type": "null" + } + ], + "default": null + }, + "run_id": { + "title": "Run Id", + "type": "string" + }, + "validation": { + "$ref": "#/$defs/ValidationReport" + } + }, + "required": [ + "run_id", + "final_status", + "driver_result", + "validation" + ], + "title": "RunSummary", + "type": "object" + }, + "ValidationCheck": { + "additionalProperties": false, + "properties": { + "artifact": { + "anyOf": [ + { + "$ref": "#/$defs/ArtifactRef" + }, + { + "type": "null" + } + ], + "default": null + }, + "detail": { + "default": "", + "title": "Detail", + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "passed": { + "title": "Passed", + "type": "boolean" + } + }, + "required": [ + "id", + "passed" + ], + "title": "ValidationCheck", + "type": "object" + }, + "ValidationReport": { + "additionalProperties": false, + "properties": { + "checks": { + "items": { + "$ref": "#/$defs/ValidationCheck" + }, + "title": "Checks", + "type": "array" + }, + "passed": { + "title": "Passed", + "type": "boolean" + }, + "run_id": { + "title": "Run Id", + "type": "string" + } + }, + "required": [ + "run_id", + "passed" + ], + "title": "ValidationReport", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "artifact_paths": { + "additionalProperties": { + "type": "string" + }, + "title": "Artifact Paths", + "type": "object" + }, 
+ "failure_class": { + "anyOf": [ + { + "$ref": "#/$defs/FailureClass" + }, + { + "type": "null" + } + ], + "default": null + }, + "fallback_reason": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Fallback Reason" + }, + "finished_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Finished At" + }, + "lane": { + "$ref": "#/$defs/DispatchLane", + "default": "local" + }, + "metadata": { + "additionalProperties": true, + "title": "Metadata", + "type": "object" + }, + "protocol_version": { + "const": "remote-run/v1", + "default": "remote-run/v1", + "title": "Protocol Version", + "type": "string" + }, + "recovery_action": { + "anyOf": [ + { + "$ref": "#/$defs/RecoveryAction" + }, + { + "type": "null" + } + ], + "default": null + }, + "requested_lane": { + "$ref": "#/$defs/DispatchLane", + "default": "local" + }, + "run_id": { + "minLength": 1, + "title": "Run Id", + "type": "string" + }, + "run_summary": { + "anyOf": [ + { + "$ref": "#/$defs/RunSummary" + }, + { + "type": "null" + } + ], + "default": null + }, + "started_at": { + "anyOf": [ + { + "format": "date-time", + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Started At" + }, + "status": { + "$ref": "#/$defs/RemoteRunStatus", + "default": "queued" + }, + "summary": { + "default": "", + "title": "Summary", + "type": "string" + }, + "updated_at": { + "format": "date-time", + "title": "Updated At", + "type": "string" + } + }, + "required": [ + "run_id" + ], + "title": "RemoteRunSummary", + "type": "object" +} diff --git a/schemas/task_run.schema.json b/schemas/task_run.schema.json new file mode 100644 index 00000000..4b3902b4 --- /dev/null +++ b/schemas/task_run.schema.json @@ -0,0 +1,378 @@ +{ + "$defs": { + "ArtifactRef": { + "additionalProperties": false, + "properties": { + "kind": { + "enum": [ + "log", + "report", + "plan", + "patch", + 
"compliance", + "custom" + ], + "title": "Kind", + "type": "string" + }, + "name": { + "title": "Name", + "type": "string" + }, + "sha256": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Sha256" + }, + "uri": { + "title": "Uri", + "type": "string" + } + }, + "required": [ + "name", + "kind", + "uri" + ], + "title": "ArtifactRef", + "type": "object" + }, + "DispatchLane": { + "enum": [ + "local", + "remote" + ], + "title": "DispatchLane", + "type": "string" + }, + "ExecutionPolicy": { + "additionalProperties": false, + "properties": { + "allow_binary_changes": { + "default": false, + "title": "Allow Binary Changes", + "type": "boolean" + }, + "allowed_paths": { + "items": { + "type": "string" + }, + "title": "Allowed Paths", + "type": "array" + }, + "cleanup_on_success": { + "default": true, + "title": "Cleanup On Success", + "type": "boolean" + }, + "forbidden_paths": { + "items": { + "type": "string" + }, + "title": "Forbidden Paths", + "type": "array" + }, + "max_changed_files": { + "default": 20, + "maximum": 1000, + "minimum": 0, + "title": "Max Changed Files", + "type": "integer" + }, + "max_patch_lines": { + "default": 2000, + "maximum": 100000, + "minimum": 0, + "title": "Max Patch Lines", + "type": "integer" + }, + "max_steps": { + "default": 1, + "maximum": 20, + "minimum": 1, + "title": "Max Steps", + "type": "integer" + }, + "network": { + "default": "disabled", + "enum": [ + "disabled", + "allowlist", + "full" + ], + "title": "Network", + "type": "string" + }, + "network_allowlist": { + "items": { + "type": "string" + }, + "title": "Network Allowlist", + "type": "array" + }, + "retain_workspace_on_failure": { + "default": true, + "title": "Retain Workspace On Failure", + "type": "boolean" + }, + "timeout_sec": { + "default": 900, + "maximum": 7200, + "minimum": 1, + "title": "Timeout Sec", + "type": "integer" + }, + "tool_allowlist": { + "items": { + "type": "string" + }, + "title": "Tool 
Allowlist", + "type": "array" + } + }, + "title": "ExecutionPolicy", + "type": "object" + }, + "FallbackStep": { + "additionalProperties": false, + "properties": { + "action": { + "enum": [ + "retry", + "fallback_agent", + "human_review", + "reject" + ], + "title": "Action", + "type": "string" + }, + "agent_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Agent Id" + }, + "max_attempts": { + "default": 1, + "maximum": 20, + "minimum": 1, + "title": "Max Attempts", + "type": "integer" + } + }, + "required": [ + "action" + ], + "title": "FallbackStep", + "type": "object" + }, + "JobSpec": { + "additionalProperties": false, + "properties": { + "agent_id": { + "title": "Agent Id", + "type": "string" + }, + "fallback": { + "items": { + "$ref": "#/$defs/FallbackStep" + }, + "title": "Fallback", + "type": "array" + }, + "input_artifacts": { + "items": { + "$ref": "#/$defs/ArtifactRef" + }, + "title": "Input Artifacts", + "type": "array" + }, + "metadata": { + "additionalProperties": true, + "title": "Metadata", + "type": "object" + }, + "mode": { + "default": "patch_only", + "enum": [ + "plan_only", + "patch_only", + "apply_in_workspace", + "review_only" + ], + "title": "Mode", + "type": "string" + }, + "parent_run_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Parent Run Id" + }, + "policy": { + "$ref": "#/$defs/ExecutionPolicy" + }, + "protocol_version": { + "const": "aep/v0", + "default": "aep/v0", + "title": "Protocol Version", + "type": "string" + }, + "role": { + "default": "executor", + "enum": [ + "planner", + "executor", + "reviewer", + "analyst" + ], + "title": "Role", + "type": "string" + }, + "run_id": { + "title": "Run Id", + "type": "string" + }, + "task": { + "title": "Task", + "type": "string" + }, + "validators": { + "items": { + "$ref": "#/$defs/ValidatorSpec" + }, + "title": "Validators", + "type": "array" + } + }, + "required": [ + 
"run_id", + "agent_id", + "task" + ], + "title": "JobSpec", + "type": "object" + }, + "ValidatorSpec": { + "additionalProperties": false, + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Command" + }, + "id": { + "title": "Id", + "type": "string" + }, + "kind": { + "enum": [ + "builtin", + "command", + "human" + ], + "title": "Kind", + "type": "string" + } + }, + "required": [ + "id", + "kind" + ], + "title": "ValidatorSpec", + "type": "object" + } + }, + "additionalProperties": false, + "properties": { + "job": { + "$ref": "#/$defs/JobSpec" + }, + "lane": { + "$ref": "#/$defs/DispatchLane", + "default": "local" + }, + "metadata": { + "additionalProperties": true, + "title": "Metadata", + "type": "object" + }, + "planner_candidate_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Planner Candidate Id" + }, + "planner_plan_id": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "null" + } + ], + "default": null, + "title": "Planner Plan Id" + }, + "protocol_version": { + "const": "remote-run/v1", + "default": "remote-run/v1", + "title": "Protocol Version", + "type": "string" + }, + "requested_lane": { + "$ref": "#/$defs/DispatchLane", + "default": "local" + }, + "run_id": { + "minLength": 1, + "title": "Run Id", + "type": "string" + }, + "runtime_mode": { + "default": "day", + "minLength": 1, + "title": "Runtime Mode", + "type": "string" + } + }, + "required": [ + "run_id", + "job" + ], + "title": "RemoteTaskSpec", + "type": "object" +} diff --git a/scripts/doctor.py b/scripts/doctor.py index e4038a94..efd9bd5c 100644 --- a/scripts/doctor.py +++ b/scripts/doctor.py @@ -30,6 +30,7 @@ "workflow.workflow_engine", ) OPTIONAL_COMMANDS = ("git", "curl", "lsof") +LINUX_REMOTE_OPTIONAL_COMMANDS = ("gh", "tmux") @dataclass(frozen=True) @@ -55,11 +56,17 @@ def _fail(name: str, detail: str, hint: str | None = None) -> 
def _check_python() -> CheckResult:
    """Verify that the running interpreter is at least ``REQUIRED_PYTHON``.

    Returns:
        An OK result showing the detected version next to the project
        baseline, or a failure result carrying an upgrade hint.
    """
    major, minor = sys.version_info[:2]
    baseline = f"{REQUIRED_PYTHON[0]}.{REQUIRED_PYTHON[1]}+"
    if (major, minor) >= REQUIRED_PYTHON:
        return _ok("Python version", f"{major}.{minor} (project baseline: {baseline})")
    return _fail(
        "Python version",
        f"{major}.{minor} is too old for this repository",
        f"Use Python {baseline} so packaging, README, doctor, and CI stay aligned.",
    )


def _check_profile_platform(profile: str) -> CheckResult:
    """Report whether the selected doctor profile matches the current platform.

    Args:
        profile: Doctor profile name; only ``"linux-remote"`` triggers the
            platform comparison.

    Returns:
        OK for any other profile or for ``linux-remote`` on a Linux host;
        a warning when ``linux-remote`` is selected on another platform.
    """
    if profile != "linux-remote":
        return _ok("Doctor profile", "standard local checks")
    if sys.platform.startswith("linux"):
        return _ok("Doctor profile", "linux-remote checks on Linux host")
    return _warn(
        "Doctor profile",
        f"linux-remote selected on {sys.platform}",
        "Run `make doctor-linux` on the actual Linux worker node for the most useful signal.",
    )


def _check_linux_runtime_mode(repo_root: Path, profile: str) -> CheckResult | None:
    """Check that ``OPENHANDS_RUNTIME`` is set to ``host`` for Linux workers.

    The value returned by ``_read_env_value`` is consulted first; the process
    environment is used only when that lookup yields nothing.
    NOTE(review): ``_read_env_value`` presumably reads the repository env
    file(s) -- confirm against its definition.

    Args:
        repo_root: Repository root passed through to ``_read_env_value``.
        profile: Doctor profile name.

    Returns:
        ``None`` when the profile is not ``linux-remote``; otherwise a warning
        if the variable is unset or not ``host``, else an OK result.
    """
    if profile != "linux-remote":
        return None
    runtime = (_read_env_value(repo_root, "OPENHANDS_RUNTIME") or "").strip().lower()
    if not runtime:
        runtime = os.getenv("OPENHANDS_RUNTIME", "").strip().lower()
    if not runtime:
        return _warn(
            "OpenHands runtime",
            "OPENHANDS_RUNTIME is not set",
            "Set `OPENHANDS_RUNTIME=host` on Linux workers unless you have a validated "
            "container runtime.",
        )
    if runtime != "host":
        return _warn(
            "OpenHands runtime",
            f"OPENHANDS_RUNTIME={runtime}",
            "Linux workers are best started with `OPENHANDS_RUNTIME=host` for the first "
            "stable bring-up.",
        )
    return _ok("OpenHands runtime", "OPENHANDS_RUNTIME=host")


def _check_linux_docker_host(profile: str) -> CheckResult | None:
    """Warn when ``DOCKER_HOST`` looks like a socket inherited from a Mac host.

    Args:
        profile: Doctor profile name.

    Returns:
        ``None`` when the profile is not ``linux-remote``; OK when the variable
        is unset or contains none of the Mac-specific markers; a warning
        otherwise.
    """
    if profile != "linux-remote":
        return None
    docker_host = os.getenv("DOCKER_HOST", "").strip()
    if not docker_host:
        return _ok("Docker host", "DOCKER_HOST is not set")
    # Substrings that indicate a Colima socket or a macOS-style filesystem path.
    mac_markers = ("colima", "/Users/", "/Volumes/")
    if any(marker in docker_host for marker in mac_markers):
        return _warn(
            "Docker host",
            docker_host,
            "Unset DOCKER_HOST or point it to a local Linux daemon; do not inherit a Mac "
            "Colima socket on a Linux worker.",
        )
    return _ok("Docker host", docker_host)


def _check_linux_runtime_paths(repo_root: Path, profile: str) -> CheckResult | None:
    """Check that the runtime directories can actually be written into.

    For each required directory the probe targets the directory itself when
    it exists, otherwise its parent (where it would have to be created).

    Args:
        repo_root: Repository root under which the runtime directories live.
        profile: Doctor profile name.

    Returns:
        ``None`` when the profile is not ``linux-remote``; a failure listing
        the blocked paths, or an OK result when all targets are usable.
    """
    if profile != "linux-remote":
        return None
    required_roots = (
        repo_root / ".masfactory_runtime",
        repo_root / "artifacts",
        repo_root / "logs",
    )
    # Creating entries inside a directory needs write AND search (execute)
    # permission; probing W_OK alone reports a `w`-without-`x` directory as
    # usable even though nothing can be created in it.
    needed = os.W_OK | os.X_OK
    blocked = [
        str(path)
        for path in required_roots
        if not os.access(path if path.exists() else path.parent, needed)
    ]
    if blocked:
        return _fail(
            "Runtime paths",
            "Not writable: " + ", ".join(blocked),
            "Grant the Linux worker write access to runtime directories before dispatching "
            "real jobs.",
        )
    return _ok("Runtime paths", "runtime directories are writable")


def _check_linux_optional_commands(profile: str) -> CheckResult | None:
    """Warn about missing optional CLI tools on a Linux remote worker.

    Args:
        profile: Doctor profile name.

    Returns:
        ``None`` when the profile is not ``linux-remote``; a warning naming the
        commands from ``LINUX_REMOTE_OPTIONAL_COMMANDS`` that are not on
        ``PATH``, or an OK result when all are found.
    """
    if profile != "linux-remote":
        return None
    missing = [cmd for cmd in LINUX_REMOTE_OPTIONAL_COMMANDS if shutil.which(cmd) is None]
    if missing:
        return _warn(
            "Linux extras",
            "Missing: " + ", ".join(missing),
            "Install `gh` for promotion/PR workflows and `tmux` for resilient long-lived "
            "remote sessions.",
        )
    return _ok("Linux extras", "gh and tmux are available")
def export_schemas(output_dir: Path) -> list[Path]:
    """Write the remote-run JSON schemas into ``output_dir``.

    The directory is created if needed. Each schema is serialized with sorted
    keys, two-space indentation and a trailing newline, encoded as UTF-8.

    Args:
        output_dir: Directory that receives the schema files.

    Returns:
        The paths of the files written, in the order they were written.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    schema_by_filename = {
        "task_run.schema.json": RemoteTaskSpec.model_json_schema(),
        "run_summary.schema.json": RemoteRunSummary.model_json_schema(),
    }
    exported: list[Path] = []
    for filename, schema in schema_by_filename.items():
        destination = output_dir / filename
        rendered = json.dumps(schema, ensure_ascii=False, indent=2, sort_keys=True)
        destination.write_text(f"{rendered}\n", encoding="utf-8")
        exported.append(destination)
    return exported


def main() -> int:
    """Parse the command line, export the schemas, and return exit code 0."""
    # Default target: the `schemas` directory one level above this script's directory.
    default_dir = Path(__file__).resolve().parents[1] / "schemas"
    cli = argparse.ArgumentParser(description="Export remote-run JSON schemas.")
    cli.add_argument(
        "--output-dir",
        default=str(default_dir),
        help="directory to receive exported schema files",
    )
    options = cli.parse_args()
    destination_dir = Path(options.output_dir).expanduser().resolve()
    export_schemas(destination_dir)
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
;; + ack-breaker) + post_json \ + "/api/v1/housekeeper/mode" \ + '{"action":"ack_circuit_breaker","changed_by":"systemd","reason":"recovered_from_circuit_breaker","metadata":{"source":"systemd_timer"}}' + ;; + *) + echo "unsupported action: ${ACTION}" >&2 + exit 2 + ;; +esac diff --git a/scripts/launch_ai_lab.sh b/scripts/launch_ai_lab.sh index 7b3f06e9..42fbeb73 100755 --- a/scripts/launch_ai_lab.sh +++ b/scripts/launch_ai_lab.sh @@ -14,9 +14,15 @@ OVERRIDE_LOG_DIR="${LOG_DIR:-}" OVERRIDE_CACHE_DIR="${CACHE_DIR:-}" OVERRIDE_LAB_USER="${LAB_USER:-}" OVERRIDE_AUTO_OPEN_DOCKER="${AUTO_OPEN_DOCKER:-}" +OVERRIDE_AUTO_START_COLIMA="${AUTO_START_COLIMA:-}" OVERRIDE_IMAGE_TAG="${AI_LAB_IMAGE_TAG:-}" OVERRIDE_FORCE_DOCKER_RUN="${AI_LAB_FORCE_DOCKER_RUN:-}" OVERRIDE_HOST_MOUNT_ROOT="${AI_LAB_HOST_MOUNT_ROOT:-}" +OVERRIDE_OPENHANDS_HOME_DIR="${OPENHANDS_HOME_DIR:-}" +OVERRIDE_DOCKER_HOST_SOCKET_PATH="${DOCKER_HOST_SOCKET_PATH:-}" +OVERRIDE_DOCKER_HOST_IN_CONTAINER="${DOCKER_HOST_IN_CONTAINER:-}" +OVERRIDE_DOCKER_HOST_MOUNT_DIR="${DOCKER_HOST_MOUNT_DIR:-}" +OVERRIDE_COLIMA_HELPER="${AI_LAB_COLIMA_HELPER:-}" load_env_file() { local file="$1" @@ -38,10 +44,15 @@ LOG_DIR="${OVERRIDE_LOG_DIR:-${LOG_DIR:-/Users/ai_lab/logs}}" CACHE_DIR="${OVERRIDE_CACHE_DIR:-${CACHE_DIR:-/Users/ai_lab/.cache}}" LAB_USER="${OVERRIDE_LAB_USER:-${LAB_USER:-ai_lab}}" AUTO_OPEN_DOCKER="${OVERRIDE_AUTO_OPEN_DOCKER:-${AUTO_OPEN_DOCKER:-1}}" +AUTO_START_COLIMA="${OVERRIDE_AUTO_START_COLIMA:-${AUTO_START_COLIMA:-1}}" IMAGE_TAG="${OVERRIDE_IMAGE_TAG:-${AI_LAB_IMAGE_TAG:-ai-lab-local:dev}}" FORCE_DOCKER_RUN="${OVERRIDE_FORCE_DOCKER_RUN:-${AI_LAB_FORCE_DOCKER_RUN:-0}}" HOST_MOUNT_ROOT="${OVERRIDE_HOST_MOUNT_ROOT:-${AI_LAB_HOST_MOUNT_ROOT:-${WORKSPACE_DIR}}}" -OPENHANDS_HOME_DIR="${LOG_DIR}/openhands-home" +OPENHANDS_HOME_DIR="${OVERRIDE_OPENHANDS_HOME_DIR:-${OPENHANDS_HOME_DIR:-${LOG_DIR}/openhands-home}}" +DOCKER_HOST_SOCKET_PATH="${OVERRIDE_DOCKER_HOST_SOCKET_PATH:-${DOCKER_HOST_SOCKET_PATH:-}}" 
+DOCKER_HOST_IN_CONTAINER="${OVERRIDE_DOCKER_HOST_IN_CONTAINER:-${DOCKER_HOST_IN_CONTAINER:-}}" +DOCKER_HOST_MOUNT_DIR="${OVERRIDE_DOCKER_HOST_MOUNT_DIR:-${DOCKER_HOST_MOUNT_DIR:-/var/run/host-docker}}" +COLIMA_HELPER="${OVERRIDE_COLIMA_HELPER:-${AI_LAB_COLIMA_HELPER:-${REPO_ROOT}/scripts/colima-external.sh}}" if [[ "${LAB_USER}" != "ai_lab" ]]; then WORKSPACE_DIR="${WORKSPACE_DIR:-/Users/${LAB_USER}/workspace}" @@ -84,6 +95,68 @@ need_cmd() { command -v "$1" >/dev/null 2>&1 || die "missing command: $1" } +docker_host_socket_path() { + local docker_host_value="${DOCKER_HOST:-}" + if [[ "${docker_host_value}" =~ ^unix://(.+)$ ]]; then + printf '%s' "${BASH_REMATCH[1]}" + fi +} + +preferred_colima_home_path() { + if [[ -n "${COLIMA_HOME_PATH:-}" ]]; then + printf '%s' "${COLIMA_HOME_PATH}" + return + fi + + if [[ -d "/Volumes/ColimaStore/.colima-home" ]] || [[ -e "/Volumes/AI_LAB/colima-store.sparsebundle" ]]; then + printf '/Volumes/ColimaStore/.colima-home' + return + fi + + printf '%s/.colima' "${HOME}" +} + +preferred_colima_socket_path() { + local colima_home + local profile="${COLIMA_PROFILE:-default}" + colima_home="$(preferred_colima_home_path)" + printf '%s/%s/docker.sock' "${colima_home}" "${profile}" +} + +configure_docker_socket_env() { + if [[ -z "${DOCKER_HOST_SOCKET_PATH}" ]]; then + DOCKER_HOST_SOCKET_PATH="$(docker_host_socket_path)" + fi + + if [[ -n "${DOCKER_HOST_SOCKET_PATH}" ]]; then + DOCKER_HOST_SOCKET_DIR="${DOCKER_HOST_SOCKET_DIR:-$(dirname "${DOCKER_HOST_SOCKET_PATH}")}" + DOCKER_HOST_SOCKET_NAME="${DOCKER_HOST_SOCKET_NAME:-$(basename "${DOCKER_HOST_SOCKET_PATH}")}" + DOCKER_HOST_IN_CONTAINER="${DOCKER_HOST_IN_CONTAINER:-unix://${DOCKER_HOST_MOUNT_DIR}/${DOCKER_HOST_SOCKET_NAME}}" + export DOCKER_HOST_SOCKET_PATH + export DOCKER_HOST_SOCKET_DIR + export DOCKER_HOST_SOCKET_NAME + export DOCKER_HOST_MOUNT_DIR + export DOCKER_HOST_IN_CONTAINER + fi +} + +describe_socket_access() { + local socket_path="$1" + if [[ -e "${socket_path}" ]]; then + 
stat -f 'owner=%Su group=%Sg mode=%Sp path=%N' "${socket_path}" 2>/dev/null || true + else + printf 'missing path=%s\n' "${socket_path}" + fi +} + +docker_host_looks_like_colima() { + local docker_host_value="${DOCKER_HOST:-}" + if [[ "${docker_host_value}" == *".colima/"* ]]; then + return 0 + fi + [[ "${DOCKER_CONTEXT:-}" == "colima" ]] +} + docker_ready() { python3 - <<'PY' >/dev/null 2>&1 import subprocess @@ -91,7 +164,7 @@ import sys try: completed = subprocess.run( - ["docker", "info"], + ["docker", "version", "--format", "{{.Server.Version}}"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3, @@ -104,12 +177,61 @@ sys.exit(0 if completed.returncode == 0 else 1) PY } +try_safe_colima_fallback() { + if [[ "${AUTO_START_COLIMA}" != "1" ]]; then + return 1 + fi + + log "configured Colima socket is not usable; trying a safe Colima fallback" + + local ready_label="current-user" + + if [[ -f "${COLIMA_HELPER}" ]] && { [[ -n "${COLIMA_HOME_PATH:-}" ]] || [[ -d "/Volumes/ColimaStore/.colima-home" ]] || [[ -e "/Volumes/AI_LAB/colima-store.sparsebundle" ]]; }; then + bash "${COLIMA_HELPER}" start + ready_label="repo-managed" + else + need_cmd colima + local -a colima_args=(start --profile "${COLIMA_PROFILE:-default}") + if [[ -d "/Volumes/AI_LAB" ]]; then + colima_args+=(--mount "/Volumes/AI_LAB:w") + fi + colima "${colima_args[@]}" + fi + + export DOCKER_HOST="unix://$(preferred_colima_socket_path)" + DOCKER_HOST_SOCKET_PATH="" + configure_docker_socket_env + + for _ in {1..20}; do + if docker_ready; then + ok "${ready_label} Colima is ready via ${DOCKER_HOST}" + return 0 + fi + sleep 2 + done + + return 1 +} + ensure_docker() { need_cmd docker + configure_docker_socket_env if docker_ready; then + configure_docker_socket_env return fi + local configured_socket + configured_socket="$(docker_host_socket_path)" + if [[ -n "${configured_socket}" ]] && [[ ! -r "${configured_socket}" || ! 
-w "${configured_socket}" ]]; then + warn "configured DOCKER_HOST socket is not accessible" + describe_socket_access "${configured_socket}" >&2 || true + if docker_host_looks_like_colima && try_safe_colima_fallback; then + return + fi + die "Docker socket is configured but not accessible. Update DOCKER_HOST or start a Colima profile owned by the current user." + fi + if [[ "${AUTO_OPEN_DOCKER}" == "1" ]] && [[ "$(uname -s)" == "Darwin" ]]; then log "Docker daemon not ready; trying to open Docker Desktop" open -a Docker >/dev/null 2>&1 || true @@ -207,6 +329,7 @@ build_direct_image() { } direct_docker_base_args() { + configure_docker_socket_env DIRECT_DOCKER_ARGS=( docker run --rm --platform linux/arm64 @@ -221,6 +344,13 @@ direct_docker_base_args() { -e PYTHONUNBUFFERED=1 ) + if [[ -n "${DOCKER_HOST_SOCKET_PATH}" ]] && [[ -S "${DOCKER_HOST_SOCKET_PATH}" ]]; then + DIRECT_DOCKER_ARGS+=( + --mount "type=bind,src=${DOCKER_HOST_SOCKET_DIR},dst=${DOCKER_HOST_MOUNT_DIR}" + -e "DOCKER_HOST=${DOCKER_HOST_IN_CONTAINER}" + ) + fi + append_env_passthrough TELEGRAM_BOT_TOKEN append_env_passthrough TELEGRAM_CHAT_ID append_env_passthrough GITHUB_TOKEN diff --git a/scripts/openhands_controlled_backend.sh b/scripts/openhands_controlled_backend.sh index 92be54be..1dac3123 100755 --- a/scripts/openhands_controlled_backend.sh +++ b/scripts/openhands_controlled_backend.sh @@ -6,6 +6,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/.." 
&& pwd)" RUN_ID="${OPENHANDS_RUN_ID:-$(date -u +%Y%m%dT%H%M%SZ)}" RUN_ROOT="${OPENHANDS_RUN_ROOT:-${REPO_ROOT}/.masfactory_runtime/openhands-controlled}" +BASELINE_DIR="${OPENHANDS_BASELINE_DIR:-${RUN_ROOT}/${RUN_ID}/baseline}" ISOLATED_WORKSPACE="${OPENHANDS_ISOLATED_WORKSPACE:-${RUN_ROOT}/${RUN_ID}/workspace}" ARTIFACT_DIR="${OPENHANDS_ARTIFACT_DIR:-${REPO_ROOT}/logs/audit/openhands/jobs/${RUN_ID}}" CHAIN_DRY_RUN="${OPENHANDS_CHAIN_DRY_RUN:-0}" @@ -33,7 +34,7 @@ json_escape() { printf '%s' "${input}" } -mkdir -p "${ISOLATED_WORKSPACE}" "${ARTIFACT_DIR}" +mkdir -p "${BASELINE_DIR}" "${ISOLATED_WORKSPACE}" "${ARTIFACT_DIR}" if [[ -z "${VALIDATE_CMD}" ]]; then VALIDATE_CMD="python3 scripts/check_prompt_hygiene.py --root src --output-dir ${ARTIFACT_DIR}/prompt_hygiene --min-repeat 3" @@ -42,6 +43,8 @@ fi RSYNC_EXCLUDES=( --exclude '.git' --exclude '.venv' + --exclude '.masfactory_runtime' + --exclude 'logs' --exclude 'node_modules' --exclude 'panel/out' --exclude 'dashboard/.next' @@ -50,11 +53,15 @@ RSYNC_EXCLUDES=( ) echo "[controlled-backend] run_id: ${RUN_ID}" +echo "[controlled-backend] baseline dir: ${BASELINE_DIR}" echo "[controlled-backend] isolated workspace: ${ISOLATED_WORKSPACE}" echo "[controlled-backend] artifact dir: ${ARTIFACT_DIR}" +echo "[controlled-backend] preparing filtered baseline snapshot" +rsync -a --delete "${RSYNC_EXCLUDES[@]}" "${REPO_ROOT}/" "${BASELINE_DIR}/" + echo "[controlled-backend] preparing isolated workspace snapshot" -rsync -a --delete "${RSYNC_EXCLUDES[@]}" "${REPO_ROOT}/" "${ISOLATED_WORKSPACE}/" +rsync -a --delete "${BASELINE_DIR}/" "${ISOLATED_WORKSPACE}/" echo "[controlled-backend] executing OpenHands" set +e @@ -80,11 +87,12 @@ fi echo "[controlled-backend] collecting promotion patch" set +e +RAW_PATCH_PATH="${ARTIFACT_DIR}/promotion.raw.patch" git --no-pager diff --no-index \ -- \ - "${REPO_ROOT}" \ + "${BASELINE_DIR}" \ "${ISOLATED_WORKSPACE}" \ - > "${ARTIFACT_DIR}/promotion.patch" + > "${RAW_PATCH_PATH}" 
DIFF_EXIT_CODE=$? set -e @@ -93,6 +101,76 @@ if [[ ${DIFF_EXIT_CODE} -gt 1 ]]; then exit ${DIFF_EXIT_CODE} fi +python3 - "${RAW_PATCH_PATH}" "${BASELINE_DIR}" "${ISOLATED_WORKSPACE}" "${ARTIFACT_DIR}/promotion.patch" <<'PY' +from pathlib import Path +import re +import sys + +raw_patch_path = Path(sys.argv[1]) +baseline_dir = str(Path(sys.argv[2]).resolve()) +workspace_dir = str(Path(sys.argv[3]).resolve()) +target_path = Path(sys.argv[4]) + +text = raw_patch_path.read_text(encoding="utf-8") if raw_patch_path.exists() else "" + + +def normalize_path(token: str, side: str) -> str: + if token == "/dev/null": + return token + + candidate = token + if candidate.startswith("a/") or candidate.startswith("b/"): + candidate = candidate[2:] + + candidates = { + candidate, + candidate.lstrip("/"), + "/" + candidate.lstrip("/"), + } + + for prefix in (baseline_dir, workspace_dir): + prefixes = { + prefix.rstrip("/"), + prefix.rstrip("/").lstrip("/"), + "/" + prefix.rstrip("/").lstrip("/"), + } + for probe in candidates: + for normalized_prefix in prefixes: + prefix_with_sep = normalized_prefix.rstrip("/") + "/" + if probe == normalized_prefix or probe.startswith(prefix_with_sep): + rel = probe[len(normalized_prefix) :].lstrip("/") + return f"{side}/{rel}" if rel else side + + if token.startswith(f"{side}/"): + return token + return f"{side}/{candidate.lstrip('/')}" + + +normalized_lines: list[str] = [] +for line in text.splitlines(): + if line.startswith("diff --git "): + match = re.match(r"^diff --git a/(.+) b/(.+)$", line) + if match: + left = normalize_path(match.group(1), "a") + right = normalize_path(match.group(2), "b") + normalized_lines.append(f"diff --git {left} {right}") + continue + if line.startswith("--- "): + normalized_lines.append(f"--- {normalize_path(line[4:], 'a')}") + continue + if line.startswith("+++ "): + normalized_lines.append(f"+++ {normalize_path(line[4:], 'b')}") + continue + normalized_lines.append(line) + +normalized = "\n".join(normalized_lines) 
+if normalized: + normalized += "\n" +target_path.write_text(normalized, encoding="utf-8") +PY + +rm -f "${RAW_PATCH_PATH}" + PROMOTION_READY=false if [[ ${OPENHANDS_EXIT_CODE} -eq 0 && ${VALIDATION_EXIT_CODE} -eq 0 ]]; then PROMOTION_READY=true diff --git a/scripts/openhands_start.sh b/scripts/openhands_start.sh index ba923b41..a6fcd390 100755 --- a/scripts/openhands_start.sh +++ b/scripts/openhands_start.sh @@ -3,6 +3,19 @@ set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +OPENHANDS_ENV_FILE="${OPENHANDS_ENV_FILE:-${REPO_ROOT}/ai_lab.env}" + +load_env_file() { + local file="$1" + if [[ -f "${file}" ]]; then + # shellcheck disable=SC1090 + set -a + . "${file}" + set +a + fi +} + +load_env_file "${OPENHANDS_ENV_FILE}" PROMPT="${*:-${OPENHANDS_TASK:-}}" RUNTIME="${OPENHANDS_RUNTIME:-ai-lab}" @@ -19,26 +32,81 @@ AUDIT_FILE="${OPENHANDS_AUDIT_FILE:-${AUDIT_DIR}/compliance.json}" MAX_FILES_PER_STEP="${OPENHANDS_MAX_FILES_PER_STEP:-3}" OPENHANDS_CMD="${OPENHANDS_CMD:-}" OPENHANDS_CMD_TEMPLATE="${OPENHANDS_CMD_TEMPLATE:-}" +OPENHANDS_HEADLESS="${OPENHANDS_HEADLESS:-1}" +OPENHANDS_JSON="${OPENHANDS_JSON:-0}" +OPENHANDS_EXPERIMENTAL="${OPENHANDS_EXPERIMENTAL:-1}" +OPENHANDS_SANDBOX_PROVIDER="${OPENHANDS_SANDBOX_PROVIDER:-process}" +OPENHANDS_SANDBOX_VOLUMES="${OPENHANDS_SANDBOX_VOLUMES:-${WORKSPACE}:/workspace:rw}" +OPENHANDS_PERSISTENCE_DIR="${OPENHANDS_PERSISTENCE_DIR:-${AUDIT_DIR}/state}" +OPENHANDS_LOCAL_BIN="${OPENHANDS_LOCAL_BIN:-${REPO_ROOT}/.masfactory_runtime/tools/openhands-cli-py312/bin/openhands}" +OPENHANDS_CONTAINER_BIN="${OPENHANDS_CONTAINER_BIN:-openhands}" +OPENHANDS_BOOTSTRAP_PYTHON="${OPENHANDS_BOOTSTRAP_PYTHON:-}" +OPENHANDS_HOME_ROOT="${OPENHANDS_HOME_ROOT:-${REPO_ROOT}/.masfactory_runtime/tools/openhands-home}" +OPENHANDS_CONFIG_DIR="${OPENHANDS_CONFIG_DIR:-}" +OPENHANDS_CONTAINER_WORKSPACE="${OPENHANDS_CONTAINER_WORKSPACE:-/opt/workspace}" 
+OPENHANDS_RUN_AS_USER="${OPENHANDS_RUN_AS_USER:-nobody}" +OPENHANDS_RUN_AS_HOME="${OPENHANDS_RUN_AS_HOME:-/tmp/openhands-home}" +OPENHANDS_RUN_AS_ENABLED="${OPENHANDS_RUN_AS_ENABLED:-1}" if [[ -z "${PROMPT}" ]]; then echo "[openhands] missing task prompt" >&2 exit 40 fi +if [[ -z "${OPENHANDS_CONFIG_DIR}" ]]; then + if [[ "${RUNTIME}" == "ai-lab" ]]; then + OPENHANDS_CONFIG_DIR="${LOG_DIR:-${REPO_ROOT}/logs}/openhands-home" + else + OPENHANDS_CONFIG_DIR="${OPENHANDS_HOME_ROOT}/.openhands" + fi +fi + +if [[ "${RUNTIME}" == "host" ]]; then + OPENHANDS_HOME_ROOT="$(dirname "${OPENHANDS_CONFIG_DIR}")" +fi + +if [[ -z "${OPENHANDS_BOOTSTRAP_PYTHON}" ]]; then + if [[ -x "${OPENHANDS_LOCAL_BIN%/*}/python" ]]; then + OPENHANDS_BOOTSTRAP_PYTHON="${OPENHANDS_LOCAL_BIN%/*}/python" + else + OPENHANDS_BOOTSTRAP_PYTHON="${PYTHON_BIN:-python3}" + fi +fi + +OPENHANDS_SETTINGS_PATH="${OPENHANDS_SETTINGS_PATH:-${OPENHANDS_CONFIG_DIR}/agent_settings.json}" + mkdir -p "${AUDIT_DIR}" "$(dirname "${AUDIT_FILE}")" PROMPT_FILE="${AUDIT_DIR}/prompt.txt" printf '%s\n' "${PROMPT}" > "${PROMPT_FILE}" if [[ -z "${OPENHANDS_CMD_TEMPLATE}" ]]; then if [[ -z "${OPENHANDS_CMD}" ]]; then - if [[ "${DRY_RUN}" == "1" ]]; then + if [[ "${RUNTIME}" == "ai-lab" ]]; then + OPENHANDS_CMD="${OPENHANDS_CONTAINER_BIN}" + elif [[ -x "${OPENHANDS_LOCAL_BIN}" ]]; then + OPENHANDS_CMD="${OPENHANDS_LOCAL_BIN}" + elif command -v openhands >/dev/null 2>&1; then + OPENHANDS_CMD="$(command -v openhands)" + elif [[ "${DRY_RUN}" == "1" ]]; then OPENHANDS_CMD="openhands" else echo "[openhands] OPENHANDS_CMD or OPENHANDS_CMD_TEMPLATE is required for real execution" >&2 exit 127 fi fi - OPENHANDS_CMD_TEMPLATE='${OPENHANDS_CMD} "${OPENHANDS_PROMPT}"' + if [[ "${OPENHANDS_HEADLESS}" == "1" ]]; then + OPENHANDS_CMD_TEMPLATE='RUNTIME="${OPENHANDS_SANDBOX_PROVIDER}" SANDBOX_VOLUMES="${OPENHANDS_SANDBOX_VOLUMES}" OH_PERSISTENCE_DIR="${OPENHANDS_PERSISTENCE_DIR}" "${OPENHANDS_CMD}"' + if [[ "${OPENHANDS_EXPERIMENTAL}" == "1" 
]]; then + OPENHANDS_CMD_TEMPLATE+=' --exp' + fi + OPENHANDS_CMD_TEMPLATE+=' --headless' + if [[ "${OPENHANDS_JSON}" == "1" ]]; then + OPENHANDS_CMD_TEMPLATE+=' --json' + fi + OPENHANDS_CMD_TEMPLATE+=' -t "${OPENHANDS_PROMPT}"' + else + OPENHANDS_CMD_TEMPLATE='"${OPENHANDS_CMD}" "${OPENHANDS_PROMPT}"' + fi fi quote_cmd() { @@ -109,11 +177,60 @@ host_mount_root_for_path() { esac } +ensure_openhands_agent_settings() { + if [[ "${DRY_RUN}" == "1" ]]; then + return 0 + fi + + if [[ -f "${OPENHANDS_SETTINGS_PATH}" ]] && [[ "${OPENHANDS_FORCE_AGENT_SETTINGS:-0}" != "1" ]]; then + return 0 + fi + + if [[ -z "${LLM_MODEL:-}" || -z "${LLM_API_KEY:-}" ]]; then + echo "[openhands] missing LLM_MODEL or LLM_API_KEY for agent settings bootstrap" >&2 + exit 40 + fi + + if [[ ! -x "${OPENHANDS_BOOTSTRAP_PYTHON}" ]] && ! command -v "${OPENHANDS_BOOTSTRAP_PYTHON}" >/dev/null 2>&1; then + echo "[openhands] bootstrap python not found: ${OPENHANDS_BOOTSTRAP_PYTHON}" >&2 + exit 127 + fi + + mkdir -p "${OPENHANDS_CONFIG_DIR}" + + "${OPENHANDS_BOOTSTRAP_PYTHON}" - "${OPENHANDS_SETTINGS_PATH}" "${LLM_MODEL}" "${LLM_API_KEY}" "${LLM_BASE_URL:-}" <<'PY' +import json +import sys +from pathlib import Path + +from openhands.sdk import Agent, LLM +from openhands.tools.preset.default import get_default_tools + +settings_path = Path(sys.argv[1]) +model = sys.argv[2] +api_key = sys.argv[3] +base_url = sys.argv[4] or None + +settings_path.parent.mkdir(parents=True, exist_ok=True) +agent = Agent( + llm=LLM(model=model, api_key=api_key, base_url=base_url), + tools=get_default_tools(enable_browser=False), +) +settings_path.write_text( + json.dumps(agent.model_dump(context={"expose_secrets": True}), indent=2) + "\n", + encoding="utf-8", +) +PY + + chmod 600 "${OPENHANDS_SETTINGS_PATH}" 2>/dev/null || true +} + run_host() { local shell_cmd - shell_cmd='mkdir -p "${OPENHANDS_AUDIT_DIR}" && eval "${OPENHANDS_CMD_TEMPLATE}"' + shell_cmd='mkdir -p "${OPENHANDS_AUDIT_DIR}" "${OPENHANDS_PERSISTENCE_DIR}" && cd 
"${OPENHANDS_WORKSPACE}" && eval "${OPENHANDS_CMD_TEMPLATE}"' local -a cmd=( env + "HOME=${OPENHANDS_HOME_ROOT}" "OPENHANDS_PROMPT=${PROMPT}" "OPENHANDS_PROMPT_FILE=${PROMPT_FILE}" "OPENHANDS_WORKSPACE=${WORKSPACE}" @@ -122,6 +239,13 @@ run_host() { "OPENHANDS_MAX_FILES_PER_STEP=${MAX_FILES_PER_STEP}" "OPENHANDS_CMD=${OPENHANDS_CMD}" "OPENHANDS_CMD_TEMPLATE=${OPENHANDS_CMD_TEMPLATE}" + "OPENHANDS_HEADLESS=${OPENHANDS_HEADLESS}" + "OPENHANDS_JSON=${OPENHANDS_JSON}" + "OPENHANDS_EXPERIMENTAL=${OPENHANDS_EXPERIMENTAL}" + "OPENHANDS_SANDBOX_PROVIDER=${OPENHANDS_SANDBOX_PROVIDER}" + "OPENHANDS_SANDBOX_VOLUMES=${OPENHANDS_SANDBOX_VOLUMES}" + "OPENHANDS_PERSISTENCE_DIR=${OPENHANDS_PERSISTENCE_DIR}" + "OPENHANDS_SETTINGS_PATH=${OPENHANDS_SETTINGS_PATH}" /bin/bash -lc "${shell_cmd}" ) @@ -137,31 +261,63 @@ run_ai_lab() { local container_audit_dir local container_audit_file local container_prompt_file + local container_persistence_dir + local container_sandbox_volumes local extra_volume local host_mount_root local shell_cmd + local persistence_token + local runtime_settings_path + local runtime_config_dir + local runtime_home + local runtime_state_dir + local runtime_script_path container_audit_dir="$(container_path_for_host_path "${AUDIT_DIR}")" container_audit_file="$(container_path_for_host_path "${AUDIT_FILE}")" container_prompt_file="$(container_path_for_host_path "${PROMPT_FILE}")" + persistence_token="$(basename "$(dirname "${AUDIT_DIR}")")-$(basename "${AUDIT_DIR}")" + container_sandbox_volumes="${OPENHANDS_CONTAINER_WORKSPACE}:/workspace:rw" host_mount_root="$(host_mount_root_for_path "${PROMPT_FILE}")" - extra_volume="${WORKSPACE}:/opt/workspace:rw" - shell_cmd='mkdir -p "${OPENHANDS_AUDIT_DIR}" && eval "${OPENHANDS_CMD_TEMPLATE}"' + extra_volume="${WORKSPACE}:${OPENHANDS_CONTAINER_WORKSPACE}:rw" + runtime_home="${OPENHANDS_RUN_AS_HOME}" + runtime_config_dir="${runtime_home}/.openhands" + runtime_state_dir="${runtime_home}/state" + 
container_persistence_dir="${runtime_state_dir}/${persistence_token}" + runtime_settings_path="${runtime_config_dir}/agent_settings.json" + runtime_script_path="/tmp/openhands-run.sh" + + if [[ "${OPENHANDS_RUN_AS_ENABLED}" == "1" && "${OPENHANDS_RUN_AS_USER}" != "root" && -n "${OPENHANDS_RUN_AS_USER}" ]]; then + shell_cmd='mkdir -p "${OPENHANDS_AUDIT_DIR}" "${OPENHANDS_PERSISTENCE_DIR}" "'"${runtime_config_dir}"'" "'"${runtime_state_dir}"'" && printf "%s\n" "set -euo pipefail" "cd \"\${OPENHANDS_WORKSPACE}\"" "eval \"\${OPENHANDS_CMD_TEMPLATE}\"" > "'"${runtime_script_path}"'" && chmod 755 "'"${runtime_script_path}"'" && cp /root/.openhands/agent_settings.json "'"${runtime_settings_path}"'" && chmod 777 "'"${runtime_home}"'" "'"${runtime_config_dir}"'" "'"${runtime_state_dir}"'" "${OPENHANDS_PERSISTENCE_DIR}" && chmod 644 "'"${runtime_settings_path}"'" && chmod o+rx /root && runuser -u "'"${OPENHANDS_RUN_AS_USER}"'" -- env HOME="'"${runtime_home}"'" PATH="/usr/local/bin:/usr/bin:/bin:/root/.local/bin" OPENHANDS_SETTINGS_PATH="'"${runtime_settings_path}"'" /bin/bash "'"${runtime_script_path}"'"' + else + shell_cmd='mkdir -p "${OPENHANDS_AUDIT_DIR}" "${OPENHANDS_PERSISTENCE_DIR}" && cd "${OPENHANDS_WORKSPACE}" && eval "${OPENHANDS_CMD_TEMPLATE}"' + fi local -a cmd=( env "AI_LAB_HOST_MOUNT_ROOT=${host_mount_root}" + "OPENHANDS_HOME_DIR=${OPENHANDS_CONFIG_DIR}" "${REPO_ROOT}/scripts/launch_ai_lab.sh" run env "OPENHANDS_PROMPT=${PROMPT}" "OPENHANDS_PROMPT_FILE=${container_prompt_file}" - "OPENHANDS_WORKSPACE=/opt/workspace" + "OPENHANDS_WORKSPACE=${OPENHANDS_CONTAINER_WORKSPACE}" "OPENHANDS_AUDIT_DIR=${container_audit_dir}" "OPENHANDS_AUDIT_FILE=${container_audit_file}" "OPENHANDS_MAX_FILES_PER_STEP=${MAX_FILES_PER_STEP}" "OPENHANDS_CMD=${OPENHANDS_CMD}" "OPENHANDS_CMD_TEMPLATE=${OPENHANDS_CMD_TEMPLATE}" + "OPENHANDS_HEADLESS=${OPENHANDS_HEADLESS}" + "OPENHANDS_JSON=${OPENHANDS_JSON}" + "OPENHANDS_EXPERIMENTAL=${OPENHANDS_EXPERIMENTAL}" + 
"OPENHANDS_SANDBOX_PROVIDER=${OPENHANDS_SANDBOX_PROVIDER}" + "OPENHANDS_SANDBOX_VOLUMES=${container_sandbox_volumes}" + "OPENHANDS_PERSISTENCE_DIR=${container_persistence_dir}" + "OPENHANDS_SETTINGS_PATH=${runtime_settings_path}" + "OPENHANDS_RUN_AS_USER=${OPENHANDS_RUN_AS_USER}" + "OPENHANDS_RUN_AS_HOME=${runtime_home}" + "OPENHANDS_RUN_AS_ENABLED=${OPENHANDS_RUN_AS_ENABLED}" /bin/bash -lc "${shell_cmd}" ) @@ -174,6 +330,9 @@ run_ai_lab() { EXTRA_VOLUME="${extra_volume}" "${cmd[@]}" } +mkdir -p "$(dirname "${OPENHANDS_CONFIG_DIR}")" +ensure_openhands_agent_settings + case "${RUNTIME}" in host) run_host diff --git a/scripts/smoke_control_plane_hardening.py b/scripts/smoke_control_plane_hardening.py new file mode 100644 index 00000000..7151ca2a --- /dev/null +++ b/scripts/smoke_control_plane_hardening.py @@ -0,0 +1,469 @@ +#!/usr/bin/env python3 +from __future__ import annotations + +import os +from pathlib import Path +import sys +import tempfile +from typing import Any +from urllib.parse import parse_qs, urlparse + +REPO_ROOT = Path(__file__).resolve().parents[1] +SRC_ROOT = REPO_ROOT / "src" +if str(SRC_ROOT) not in sys.path: + sys.path.insert(0, str(SRC_ROOT)) + +from fastapi.testclient import TestClient + +from autoresearch.agent_protocol.models import DriverResult, JobSpec, RunSummary, ValidationReport +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.api import dependencies +from autoresearch.api.main import app +from autoresearch.api.routers import gateway_telegram +from autoresearch.core.adapters import CapabilityProviderRegistry +from autoresearch.core.services.admin_auth import AdminAuthService +from autoresearch.core.services.admin_config import AdminConfigService +from autoresearch.core.services.agent_audit_trail import AgentAuditTrailService +from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from 
autoresearch.core.services.claude_agents import ClaudeAgentService +from autoresearch.core.services.openclaw_compat import OpenClawCompatService +from autoresearch.core.services.openclaw_memory import OpenClawMemoryService +from autoresearch.core.services.panel_access import PanelAccessService +from autoresearch.core.services.panel_audit import PanelAuditService +from autoresearch.shared.models import PromotionDiffStats, PromotionResult, utc_now +from autoresearch.shared.store import InMemoryRepository + + +class _StubTelegramNotifier: + def __init__(self) -> None: + self.messages: list[dict[str, Any]] = [] + + @property + def enabled(self) -> bool: + return True + + def send_message( + self, + *, + chat_id: str, + text: str, + disable_web_page_preview: bool = True, + reply_markup: dict[str, object] | None = None, + ) -> bool: + self.messages.append( + { + "chat_id": chat_id, + "text": text, + "disable_web_page_preview": disable_web_page_preview, + "reply_markup": reply_markup, + } + ) + return True + + def notify_manual_action(self, **_: object) -> bool: + return True + + +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _write_patch(repo_root: Path, *, run_id: str, changed_paths: list[str]) -> Path: + patch_dir = repo_root / ".masfactory_runtime" / "smokes" / run_id / "artifacts" + patch_dir.mkdir(parents=True, exist_ok=True) + patch_path = patch_dir / "promotion.patch" + changed_path = changed_paths[0] if changed_paths else "src/demo.py" + patch_path.write_text( + "\n".join( + [ + f"diff --git a/{changed_path} b/{changed_path}", + f"--- a/{changed_path}", + f"+++ b/{changed_path}", + "@@ -1 +1 @@", + "+CONTROL_PLANE_SMOKE = True", + "", + ] + ), + encoding="utf-8", + ) + return patch_path + + +def _planner_dispatch_runner_factory(repo_root: Path): + def _dispatch_runner(job: JobSpec) -> RunSummary: + changed_paths = list(job.policy.allowed_paths) or 
["src/autoresearch/core/services/control_plane_target.py"] + patch_path = _write_patch(repo_root, run_id=job.run_id, changed_paths=changed_paths) + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="planner fallback lane completed", + changed_paths=changed_paths, + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri=str(patch_path), + ) + + return _dispatch_runner + + +def _manager_dispatch_runner_factory(repo_root: Path): + def _dispatch_runner(job: JobSpec) -> RunSummary: + changed_paths = list(job.policy.allowed_paths) or ["src/autoresearch/api/routers/admin.py"] + patch_path = _write_patch(repo_root, run_id=job.run_id, changed_paths=changed_paths) + promotion = PromotionResult( + run_id=job.run_id, + success=True, + mode="draft_pr", + pr_url=f"https://example.invalid/pr/{job.run_id[-6:]}", + changed_files=changed_paths, + diff_stats=PromotionDiffStats(files_changed=len(changed_paths), insertions=12, deletions=2), + created_at=utc_now(), + updated_at=utc_now(), + ) + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="manager smoke dispatch completed", + changed_paths=changed_paths, + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri=str(patch_path), + promotion=promotion, + ) + + return _dispatch_runner + + +def _extract_panel_token(url: str) -> str: + parsed = urlparse(url) + return parse_qs(parsed.query)["token"][0] + + +def _build_services(repo_root: Path) -> dict[str, object]: + notifier = _StubTelegramNotifier() + openclaw_service = OpenClawCompatService(repository=InMemoryRepository()) + claude_service = ClaudeAgentService( + 
repository=InMemoryRepository(), + openclaw_service=openclaw_service, + repo_root=repo_root, + max_agents=10, + max_depth=3, + ) + planner_service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_planner_dispatch_runner_factory(repo_root), + ) + manager_service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_manager_dispatch_runner_factory(repo_root), + ) + audit_service = AgentAuditTrailService( + repo_root=repo_root, + planner_service=planner_service, + manager_service=manager_service, + agent_service=claude_service, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + mini_app_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + panel_audit = PanelAuditService(repository=InMemoryRepository()) + approval_store = ApprovalStoreService(repository=InMemoryRepository()) + openclaw_memory = OpenClawMemoryService(repository=InMemoryRepository(), openclaw_service=openclaw_service) + admin_config = AdminConfigService( + agent_repository=InMemoryRepository(), + channel_repository=InMemoryRepository(), + revision_repository=InMemoryRepository(), + ) + admin_auth = AdminAuthService(secret="admin-smoke-secret", bootstrap_key="bootstrap-smoke-key") + capability_registry = CapabilityProviderRegistry() + return { + "notifier": notifier, + "openclaw_service": openclaw_service, + "claude_service": claude_service, + "planner_service": planner_service, + "manager_service": manager_service, + "audit_service": audit_service, + "panel_access": panel_access, + "panel_audit": panel_audit, + "approval_store": approval_store, + "openclaw_memory": openclaw_memory, + "admin_config": admin_config, + "admin_auth": admin_auth, + "capability_registry": capability_registry, + } + + +def 
_install_overrides(services: dict[str, object]) -> None: + app.dependency_overrides[dependencies.get_telegram_notifier_service] = lambda: services["notifier"] + app.dependency_overrides[dependencies.get_openclaw_compat_service] = lambda: services["openclaw_service"] + app.dependency_overrides[dependencies.get_claude_agent_service] = lambda: services["claude_service"] + app.dependency_overrides[dependencies.get_autoresearch_planner_service] = lambda: services["planner_service"] + app.dependency_overrides[dependencies.get_manager_agent_service] = lambda: services["manager_service"] + app.dependency_overrides[dependencies.get_panel_access_service] = lambda: services["panel_access"] + app.dependency_overrides[dependencies.get_panel_audit_service] = lambda: services["panel_audit"] + app.dependency_overrides[dependencies.get_approval_store_service] = lambda: services["approval_store"] + app.dependency_overrides[dependencies.get_openclaw_memory_service] = lambda: services["openclaw_memory"] + app.dependency_overrides[dependencies.get_admin_config_service] = lambda: services["admin_config"] + app.dependency_overrides[dependencies.get_admin_auth_service] = lambda: services["admin_auth"] + app.dependency_overrides[dependencies.get_agent_audit_trail_service] = lambda: services["audit_service"] + app.dependency_overrides[dependencies.get_capability_provider_registry] = lambda: services["capability_registry"] + + +def _run_planner_panel_smoke(client: TestClient, services: dict[str, object], repo_root: Path) -> dict[str, Any]: + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "control_plane_target.py", + "\n".join( + [ + "def control_plane_target() -> bool:", + " # FIXME: cover the remote fallback control-plane flow", + " return True", + "", + ] + ), + ) + + create_response = client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Smoke the offline control plane fallback path.", + "telegram_uid": "10001", + "metadata": { + "runtime_mode": "night", + 
"remote_available": False, + }, + }, + ) + assert create_response.status_code == 202, create_response.text + plan_payload = create_response.json() + plan_id = plan_payload["plan_id"] + + panel_access = services["panel_access"] + assert isinstance(panel_access, PanelAccessService) + token = _extract_panel_token(panel_access.create_magic_link("10001").url) + headers = {"x-autoresearch-panel-token": token} + + state_before = client.get("/api/v1/panel/state", headers=headers) + assert state_before.status_code == 200, state_before.text + pending_plans = state_before.json()["pending_autoresearch_plans"] + assert len(pending_plans) == 1 + assert pending_plans[0]["plan_id"] == plan_id + + dispatch_response = client.post( + f"/api/v1/panel/autoresearch/plans/{plan_id}/dispatch", + headers=headers, + json={"note": "smoke dispatch", "metadata": {"source": "smoke"}}, + ) + assert dispatch_response.status_code == 200, dispatch_response.text + queued_payload = dispatch_response.json() + assert queued_payload["dispatch_status"] == "dispatching" + assert queued_payload["dispatch_run"]["status"] == "queued" + assert queued_payload["dispatch_run"]["requested_lane"] == "remote" + assert queued_payload["dispatch_run"]["lane"] == "local" + + plan_after = client.get(f"/api/v1/autoresearch/plans/{plan_id}") + assert plan_after.status_code == 200, plan_after.text + dispatched_payload = plan_after.json() + assert dispatched_payload["dispatch_status"] == "dispatched" + assert dispatched_payload["dispatch_run"]["requested_lane"] == "remote" + assert dispatched_payload["dispatch_run"]["lane"] == "local" + assert dispatched_payload["dispatch_run"]["status"] == "succeeded" + assert dispatched_payload["dispatch_run"]["fallback_reason"] + assert dispatched_payload["run_summary"]["final_status"] == "ready_for_promotion" + assert dispatched_payload["dispatch_error"] is None + + summary_relpath = dispatched_payload["dispatch_run"]["artifact_paths"]["summary"] + summary_path = repo_root / 
summary_relpath + assert summary_path.exists(), summary_path + + state_after = client.get("/api/v1/panel/state", headers=headers) + assert state_after.status_code == 200, state_after.text + assert state_after.json()["pending_autoresearch_plans"] == [] + + notifier = services["notifier"] + assert isinstance(notifier, _StubTelegramNotifier) + dispatch_messages = [ + message for message in notifier.messages if "[AutoResearch Dispatch]" in str(message["text"]) + ] + assert dispatch_messages, notifier.messages + latest_dispatch_message = str(dispatch_messages[-1]["text"]) + assert "- lane: local" in latest_dispatch_message + assert "- remote_status: succeeded" in latest_dispatch_message + assert "- final_status: ready_for_promotion" in latest_dispatch_message + + return { + "plan_id": plan_id, + "requested_lane": dispatched_payload["dispatch_run"]["requested_lane"], + "lane": dispatched_payload["dispatch_run"]["lane"], + "remote_status": dispatched_payload["dispatch_run"]["status"], + "dispatch_status": dispatched_payload["dispatch_status"], + "final_status": dispatched_payload["run_summary"]["final_status"], + "fallback_reason": dispatched_payload["dispatch_run"]["fallback_reason"], + "summary_artifact": summary_relpath, + } + + +def _run_gateway_regression_smoke(client: TestClient, services: dict[str, object]) -> dict[str, Any]: + gateway_telegram._SEEN_UPDATES.clear() + gateway_telegram._CHAT_RATE_WINDOWS.clear() + + response = client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 99001, + "message": { + "message_id": 501, + "text": "/task 为美妆品牌玛露开发 6g 遮瑕膏落地页", + "chat": {"id": 10001, "type": "private"}, + "from": {"id": 10001, "username": "control-plane-smoke"}, + }, + }, + ) + assert response.status_code == 200, response.text + payload = response.json() + assert payload["accepted"] is True + assert payload["metadata"]["source"] == "telegram_manager_task" + dispatch_id = payload["metadata"]["dispatch_id"] + + manager_service = 
services["manager_service"] + assert isinstance(manager_service, ManagerAgentService) + dispatch = manager_service.get_dispatch(dispatch_id) + assert dispatch is not None + assert dispatch.status.value == "completed" + assert dispatch.run_summary is not None + assert dispatch.run_summary.final_status == "ready_for_promotion" + + return { + "dispatch_id": dispatch_id, + "status": dispatch.status.value, + "final_status": dispatch.run_summary.final_status, + } + + +def _run_admin_audit_smoke( + client: TestClient, + services: dict[str, object], + planner_result: dict[str, Any], + gateway_result: dict[str, Any], +) -> dict[str, Any]: + admin_auth = services["admin_auth"] + assert isinstance(admin_auth, AdminAuthService) + admin_token = admin_auth.issue_token( + subject="control-plane-smoke", + roles=["owner"], + bootstrap_key="bootstrap-smoke-key", + ttl_seconds=3600, + ).token + headers = {"authorization": f"Bearer {admin_token}"} + + snapshot_response = client.get("/api/v1/admin/audit-trail?limit=20", headers=headers) + assert snapshot_response.status_code == 200, snapshot_response.text + snapshot = snapshot_response.json() + planner_item = next(item for item in snapshot["items"] if item["source"] == "autoresearch_plan") + manager_item = next(item for item in snapshot["items"] if item["source"] == "manager_task") + + assert planner_item["status"] == planner_result["dispatch_status"] + assert planner_item["final_status"] == planner_result["final_status"] + assert planner_item["metadata"]["dispatch_requested_lane"] == planner_result["requested_lane"] + assert planner_item["metadata"]["dispatch_lane"] == planner_result["lane"] + assert planner_item["metadata"]["dispatch_remote_status"] == planner_result["remote_status"] + assert planner_item["metadata"]["dispatch_fallback_reason"] == planner_result["fallback_reason"] + assert manager_item["entry_id"].startswith("manager:") + assert manager_item["final_status"] == gateway_result["final_status"] + + detail_response = 
client.get( + f"/api/v1/admin/audit-trail/{planner_item['entry_id']}", + headers=headers, + ) + assert detail_response.status_code == 200, detail_response.text + detail = detail_response.json() + raw_plan = detail["raw_record"]["autoresearch_plan"] + assert raw_plan["dispatch_status"] == planner_result["dispatch_status"] + assert raw_plan["dispatch_run"]["status"] == planner_result["remote_status"] + assert raw_plan["dispatch_run"]["lane"] == planner_result["lane"] + assert raw_plan["run_summary"]["final_status"] == planner_result["final_status"] + + return { + "planner_entry_id": planner_item["entry_id"], + "manager_entry_id": manager_item["entry_id"], + "planner_dispatch_lane": planner_item["metadata"]["dispatch_lane"], + "planner_remote_status": planner_item["metadata"]["dispatch_remote_status"], + } + + +def _print_report(results: dict[str, Any]) -> None: + print("control-plane-hardening smoke: PASS") + print( + "planner/panel: " + f"plan={results['planner']['plan_id']} " + f"requested={results['planner']['requested_lane']} " + f"lane={results['planner']['lane']} " + f"remote_status={results['planner']['remote_status']} " + f"dispatch_status={results['planner']['dispatch_status']} " + f"final_status={results['planner']['final_status']}" + ) + print( + "gateway/task: " + f"dispatch={results['gateway']['dispatch_id']} " + f"status={results['gateway']['status']} " + f"final_status={results['gateway']['final_status']}" + ) + print( + "admin/audit: " + f"planner_entry={results['admin']['planner_entry_id']} " + f"manager_entry={results['admin']['manager_entry_id']} " + f"dispatch_lane={results['admin']['planner_dispatch_lane']} " + f"remote_status={results['admin']['planner_remote_status']}" + ) + + +def main() -> int: + with tempfile.TemporaryDirectory(prefix="control-plane-smoke-") as temp_dir: + repo_root = Path(temp_dir) / "repo" + repo_root.mkdir(parents=True, exist_ok=True) + services = _build_services(repo_root) + _install_overrides(services) + 
previous_allowed = os.environ.get("AUTORESEARCH_TELEGRAM_ALLOWED_UIDS") + os.environ["AUTORESEARCH_TELEGRAM_ALLOWED_UIDS"] = "10001" + try: + with TestClient(app) as client: + planner_result = _run_planner_panel_smoke(client, services, repo_root) + gateway_result = _run_gateway_regression_smoke(client, services) + admin_result = _run_admin_audit_smoke(client, services, planner_result, gateway_result) + finally: + if previous_allowed is None: + os.environ.pop("AUTORESEARCH_TELEGRAM_ALLOWED_UIDS", None) + else: + os.environ["AUTORESEARCH_TELEGRAM_ALLOWED_UIDS"] = previous_allowed + app.dependency_overrides.clear() + + _print_report( + { + "planner": planner_result, + "gateway": gateway_result, + "admin": admin_result, + } + ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/scripts/sync_openclaw_upstream.sh b/scripts/sync_openclaw_upstream.sh new file mode 100755 index 00000000..ceef9797 --- /dev/null +++ b/scripts/sync_openclaw_upstream.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash +set -euo pipefail + +UPSTREAM_URL="${OPENCLAW_UPSTREAM_URL:-https://github.com/openclaw/openclaw.git}" +WORKSPACE_ROOT="${OPENCLAW_SYNC_WORKSPACE_ROOT:-/Volumes/AI_LAB/ai_lab/workspace}" +KEEP_CLONE="${OPENCLAW_SYNC_KEEP_CLONE:-0}" +MAX_COMMITS="${OPENCLAW_SYNC_MAX_COMMITS:-5}" + +need_cmd() { + command -v "$1" >/dev/null 2>&1 || { + echo "[error] missing command: $1" >&2 + exit 1 + } +} + +need_cmd git +need_cmd mktemp + +mkdir -p "${WORKSPACE_ROOT}" +SYNC_DIR="$(mktemp -d "${WORKSPACE_ROOT%/}/openclaw-upstream.XXXXXX")" + +cleanup() { + if [[ "${KEEP_CLONE}" == "1" ]]; then + return + fi + find "${WORKSPACE_ROOT}" -mindepth 1 -maxdepth 1 -type d -name 'openclaw-upstream.*' -exec rm -rf {} + +} + +trap cleanup EXIT + +echo "[sync] detecting upstream default branch for ${UPSTREAM_URL}" +DEFAULT_BRANCH="$(git ls-remote --symref "${UPSTREAM_URL}" HEAD | awk '/^ref:/ {sub("refs/heads/","",$2); print $2; exit}')" +DEFAULT_BRANCH="${DEFAULT_BRANCH:-main}" +echo 
"[sync] cloning ${UPSTREAM_URL} (branch=${DEFAULT_BRANCH}, depth=1)" +echo "[sync] target workspace: ${SYNC_DIR}" +git clone --progress --depth=1 --single-branch --no-tags --branch "${DEFAULT_BRANCH}" "${UPSTREAM_URL}" "${SYNC_DIR}" +git -C "${SYNC_DIR}" fetch --depth="$((MAX_COMMITS + 1))" origin "${DEFAULT_BRANCH}" + +echo +echo "[sync] latest commit" +git -C "${SYNC_DIR}" log "origin/${DEFAULT_BRANCH}" -1 --decorate=short --date=iso --stat + +echo +echo "[sync] recent commits" +git -C "${SYNC_DIR}" log "origin/${DEFAULT_BRANCH}" -"${MAX_COMMITS}" --date=short --pretty=format:'- %ad %h %s' +echo + +echo +echo "[sync] recent touched files" +for commit in $(git -C "${SYNC_DIR}" rev-list --max-count="${MAX_COMMITS}" "origin/${DEFAULT_BRANCH}"); do + git -C "${SYNC_DIR}" diff-tree --no-commit-id --name-only -r -m "${commit}" +done | sed '/^$/d' | sort -u | sed 's/^/- /' + +echo +if [[ "${KEEP_CLONE}" == "1" ]]; then + echo "[sync] clone ready at ${SYNC_DIR}" +else + echo "[sync] analysis complete; cleaning ${WORKSPACE_ROOT%/}/openclaw-upstream.*" +fi diff --git a/src/autoresearch/agent_protocol/decision.py b/src/autoresearch/agent_protocol/decision.py index 607f8a4c..c5716b95 100644 --- a/src/autoresearch/agent_protocol/decision.py +++ b/src/autoresearch/agent_protocol/decision.py @@ -8,7 +8,7 @@ def attempt_succeeded(driver_result: DriverResult, validation: ValidationReport) def derive_terminal_status(driver_result: DriverResult, validation: ValidationReport) -> str: - if driver_result.status in {"contract_error", "failed", "timed_out"}: + if driver_result.status in {"contract_error", "failed", "timed_out", "stalled_no_progress"}: return "failed" if driver_result.status == "policy_blocked": return "blocked" diff --git a/src/autoresearch/agent_protocol/models.py b/src/autoresearch/agent_protocol/models.py index 0092361e..752487fc 100644 --- a/src/autoresearch/agent_protocol/models.py +++ b/src/autoresearch/agent_protocol/models.py @@ -22,7 +22,7 @@ class 
ExecutionPolicy(StrictModel): tool_allowlist: list[str] = Field(default_factory=lambda: ["read", "write", "bash"]) - allowed_paths: list[str] = Field(default_factory=lambda: ["src/**", "tests/**", "docs/**"]) + allowed_paths: list[str] = Field(default_factory=lambda: ["src/**", "tests/**", "docs/**", "apps/**"]) forbidden_paths: list[str] = Field( default_factory=lambda: [ ".git/**", @@ -35,7 +35,7 @@ class ExecutionPolicy(StrictModel): ) max_changed_files: int = Field(default=20, ge=0, le=1000) - max_patch_lines: int = Field(default=500, ge=0, le=100000) + max_patch_lines: int = Field(default=2000, ge=0, le=100000) allow_binary_changes: bool = False cleanup_on_success: bool = True @@ -80,6 +80,9 @@ class DriverMetrics(StrictModel): commands: int = 0 prompt_tokens: int | None = None completion_tokens: int | None = None + first_progress_ms: int | None = None + first_scoped_write_ms: int | None = None + first_state_heartbeat_ms: int | None = None class DriverResult(StrictModel): @@ -94,6 +97,7 @@ class DriverResult(StrictModel): "partial", "failed", "timed_out", + "stalled_no_progress", "policy_blocked", "contract_error", ] diff --git a/src/autoresearch/agent_protocol/policy.py b/src/autoresearch/agent_protocol/policy.py index f3260936..ca58d407 100644 --- a/src/autoresearch/agent_protocol/policy.py +++ b/src/autoresearch/agent_protocol/policy.py @@ -112,7 +112,7 @@ def merge_policy(*policies: ExecutionPolicy) -> ExecutionPolicy: network="disabled", network_allowlist=[], tool_allowlist=["read", "write", "bash"], - allowed_paths=["src/**", "tests/**", "docs/**"], + allowed_paths=["src/**", "tests/**", "scripts/**", "docs/**", "apps/**"], forbidden_paths=[ ".git/**", "logs/**", @@ -122,7 +122,7 @@ def merge_policy(*policies: ExecutionPolicy) -> ExecutionPolicy: "**/*.pem", ], max_changed_files=20, - max_patch_lines=500, + max_patch_lines=2000, allow_binary_changes=False, cleanup_on_success=True, retain_workspace_on_failure=True, diff --git 
a/src/autoresearch/agents/manager_agent.py b/src/autoresearch/agents/manager_agent.py new file mode 100644 index 00000000..3f7e15b3 --- /dev/null +++ b/src/autoresearch/agents/manager_agent.py @@ -0,0 +1,907 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path +import re +import sys +from typing import Callable, Iterable + +from autoresearch.agent_protocol.models import JobSpec, RunSummary +from autoresearch.core.services.openhands_worker import OpenHandsWorkerService +from autoresearch.core.services.writer_lease import WriterLeaseService +from autoresearch.executions.runner import AgentExecutionRunner +from autoresearch.shared.housekeeper_contract import AdmissionRiskLevel, TaskAdmissionAssessmentRead +from autoresearch.shared.manager_agent_contract import ( + ManagerDispatchRead, + ManagerDispatchRequest, + ManagerExecutionPlanRead, + ManagerIntentRead, + ManagerPlanStrategy, + ManagerPlanTaskRead, + ManagerTaskStage, +) +from autoresearch.shared.models import JobStatus, utc_now +from autoresearch.shared.openhands_worker_contract import OpenHandsWorkerJobSpec +from autoresearch.shared.store import Repository, create_resource_id + + +_COMPLEXITY_MARKERS = ( + "完整", + "全套", + "系统", + "平台", + "dashboard", + "大屏", + "监控", + "实时", + "图表", + "电商", + "小程序", + "frontend", + "backend", + "前端", + "后端", + "端到端", +) + +_BUSINESS_MARKERS = ( + "落地页", + "landing page", + "landing", + "产品", + "product", + "品牌", + "brand", + "前端", + "ui", + "页面", + "page", + "预约", + "留资", + "lead capture", + "marketing", + "美妆", +) + +_INFRA_MARKERS = ( + "openhands", + "worker", + "runner", + "sandbox", + "promotion gate", + "writerlease", + "writer lease", + "patch-only", + "pipeline", + "preflight", + "docker_host", + "socket", + "执行链", + "基建", + "调度", +) + +_PRODUCT_SLUG_ALIASES = { + "玛露": "malu", +} + +_PRODUCT_SLUG_STOPWORDS = { + "landing", + "page", + "product", + "brand", + "lead", + "capture", + "frontend", + "backend", + "page", + 
"panel", + "admin", + "openclaw", + "autoresearch", +} + + +@dataclass(frozen=True, slots=True) +class _IntentRule: + intent_id: str + label: str + summary: str + keywords: tuple[str, ...] + preferred_paths: tuple[str, ...] + preferred_tests: tuple[str, ...] + goal_template: str + + +_INTENT_RULES = ( + _IntentRule( + intent_id="product_landing_page", + label="product_landing_page", + summary="Build a product or marketing landing page with bounded lead-capture support.", + keywords=( + "落地页", + "landing page", + "landing", + "页面", + "page", + "预约", + "留资", + "lead capture", + "marketing", + "品牌", + "product", + "产品", + "前端", + "ui", + "美妆", + "产品页", + ), + preferred_paths=( + "apps/**", + "tests/apps/**", + ), + preferred_tests=("tests/apps/**",), + goal_template="Build a bounded landing-page style product surface for: {prompt}", + ), + _IntentRule( + intent_id="admin_dashboard", + label="admin_dashboard", + summary="Build an admin/panel dashboard feature across API, tests, and UI.", + keywords=("dashboard", "大屏", "监控", "图表", "admin panel", "实时"), + preferred_paths=( + "src/autoresearch/api/routers/admin.py", + "src/autoresearch/api/routers/panel.py", + "src/autoresearch/core/services/**", + "panel/**", + "tests/test_panel_security.py", + "tests/test_admin_managed_skills.py", + ), + preferred_tests=("tests/test_panel_security.py", "tests/test_admin_managed_skills.py"), + goal_template="Build an admin dashboard style feature for: {prompt}", + ), + _IntentRule( + intent_id="game_prototype", + label="game_prototype", + summary="Prototype an interactive experience on the existing panel surface.", + keywords=("小游戏", "游戏", "game", "gameplay", "demo"), + preferred_paths=( + "panel/**", + "src/autoresearch/api/routers/panel.py", + "src/autoresearch/api/routers/openclaw.py", + "tests/test_panel_security.py", + ), + preferred_tests=("tests/test_panel_security.py",), + goal_template="Prototype a lightweight interactive feature on the existing panel surface for: {prompt}", + 
), + _IntentRule( + intent_id="telegram_surface", + label="telegram_surface", + summary="Extend Telegram and Mini App control surfaces.", + keywords=("telegram", "mini app", "miniapp", "通知", "提醒"), + preferred_paths=( + "src/autoresearch/api/routers/autoresearch_plans.py", + "src/autoresearch/api/routers/panel.py", + "src/autoresearch/core/services/telegram_notify.py", + "tests/test_autoresearch_planner.py", + "tests/test_panel_security.py", + ), + preferred_tests=("tests/test_autoresearch_planner.py", "tests/test_panel_security.py"), + goal_template="Improve Telegram/Mini App workflow support for: {prompt}", + ), + _IntentRule( + intent_id="approval_surface", + label="approval_surface", + summary="Improve approval, admin, or review surfaces without changing safety policy.", + keywords=("审批", "审核", "admin", "panel", "管理"), + preferred_paths=( + "src/autoresearch/api/routers/admin.py", + "src/autoresearch/api/routers/panel.py", + "tests/test_admin_managed_skills.py", + "tests/test_panel_security.py", + ), + preferred_tests=("tests/test_admin_managed_skills.py", "tests/test_panel_security.py"), + goal_template="Improve operator approval workflows for: {prompt}", + ), + _IntentRule( + intent_id="worker_execution", + label="worker_execution", + summary="Improve planner, worker, or execution infrastructure.", + keywords=( + "openhands", + "worker", + "runner", + "sandbox", + "promotion gate", + "patch-only", + "writer lease", + "preflight", + "socket", + "docker_host", + "pipeline", + ), + preferred_paths=( + "src/autoresearch/core/services/openhands_worker.py", + "src/autoresearch/core/services/autoresearch_planner.py", + "tests/test_openhands_worker.py", + "tests/test_autoresearch_planner.py", + ), + preferred_tests=("tests/test_openhands_worker.py", "tests/test_autoresearch_planner.py"), + goal_template="Improve the execution pipeline for: {prompt}", + ), + _IntentRule( + intent_id="generic_product", + label="generic_product", + summary="Ship a bounded product 
improvement through existing API/core surfaces.", + keywords=(), + preferred_paths=( + "src/autoresearch/api/**", + "src/autoresearch/core/services/**", + "panel/**", + "tests/**", + ), + preferred_tests=("tests/test_autoresearch_planner.py",), + goal_template="Design and implement a bounded product increment for: {prompt}", + ), +) + + +class ManagerAgentService: + """Translate ambiguous founder prompts into bounded patch-only execution plans.""" + + def __init__( + self, + repository: Repository[ManagerDispatchRead], + *, + repo_root: Path | None = None, + worker_service: OpenHandsWorkerService | None = None, + dispatch_runner: Callable[[JobSpec], RunSummary] | None = None, + writer_lease: WriterLeaseService | None = None, + ) -> None: + self._repository = repository + self._repo_root = (repo_root or Path(__file__).resolve().parents[3]).resolve() + self._worker_service = worker_service or OpenHandsWorkerService() + self._dispatch_runner = dispatch_runner or self._default_dispatch_runner + self._writer_lease = writer_lease or WriterLeaseService() + + def create_dispatch(self, request: ManagerDispatchRequest) -> ManagerDispatchRead: + now = utc_now() + dispatch_id = create_resource_id("mgrdispatch") + intent = self._select_intent(request.prompt) + normalized_goal = str(intent.metadata["normalized_goal"]) + execution_plan = self._build_execution_plan(dispatch_id=dispatch_id, request=request, intent=intent) + primary_task = execution_plan.tasks[0] if execution_plan.tasks else None + status = JobStatus.QUEUED if request.auto_dispatch else JobStatus.CREATED + + dispatch = ManagerDispatchRead( + dispatch_id=dispatch_id, + prompt=request.prompt, + normalized_goal=normalized_goal, + status=status, + summary=( + f"Manager routed prompt to {intent.label} and produced " + f"{len(execution_plan.tasks)} task(s)." 
+ ), + created_at=now, + updated_at=now, + selected_intent=intent, + execution_plan=execution_plan, + worker_spec=primary_task.worker_spec if primary_task is not None else None, + controlled_request=primary_task.controlled_request if primary_task is not None else None, + agent_job=primary_task.agent_job if primary_task is not None else None, + run_summary=None, + metadata={ + **request.metadata, + "pipeline_target": request.pipeline_target, + "target_base_branch": request.target_base_branch, + "plan_strategy": execution_plan.strategy.value, + }, + error=None, + ) + return self._repository.save(dispatch.dispatch_id, dispatch) + + def assess_request(self, request: ManagerDispatchRequest) -> TaskAdmissionAssessmentRead: + intent = self._select_intent(request.prompt) + execution_plan = self._build_execution_plan( + dispatch_id="mgrpreview", + request=request.model_copy(update={"auto_dispatch": False}), + intent=intent, + ) + fanout_count = max(1, len(execution_plan.tasks)) + estimated_runtime_minutes = 10 if execution_plan.strategy is ManagerPlanStrategy.SINGLE_TASK else fanout_count * 20 + if intent.intent_id in {"product_landing_page", "admin_dashboard"} and execution_plan.strategy is ManagerPlanStrategy.TASK_DAG: + estimated_runtime_minutes += 15 + + risk_level = AdmissionRiskLevel.LOW + if fanout_count > 1: + risk_level = AdmissionRiskLevel.HIGH + elif intent.intent_id in {"worker_execution", "approval_surface", "telegram_surface"}: + risk_level = AdmissionRiskLevel.MEDIUM + + return TaskAdmissionAssessmentRead( + plan_shape=execution_plan.strategy.value, + estimated_runtime_minutes=estimated_runtime_minutes, + requires_repo_write=True, + requires_network=request.pipeline_target == "draft_pr", + fanout_count=fanout_count, + risk_level=risk_level, + ) + + def list_dispatches(self) -> list[ManagerDispatchRead]: + return self._repository.list() + + def get_dispatch(self, dispatch_id: str) -> ManagerDispatchRead | None: + return self._repository.get(dispatch_id) + + 
def execute_dispatch(self, dispatch_id: str) -> ManagerDispatchRead: + dispatch = self._require_dispatch(dispatch_id) + plan = dispatch.execution_plan + if plan is None or not plan.tasks: + raise ValueError("manager dispatch does not have an execution plan") + + last_summary: RunSummary | None = None + for task in plan.tasks: + current = self._require_dispatch(dispatch_id) + current_task = self._require_plan_task(current, task.task_id) + self._ensure_dependencies_completed(current=current, task=current_task) + self._save_dispatch( + current, + task_id=task.task_id, + task_status=JobStatus.RUNNING, + ) + + if current_task.agent_job is None: + raise ValueError(f"manager task does not have a runnable job: {task.task_id}") + + try: + run_summary = self._dispatch_runner(current_task.agent_job) + except Exception as exc: + failed = self._save_dispatch( + self._require_dispatch(dispatch_id), + status=JobStatus.FAILED, + task_id=task.task_id, + task_status=JobStatus.FAILED, + task_error=str(exc), + error=str(exc), + summary=( + f"Manager plan failed on {task.task_id} " + f"after routing {len(plan.tasks)} task(s)." + ), + ) + return failed + + last_summary = run_summary + task_status = ( + JobStatus.COMPLETED + if run_summary.final_status in {"ready_for_promotion", "promoted"} + else JobStatus.FAILED + ) + if task_status is JobStatus.FAILED: + failed = self._save_dispatch( + self._require_dispatch(dispatch_id), + status=JobStatus.FAILED, + task_id=task.task_id, + task_status=task_status, + task_run_summary=run_summary, + task_error=run_summary.driver_result.error, + run_summary=run_summary, + error=run_summary.driver_result.error, + summary=( + f"Manager plan stopped on {task.task_id} " + f"with {run_summary.final_status}." 
+ ), + ) + return failed + + self._save_dispatch( + self._require_dispatch(dispatch_id), + task_id=task.task_id, + task_status=JobStatus.COMPLETED, + task_run_summary=run_summary, + run_summary=run_summary, + ) + + completed = self._save_dispatch( + self._require_dispatch(dispatch_id), + status=JobStatus.COMPLETED, + run_summary=last_summary, + summary=( + f"Manager plan completed across {len(plan.tasks)} dependent task(s)." + ), + error=None, + ) + return completed + + def _build_execution_plan( + self, + *, + dispatch_id: str, + request: ManagerDispatchRequest, + intent: ManagerIntentRead, + ) -> ManagerExecutionPlanRead: + if self._should_decompose(request.prompt, intent): + tasks = self._build_decomposed_tasks(dispatch_id=dispatch_id, request=request, intent=intent) + return ManagerExecutionPlanRead( + plan_id=f"{dispatch_id}-plan", + strategy=ManagerPlanStrategy.TASK_DAG, + summary="Manager decomposed the prompt into backend -> tests -> frontend stages.", + tasks=tasks, + ) + + single_task = self._build_plan_task( + dispatch_id=dispatch_id, + request=request, + intent=intent, + task_suffix="primary", + title=f"Implement {intent.label}", + summary=intent.summary, + stage=ManagerTaskStage.GENERIC, + depends_on=[], + allowed_paths=intent.allowed_paths, + test_paths=intent.suggested_test_paths, + ) + return ManagerExecutionPlanRead( + plan_id=f"{dispatch_id}-plan", + strategy=ManagerPlanStrategy.SINGLE_TASK, + summary="Manager kept the prompt as a single bounded task.", + tasks=[single_task], + ) + + def _build_decomposed_tasks( + self, + *, + dispatch_id: str, + request: ManagerDispatchRequest, + intent: ManagerIntentRead, + ) -> list[ManagerPlanTaskRead]: + backend_paths = self._bucket_backend_paths(intent) + test_paths = self._bucket_test_paths(intent) + frontend_paths = self._bucket_frontend_paths(intent) + + backend_task = self._build_plan_task( + dispatch_id=dispatch_id, + request=request, + intent=intent, + task_suffix="backend", + title=f"Backend 
foundation for {intent.label}", + summary="Define or update service/API surfaces needed by the feature.", + stage=ManagerTaskStage.BACKEND, + depends_on=[], + allowed_paths=backend_paths, + test_paths=test_paths, + ) + tests_task = self._build_plan_task( + dispatch_id=dispatch_id, + request=request, + intent=intent, + task_suffix="tests", + title=f"Regression coverage for {intent.label}", + summary="Lock the backend contract and edge cases with focused tests.", + stage=ManagerTaskStage.TESTS, + depends_on=[backend_task.task_id], + allowed_paths=test_paths or ["tests/**"], + test_paths=test_paths, + ) + frontend_task = self._build_plan_task( + dispatch_id=dispatch_id, + request=request, + intent=intent, + task_suffix="frontend", + title=f"Frontend integration for {intent.label}", + summary="Connect the new capability to panel/admin surfaces without expanding the safety boundary.", + stage=ManagerTaskStage.FRONTEND, + depends_on=[backend_task.task_id, tests_task.task_id], + allowed_paths=frontend_paths, + test_paths=test_paths, + ) + return [backend_task, tests_task, frontend_task] + + def _build_plan_task( + self, + *, + dispatch_id: str, + request: ManagerDispatchRequest, + intent: ManagerIntentRead, + task_suffix: str, + title: str, + summary: str, + stage: ManagerTaskStage, + depends_on: list[str], + allowed_paths: list[str], + test_paths: list[str], + ) -> ManagerPlanTaskRead: + task_id = f"{dispatch_id}-{task_suffix}" + normalized_allowed_paths = self._normalize_scope( + allowed_paths, + fallback=["src/autoresearch/api/**", "src/autoresearch/core/services/**", "tests/**"], + ) + normalized_test_paths = self._normalize_scope( + test_paths, + fallback=["tests/test_autoresearch_planner.py"], + ) + worker_spec = self._build_worker_spec( + dispatch_id=dispatch_id, + request=request, + intent=intent, + task_id=task_id, + task_title=title, + task_summary=summary, + task_stage=stage, + depends_on=depends_on, + allowed_paths=normalized_allowed_paths, + 
test_paths=normalized_test_paths, + ) + return ManagerPlanTaskRead( + task_id=task_id, + title=title, + summary=summary, + stage=stage, + depends_on=depends_on, + status=JobStatus.CREATED, + worker_spec=worker_spec, + controlled_request=self._worker_service.build_controlled_request(worker_spec), + agent_job=self._worker_service.build_agent_job_spec(worker_spec), + run_summary=None, + metadata={ + "manager_intent_id": intent.intent_id, + "manager_stage": stage.value, + "dependency_count": len(depends_on), + }, + error=None, + ) + + def _select_intent(self, prompt: str) -> ManagerIntentRead: + normalized_prompt = prompt.strip() + prompt_folded = normalized_prompt.casefold() + best_rule = _INTENT_RULES[-1] + best_keywords: list[str] = [] + best_score = -1 + for rule in _INTENT_RULES: + matched = [keyword for keyword in rule.keywords if keyword.casefold() in prompt_folded] + score = self._score_intent_rule( + rule=rule, + prompt_folded=prompt_folded, + matched_keywords=matched, + ) + if score > best_score: + best_rule = rule + best_keywords = matched + best_score = score + normalized_goal = best_rule.goal_template.format(prompt=normalized_prompt) + intent_metadata: dict[str, str] = {"normalized_goal": normalized_goal} + + if best_rule.intent_id == "product_landing_page": + surface_scope = self._build_product_surface_scope(normalized_prompt) + allowed_paths = surface_scope["allowed_paths"] + suggested_tests = surface_scope["suggested_tests"] + intent_metadata.update(surface_scope["metadata"]) + else: + allowed_paths = self._resolve_existing_paths(best_rule.preferred_paths) + suggested_tests = self._resolve_existing_paths(best_rule.preferred_tests) + if not allowed_paths: + allowed_paths = ["src/autoresearch/api/**", "src/autoresearch/core/services/**", "tests/**"] + return ManagerIntentRead( + intent_id=best_rule.intent_id, + label=best_rule.label, + summary=best_rule.summary, + matched_keywords=best_keywords, + allowed_paths=allowed_paths, + 
suggested_test_paths=suggested_tests, + metadata=intent_metadata, + ) + + def _score_intent_rule( + self, + *, + rule: _IntentRule, + prompt_folded: str, + matched_keywords: list[str], + ) -> int: + score = len(matched_keywords) * 100 + business_hits = sum(1 for marker in _BUSINESS_MARKERS if marker.casefold() in prompt_folded) + infra_hits = sum(1 for marker in _INFRA_MARKERS if marker.casefold() in prompt_folded) + + if rule.intent_id == "product_landing_page": + score += business_hits * 15 + if any(marker.casefold() in prompt_folded for marker in ("落地页", "landing page", "留资", "预约")): + score += 40 + score -= infra_hits * 10 + elif rule.intent_id == "worker_execution": + score += infra_hits * 15 + score -= business_hits * 25 + elif rule.intent_id in {"admin_dashboard", "game_prototype", "generic_product"}: + score += business_hits * 5 + + return score + + def _bucket_backend_paths(self, intent: ManagerIntentRead) -> list[str]: + if intent.intent_id == "product_landing_page": + surface_root = str(intent.metadata.get("surface_root") or "apps/brand-site") + surface_test_init = str(intent.metadata.get("surface_test_init") or "tests/apps/__init__.py") + surface_test_path = str( + intent.metadata.get("surface_test_path") or "tests/apps/test_brand_site_landing_page.py" + ) + return [f"{surface_root}/**", surface_test_init, surface_test_path] + candidates = [ + path + for path in intent.allowed_paths + if path.startswith("src/") + and not self._is_frontend_path(path) + and not path.startswith("tests/") + ] + return self._normalize_scope( + candidates, + fallback=["src/autoresearch/api/**", "src/autoresearch/core/services/**"], + ) + + def _bucket_test_paths(self, intent: ManagerIntentRead) -> list[str]: + if intent.intent_id == "product_landing_page": + surface_test_init = str(intent.metadata.get("surface_test_init") or "tests/apps/__init__.py") + surface_test_path = str(intent.metadata.get("surface_test_path") or "tests/apps/test_brand_site_landing_page.py") + return 
[surface_test_init, surface_test_path] + candidates = [ + *intent.suggested_test_paths, + *[path for path in intent.allowed_paths if path.startswith("tests/")], + ] + return self._normalize_scope(candidates, fallback=["tests/test_autoresearch_planner.py"]) + + def _bucket_frontend_paths(self, intent: ManagerIntentRead) -> list[str]: + if intent.intent_id == "product_landing_page": + surface_root = str(intent.metadata.get("surface_root") or "apps/brand-site") + return [f"{surface_root}/**"] + candidates = [path for path in intent.allowed_paths if self._is_frontend_path(path)] + return self._normalize_scope( + candidates, + fallback=["panel/**", "src/autoresearch/api/routers/panel.py"], + ) + + def _build_product_surface_scope(self, prompt: str) -> dict[str, object]: + surface_slug = self._extract_product_surface_slug(prompt) + surface_root = f"apps/{surface_slug}" + surface_test_init = "tests/apps/__init__.py" + surface_test_path = f"tests/apps/test_{surface_slug}_landing_page.py" + surface_backend_entry = f"{surface_root}/lead_capture.py" + surface_frontend_entry = f"{surface_root}/landing_page.html" + return { + "allowed_paths": [f"{surface_root}/**", surface_test_init, surface_test_path], + "suggested_tests": [surface_test_path], + "metadata": { + "surface_slug": surface_slug, + "surface_root": surface_root, + "surface_test_init": surface_test_init, + "surface_test_path": surface_test_path, + "surface_backend_entry": surface_backend_entry, + "surface_frontend_entry": surface_frontend_entry, + }, + } + + def _extract_product_surface_slug(self, prompt: str) -> str: + explicit_match = re.search(r"apps[/ ]([a-z0-9][a-z0-9_-]*)", prompt.casefold()) + if explicit_match is not None: + return explicit_match.group(1) + + for marker, slug in _PRODUCT_SLUG_ALIASES.items(): + if marker in prompt: + return slug + + ascii_candidates = re.findall(r"[A-Za-z][A-Za-z0-9_-]{1,31}", prompt) + for candidate in ascii_candidates: + folded = candidate.casefold() + if folded not in 
_PRODUCT_SLUG_STOPWORDS: + return self._slugify(folded) + + return "brand-site" + + def _resolve_existing_paths(self, patterns: tuple[str, ...]) -> list[str]: + resolved: list[str] = [] + for pattern in patterns: + if pattern.endswith("/**"): + relative_dir = pattern[:-3] + if (self._repo_root / relative_dir).exists(): + resolved.append(pattern) + continue + if (self._repo_root / pattern).exists(): + resolved.append(pattern) + return resolved + + def _normalize_scope(self, values: Iterable[str], *, fallback: list[str]) -> list[str]: + normalized = self._dedupe(values) + if normalized: + return normalized + return self._dedupe(item for item in fallback if self._scope_exists(item)) + + def _scope_exists(self, pattern: str) -> bool: + if pattern.endswith("/**"): + return (self._repo_root / pattern[:-3]).exists() + return (self._repo_root / pattern).exists() + + def _should_decompose(self, prompt: str, intent: ManagerIntentRead) -> bool: + prompt_folded = prompt.casefold() + if intent.intent_id in {"admin_dashboard", "product_landing_page"}: + return True + if any(marker.casefold() in prompt_folded for marker in _COMPLEXITY_MARKERS): + return True + return len(prompt.strip()) >= 40 and len(intent.matched_keywords) >= 2 + + def _build_worker_spec( + self, + *, + dispatch_id: str, + request: ManagerDispatchRequest, + intent: ManagerIntentRead, + task_id: str, + task_title: str, + task_summary: str, + task_stage: ManagerTaskStage, + depends_on: list[str], + allowed_paths: list[str], + test_paths: list[str], + ) -> OpenHandsWorkerJobSpec: + slug = self._slugify(f"{intent.label}-{task_stage.value}") + test_command = self._task_test_command( + intent=intent, + task_stage=task_stage, + test_paths=test_paths, + ) + dependency_text = ", ".join(depends_on) if depends_on else "none" + problem_statement = ( + "Manager agent execution plan task.\n\n" + f"Founder prompt: {request.prompt}\n" + f"Normalized goal: {intent.metadata['normalized_goal']}\n" + f"Intent: {intent.label}\n" + 
f"Task title: {task_title}\n" + f"Task summary: {task_summary}\n" + f"Task stage: {task_stage.value}\n" + f"Dependencies: {dependency_text}\n" + "Stay inside the scoped files and deliver the smallest useful patch for this stage only." + ) + if intent.intent_id == "product_landing_page": + problem_statement += ( + "\n" + f"Business surface root: {intent.metadata.get('surface_root', 'apps/brand-site')}\n" + f"Suggested backend entry: {intent.metadata.get('surface_backend_entry', 'apps/brand-site/lead_capture.py')}\n" + f"Suggested frontend entry: {intent.metadata.get('surface_frontend_entry', 'apps/brand-site/landing_page.html')}\n" + f"Suggested regression test: {intent.metadata.get('surface_test_path', 'tests/apps/test_brand_site_landing_page.py')}\n" + "This is a business-surface task. Do not route it through src/autoresearch or compatibility routers unless the scope explicitly allows it.\n" + "If the isolated apps surface does not exist yet, create it inside allowed_paths." + ) + return OpenHandsWorkerJobSpec( + job_id=task_id, + problem_statement=problem_statement, + allowed_paths=allowed_paths, + test_command=test_command, + pipeline_target=request.pipeline_target, + target_base_branch=request.target_base_branch, + max_iterations=request.max_iterations, + metadata={ + **request.metadata, + "manager_dispatch_id": dispatch_id, + "manager_prompt": request.prompt, + "manager_intent_id": intent.intent_id, + "manager_intent_label": intent.label, + "manager_goal": intent.metadata["normalized_goal"], + "manager_task_id": task_id, + "manager_task_title": task_title, + "manager_task_stage": task_stage.value, + "manager_dependencies": list(depends_on), + "approval_granted": request.approval_granted, + "branch_name": f"codex/manager/{slug}-{dispatch_id[-6:]}", + "commit_message": f"Manager Agent [{task_stage.value}]: {intent.label}", + "pr_title": f"Manager Agent [{task_stage.value}]: {intent.label}", + "pr_body": f"{task_title}\n\n{request.prompt}", + "base_branch": 
request.target_base_branch, + }, + ) + + def _task_test_command( + self, + *, + intent: ManagerIntentRead, + task_stage: ManagerTaskStage, + test_paths: list[str], + ) -> str: + if intent.intent_id == "product_landing_page": + surface_test_path = str( + intent.metadata.get("surface_test_path") or "tests/apps/test_brand_site_landing_page.py" + ) + if task_stage is ManagerTaskStage.BACKEND: + return f"pytest -q {surface_test_path}" + if task_stage in {ManagerTaskStage.TESTS, ManagerTaskStage.FRONTEND}: + return f"pytest -q {surface_test_path}" + if test_paths: + return "pytest -q " + " ".join(test_paths) + return "pytest -q tests/test_autoresearch_planner.py" + + def _save_dispatch( + self, + current: ManagerDispatchRead, + *, + status: JobStatus | None = None, + task_id: str | None = None, + task_status: JobStatus | None = None, + task_run_summary: RunSummary | None = None, + task_error: str | None = None, + run_summary: RunSummary | None = None, + summary: str | None = None, + error: str | None = None, + ) -> ManagerDispatchRead: + execution_plan = current.execution_plan + if execution_plan is not None and task_id is not None: + updated_tasks: list[ManagerPlanTaskRead] = [] + for task in execution_plan.tasks: + if task.task_id != task_id: + updated_tasks.append(task) + continue + updated_tasks.append( + task.model_copy( + update={ + "status": task_status or task.status, + "run_summary": task_run_summary if task_run_summary is not None else task.run_summary, + "error": task_error if task_error is not None else task.error, + } + ) + ) + execution_plan = execution_plan.model_copy(update={"tasks": updated_tasks}) + + updated = current.model_copy( + update={ + "status": status or current.status, + "execution_plan": execution_plan, + "run_summary": run_summary if run_summary is not None else current.run_summary, + "summary": summary if summary is not None else current.summary, + "error": error if error is not None else current.error, + "updated_at": utc_now(), + } + ) + 
return self._repository.save(updated.dispatch_id, updated) + + def _ensure_dependencies_completed(self, *, current: ManagerDispatchRead, task: ManagerPlanTaskRead) -> None: + if current.execution_plan is None: + return + dependency_statuses = { + item.task_id: item.status + for item in current.execution_plan.tasks + if item.task_id in task.depends_on + } + missing = [dep for dep in task.depends_on if dependency_statuses.get(dep) is not JobStatus.COMPLETED] + if missing: + raise ValueError(f"task {task.task_id} is blocked by incomplete dependencies: {', '.join(missing)}") + + def _require_plan_task(self, current: ManagerDispatchRead, task_id: str) -> ManagerPlanTaskRead: + if current.execution_plan is None: + raise KeyError(f"manager execution plan missing for task: {task_id}") + for task in current.execution_plan.tasks: + if task.task_id == task_id: + return task + raise KeyError(f"manager task not found: {task_id}") + + def _default_dispatch_runner(self, job: JobSpec) -> RunSummary: + runner = AgentExecutionRunner(repo_root=self._repo_root) + return runner.run_job(job) + + def _require_dispatch(self, dispatch_id: str) -> ManagerDispatchRead: + dispatch = self._repository.get(dispatch_id) + if dispatch is None: + raise KeyError(f"manager dispatch not found: {dispatch_id}") + return dispatch + + @staticmethod + def _is_frontend_path(path: str) -> bool: + normalized = path.replace("\\", "/") + return normalized.startswith("panel/") or normalized.endswith("/panel.py") + + @staticmethod + def _dedupe(values: Iterable[str]) -> list[str]: + seen: set[str] = set() + ordered: list[str] = [] + for item in values: + normalized = str(item).strip() + if not normalized or normalized in seen: + continue + seen.add(normalized) + ordered.append(normalized) + return ordered + + @staticmethod + def _slugify(value: str) -> str: + slug = re.sub(r"[^a-z0-9]+", "-", value.lower()).strip("-") + return slug or "dispatch" diff --git a/src/autoresearch/api/dependencies.py 
b/src/autoresearch/api/dependencies.py index e34626d8..29a001e8 100644 --- a/src/autoresearch/api/dependencies.py +++ b/src/autoresearch/api/dependencies.py @@ -8,11 +8,15 @@ from autoresearch.api.settings import ( get_admin_settings, get_feature_settings, + get_housekeeper_settings, + get_media_settings, get_panel_settings, get_runtime_settings, get_telegram_settings, + get_upstream_watcher_settings, ) from autoresearch.agents.opensource_searcher import GitHubSearcher +from autoresearch.agents.manager_agent import ManagerAgentService from autoresearch.core.adapters import ( AppleCalendarAdapter, CapabilityProviderRegistry, @@ -24,12 +28,17 @@ from autoresearch.core.services.admin_auth import AdminAuthService from autoresearch.core.services.admin_config import AdminConfigService from autoresearch.core.services.admin_secrets import AdminSecretCipher +from autoresearch.core.services.agent_audit_trail import AgentAuditTrailService from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService from autoresearch.core.services.claude_agents import ClaudeAgentService from autoresearch.core.services.evaluations import EvaluationService from autoresearch.core.services.executions import ExecutionService +from autoresearch.core.services.github_issue_service import GitHubIssueService +from autoresearch.core.services.housekeeper import HousekeeperService from autoresearch.core.services.mirofish_prediction import MiroFishPredictionService from autoresearch.core.services.managed_skill_registry import ManagedSkillRegistryService +from autoresearch.core.services.media_jobs import MediaJobService from autoresearch.core.services.openclaw_compat import OpenClawCompatService from autoresearch.core.services.openclaw_memory import OpenClawMemoryService from autoresearch.core.services.openclaw_skills import OpenClawSkillService @@ -39,6 +48,7 @@ from autoresearch.core.services.reports 
import ReportService from autoresearch.core.services.self_integration import SelfIntegrationService from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.core.services.upstream_watcher import UpstreamWatcherService from autoresearch.core.services.variants import VariantService from autoresearch.shared.models import ( ClaudeAgentRunRead, @@ -60,6 +70,10 @@ ReportRead, VariantRead, ) +from autoresearch.shared.autoresearch_planner_contract import AutoResearchPlanRead +from autoresearch.shared.housekeeper_contract import ExplorationRecordRead, HousekeeperStateRead, NightBudgetStateRead +from autoresearch.shared.manager_agent_contract import ManagerDispatchRead +from autoresearch.shared.media_job_contract import MediaJobEventRead, MediaJobRead from autoresearch.shared.store import SQLiteModelRepository from autoresearch.train.services.experiments import ExperimentService from autoresearch.train.services.optimizations import OptimizationService @@ -141,6 +155,81 @@ def get_execution_service() -> ExecutionService: ) +@lru_cache(maxsize=1) +def get_autoresearch_planner_service() -> AutoResearchPlannerService: + return AutoResearchPlannerService( + repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="autoresearch_plans", + model_cls=AutoResearchPlanRead, + ), + repo_root=_repo_root(), + upstream_watcher=get_upstream_watcher_service(), + ) + + +@lru_cache(maxsize=1) +def get_manager_agent_service() -> ManagerAgentService: + return ManagerAgentService( + repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="manager_agent_dispatches", + model_cls=ManagerDispatchRead, + ), + repo_root=_repo_root(), + ) + + +@lru_cache(maxsize=1) +def get_housekeeper_service() -> HousekeeperService: + settings = get_housekeeper_settings() + return HousekeeperService( + state_repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="housekeeper_state", + model_cls=HousekeeperStateRead, + ), + 
budget_repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="night_budget_state", + model_cls=NightBudgetStateRead, + ), + exploration_repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="housekeeper_exploration_records", + model_cls=ExplorationRecordRead, + ), + timezone_name=settings.timezone_name, + summary_chat_id=settings.summary_chat_id, + ) + + +@lru_cache(maxsize=1) +def get_media_job_service() -> MediaJobService: + settings = get_media_settings() + return MediaJobService( + repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="media_jobs", + model_cls=MediaJobRead, + ), + event_repository=SQLiteModelRepository( + db_path=_api_db_path(), + table_name="media_job_events", + model_cls=MediaJobEventRead, + ), + media_root=settings.media_root, + allowed_domains=settings.allowed_domains, + yt_dlp_bin=settings.yt_dlp_bin, + ffmpeg_bin=settings.ffmpeg_bin, + ) + + +@lru_cache(maxsize=1) +def get_github_issue_service() -> GitHubIssueService: + return GitHubIssueService(repo_root=_repo_root()) + + @lru_cache(maxsize=1) def get_openclaw_compat_service() -> OpenClawCompatService: return OpenClawCompatService( @@ -284,6 +373,7 @@ def get_panel_access_service() -> PanelAccessService: return PanelAccessService( secret=panel_settings.jwt_secret, base_url=panel_settings.base_url, + mini_app_url=panel_settings.mini_app_url, issuer=panel_settings.jwt_issuer, audience=panel_settings.jwt_audience, default_ttl_seconds=max(30, min(panel_settings.magic_link_ttl_seconds, 3600)), @@ -305,6 +395,16 @@ def get_panel_audit_service() -> PanelAuditService: ) +@lru_cache(maxsize=1) +def get_agent_audit_trail_service() -> AgentAuditTrailService: + return AgentAuditTrailService( + repo_root=_repo_root(), + planner_service=get_autoresearch_planner_service(), + manager_service=get_manager_agent_service(), + agent_service=get_claude_agent_service(), + ) + + @lru_cache(maxsize=1) def get_telegram_notifier_service() -> 
TelegramNotifierService: telegram_settings = get_telegram_settings() @@ -315,6 +415,16 @@ def get_telegram_notifier_service() -> TelegramNotifierService: ) +@lru_cache(maxsize=1) +def get_upstream_watcher_service() -> UpstreamWatcherService: + settings = get_upstream_watcher_settings() + return UpstreamWatcherService( + upstream_url=settings.upstream_url, + workspace_root=settings.workspace_root, + max_commits=max(1, min(settings.max_commits, 20)), + ) + + @lru_cache(maxsize=1) def get_admin_config_service() -> AdminConfigService: return AdminConfigService( @@ -368,6 +478,10 @@ def clear_dependency_caches() -> None: get_optimization_service.cache_clear() get_experiment_service.cache_clear() get_execution_service.cache_clear() + get_housekeeper_service.cache_clear() + get_manager_agent_service.cache_clear() + get_media_job_service.cache_clear() + get_github_issue_service.cache_clear() get_openclaw_compat_service.cache_clear() get_openclaw_memory_service.cache_clear() get_capability_provider_registry.cache_clear() @@ -378,7 +492,9 @@ def clear_dependency_caches() -> None: get_self_integration_service.cache_clear() get_panel_access_service.cache_clear() get_panel_audit_service.cache_clear() + get_agent_audit_trail_service.cache_clear() get_telegram_notifier_service.cache_clear() + get_upstream_watcher_service.cache_clear() get_admin_config_service.cache_clear() get_admin_secret_cipher.cache_clear() get_admin_auth_service.cache_clear() diff --git a/src/autoresearch/api/main.py b/src/autoresearch/api/main.py index 5be8cea5..411d9f59 100644 --- a/src/autoresearch/api/main.py +++ b/src/autoresearch/api/main.py @@ -125,6 +125,10 @@ def create_app() -> FastAPI: ("autoresearch.api.routers.evaluations", "router", "evaluations"), ("autoresearch.api.routers.generators", "router", "generators"), ("autoresearch.api.routers.executors", "router", "executors"), + ("autoresearch.api.routers.autoresearch_plans", "router", "autoresearch plans"), + 
("autoresearch.api.routers.manager_agent", "router", "manager agent"), + ("autoresearch.api.routers.housekeeper", "router", "housekeeper"), + ("autoresearch.api.routers.media_jobs", "router", "media jobs"), ("autoresearch.api.routers.synthesis", "router", "synthesis"), ("autoresearch.api.routers.loops", "router", "loops"), ("autoresearch.api.routers.orchestration", "router", "orchestration"), diff --git a/src/autoresearch/api/routers/admin.py b/src/autoresearch/api/routers/admin.py index 3a8d4e83..4cbf1431 100644 --- a/src/autoresearch/api/routers/admin.py +++ b/src/autoresearch/api/routers/admin.py @@ -3,7 +3,7 @@ import hashlib import hmac from typing import Any, Literal -from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse +from urllib.parse import urlparse from fastapi import APIRouter, BackgroundTasks, Depends, Header, HTTPException, Query, Request, status from fastapi.responses import HTMLResponse @@ -11,6 +11,7 @@ from autoresearch.api.dependencies import ( get_admin_auth_service, get_admin_config_service, + get_agent_audit_trail_service, get_approval_store_service, get_capability_provider_registry, get_claude_agent_service, @@ -33,6 +34,8 @@ ApprovalRequestRead, ApprovalRisk, ApprovalStatus, + AdminAgentAuditTrailDetailRead, + AdminAgentAuditTrailSnapshotRead, AdminCapabilityProviderInventoryRead, AdminCapabilitySnapshotRead, AdminCapabilityToolRead, @@ -185,25 +188,73 @@ def _select_skill_promotion_uid( def _build_panel_action_url( *, panel_access_service: PanelAccessService, + telegram_uid: str, install_id: str, approval_id: str, action_nonce: str, action_hash: str, action_issued_at: str, ) -> str: - parsed = urlparse(panel_access_service.base_url) - query = dict(parse_qsl(parsed.query, keep_blank_values=True)) - query.update( - { + return panel_access_service.build_action_url( + query_params={ "action": "managed-skill-promote", "installId": install_id, "approvalId": approval_id, "actionNonce": action_nonce, "actionHash": action_hash, 
"actionIssuedAt": action_issued_at, - } + }, + telegram_uid=telegram_uid, + prefer_mini_app=True, + ) + + +def _build_panel_action_markup( + *, + panel_access_service: PanelAccessService, + action_url: str, +) -> dict[str, object] | None: + markups = _build_panel_action_markups( + panel_access_service=panel_access_service, + action_url=action_url, ) - return urlunparse(parsed._replace(query=urlencode(query))) + return markups[0] if markups else None + + +def _build_panel_action_markups( + *, + panel_access_service: PanelAccessService, + action_url: str, +) -> list[dict[str, object] | None]: + parsed = urlparse(action_url) + if parsed.scheme != "https": + return [None] + url_markup = { + "inline_keyboard": [ + [ + { + "text": "打开 Panel 审批", + "url": action_url, + } + ] + ] + } + if panel_access_service.mini_app_url: + return [ + { + "inline_keyboard": [ + [ + { + "text": "打开 Mini App 审批", + "web_app": {"url": action_url}, + } + ] + ] + }, + url_markup, + None, + ] + return [url_markup, None] def _compute_managed_skill_action_hash( @@ -485,6 +536,7 @@ def admin_request_managed_skill_promotion( mini_app_url = _build_panel_action_url( panel_access_service=panel_access_service, + telegram_uid=telegram_uid, install_id=install.install_id, approval_id=approval.approval_id, action_nonce=action_binding["action_nonce"], @@ -503,20 +555,18 @@ def admin_request_managed_skill_promotion( message_lines.append(f"- note: {note}") if approval.expires_at is not None: message_lines.append(f"- expires_at: {approval.expires_at.isoformat()}") - notification_sent = notifier.send_message( - chat_id=telegram_uid, - text="\n".join(message_lines), - reply_markup={ - "inline_keyboard": [ - [ - { - "text": "打开 Mini App 审批", - "web_app": {"url": mini_app_url}, - } - ] - ] - }, - ) + notification_sent = False + for reply_markup in _build_panel_action_markups( + panel_access_service=panel_access_service, + action_url=mini_app_url, + ): + if notifier.send_message( + chat_id=telegram_uid, + 
text="\n".join(message_lines), + reply_markup=reply_markup, + ): + notification_sent = True + break return AdminManagedSkillPromotionRequestRead( install=install, approval=approval, @@ -898,6 +948,31 @@ def list_channel_history( return service.list_revisions(target_type="channel", target_id=channel_id, limit=limit) +@router.get("/audit-trail", response_model=AdminAgentAuditTrailSnapshotRead) +def get_agent_audit_trail( + limit: int = Query(default=20, ge=1, le=200), + status_filter: Literal["all", "success", "failed", "pending", "running", "review"] | None = Query( + default=None + ), + agent_role: Literal["all", "manager", "planner", "worker"] | None = Query(default=None), + access: AdminAccessClaims = Depends(_require_admin_read), + service=Depends(get_agent_audit_trail_service), +) -> AdminAgentAuditTrailSnapshotRead: + return service.snapshot(limit=limit, status_filter=status_filter, agent_role=agent_role) + + +@router.get("/audit-trail/{entry_id}", response_model=AdminAgentAuditTrailDetailRead) +def get_agent_audit_trail_detail( + entry_id: str, + access: AdminAccessClaims = Depends(_require_admin_read), + service=Depends(get_agent_audit_trail_service), +) -> AdminAgentAuditTrailDetailRead: + try: + return service.detail(entry_id) + except KeyError as exc: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="audit trail entry not found") from exc + + @router.get("/revisions", response_model=list[AdminConfigRevisionRead]) def list_revisions( target_type: Literal["agent", "channel"] | None = None, @@ -1264,6 +1339,65 @@ def list_revisions( +
+

Agent Audit Trail

+
+ + + + 最近 20 条执行足迹 +
+
+ + + + + + + + + + + + + + + +
TimeRoleRunStatusDurationFilesScopeChangedInspect
+
+
+

Audit Detail

+

点击单条记录查看输入、patch diff 和失败原因。

+
+ +
+
+

Input Context

+
暂无
+
+
+

Patch Diff

+
暂无
+
+
+

Failure / Traceback

+
暂无
+
+
+
+

Revision Timeline

@@ -1287,6 +1421,14 @@ def list_revisions( const approvalsBody = document.getElementById("approvals-body"); const skillsBody = document.getElementById("skills-body"); const skillDetailPre = document.getElementById("skill-detail-pre"); +const auditTrailSummary = document.getElementById("audit-trail-summary"); +const auditTrailBody = document.getElementById("audit-trail-body"); +const auditStatusFilter = document.getElementById("audit-status-filter"); +const auditRoleFilter = document.getElementById("audit-role-filter"); +const auditDetailMeta = document.getElementById("audit-detail-meta"); +const auditInputPre = document.getElementById("audit-input-pre"); +const auditPatchPre = document.getElementById("audit-patch-pre"); +const auditErrorPre = document.getElementById("audit-error-pre"); const revisionsPre = document.getElementById("revisions-pre"); const tokenFromQuery = new URLSearchParams(window.location.search).get("token") || ""; let adminToken = localStorage.getItem("autoresearch_admin_token") || tokenFromQuery; @@ -1296,6 +1438,7 @@ def list_revisions( let channelFormMode = "create"; let editingChannelId = null; let editingChannelStatus = "active"; +let selectedAuditEntryId = ""; function setAdminToken() { const input = prompt("请输入 Bearer Token(不含 Bearer 前缀)", adminToken || ""); @@ -1337,6 +1480,20 @@ def list_revisions( return new Date(value).toLocaleString("zh-CN"); } +function fmtDuration(value) { + if (value === null || value === undefined) return "-"; + if (value < 1000) return `${Math.round(value)} ms`; + return `${(value / 1000).toFixed(value >= 10000 ? 0 : 1)} s`; +} + +function compactList(values, limit = 2) { + const items = (values || []).filter(Boolean); + if (!items.length) return "-"; + const visible = items.slice(0, limit).join(", "); + const extra = items.length - limit; + return extra > 0 ? 
`${visible} +${extra}` : visible; +} + function csvToList(raw) { if (!raw) return []; return raw @@ -1568,6 +1725,99 @@ def list_revisions( `; } +function clearAuditDetail() { + selectedAuditEntryId = ""; + auditDetailMeta.textContent = "点击单条记录查看输入、patch diff 和失败原因。"; + auditInputPre.textContent = "暂无"; + auditPatchPre.textContent = "暂无"; + auditErrorPre.textContent = "暂无"; +} + +function auditTrailRow(item) { + const finalStatus = item.final_status || item.status || "-"; + const pillClass = ["failed", "blocked", "interrupted", "human_review", "stalled_no_progress"].includes(finalStatus) ? "inactive" : "active"; + return ` + + ${fmtDate(item.recorded_at)} + ${item.agent_role} / ${item.source} + ${item.run_id} + ${finalStatus} + ${fmtDuration(item.duration_ms)} + ${item.files_changed || 0} + ${compactList(item.scope_paths, 2)} + ${compactList(item.changed_paths, 2)} + + `; +} + +async function loadAuditDetail(entryId) { + selectedAuditEntryId = entryId; + auditDetailMeta.textContent = "加载详情中..."; + try { + const detail = await callApi(`/api/v1/admin/audit-trail/${encodeURIComponent(entryId)}`); + const entry = detail.entry || {}; + const detailLines = [ + `Role: ${entry.agent_role || "-"}`, + `Source: ${entry.source || "-"}`, + `Run: ${entry.run_id || "-"}`, + `Title: ${entry.title || "-"}`, + `Status: ${entry.final_status || entry.status || "-"}`, + `Raw: ${entry.status || "-"}`, + `Recorded: ${fmtDate(entry.recorded_at)}`, + `First progress: ${fmtDuration(entry.first_progress_ms)}`, + `First write: ${fmtDuration(entry.first_scoped_write_ms)}`, + `First state: ${fmtDuration(entry.first_state_heartbeat_ms)}`, + `Patch: ${entry.patch_uri || "-"}`, + `Workspace: ${entry.isolated_workspace || "-"}`, + detail.patch_truncated ? 
"Patch preview: truncated" : "Patch preview: full", + ]; + auditDetailMeta.textContent = detailLines.join(" | "); + auditInputPre.textContent = asJSON({ + prompt: detail.input_prompt || null, + job_spec: detail.job_spec || {}, + worker_spec: detail.worker_spec || {}, + controlled_request: detail.controlled_request || {}, + raw_record: detail.raw_record || {}, + }); + auditPatchPre.textContent = detail.patch_text || (entry.patch_uri ? `Patch file: ${entry.patch_uri}` : "暂无 patch"); + auditErrorPre.textContent = detail.error_reason || detail.traceback + ? [detail.error_reason || "no error reason", detail.traceback || ""].filter(Boolean).join("\\n\\n") + : "无失败细节"; + } catch (err) { + auditDetailMeta.textContent = `加载详情失败: ${err.message}`; + auditInputPre.textContent = "加载失败"; + auditPatchPre.textContent = "加载失败"; + auditErrorPre.textContent = String(err); + } +} + +async function loadAuditTrail() { + try { + const params = new URLSearchParams(); + params.set("limit", "20"); + params.set("status_filter", auditStatusFilter.value || "all"); + params.set("agent_role", auditRoleFilter.value || "all"); + const snapshot = await callApi(`/api/v1/admin/audit-trail?${params.toString()}`); + const items = snapshot.items || []; + const stats = snapshot.stats || {}; + auditTrailBody.innerHTML = items.map(auditTrailRow).join("") + || "暂无"; + auditTrailSummary.textContent = + `Recent: ${items.length} | Success: ${stats.succeeded || 0} | Failed: ${stats.failed || 0} | Running: ${stats.running || 0} | Queued: ${stats.queued || 0} | Filter: ${(auditStatusFilter.value || "all")}/${(auditRoleFilter.value || "all")}`; + if (selectedAuditEntryId && items.some((item) => item.entry_id === selectedAuditEntryId)) { + await loadAuditDetail(selectedAuditEntryId); + } else if (!selectedAuditEntryId) { + clearAuditDetail(); + } else { + clearAuditDetail(); + auditDetailMeta.textContent = "当前筛选结果不包含已选记录。"; + } + } catch (err) { + auditTrailSummary.textContent = `加载失败: ${err.message}`; + 
auditTrailBody.innerHTML = "加载失败"; + } +} + async function refreshAll() { try { const [agents, channels, capabilitySnapshot, approvals, skillSnapshot] = await Promise.all([ @@ -1585,12 +1835,16 @@ def list_revisions( approvalsBody.innerHTML = approvals.map(approvalRow).join("") || "暂无"; skillsBody.innerHTML = skillItems.map(skillRow).join("") || "暂无"; summary.textContent = `Agents: ${agents.length} | Channels: ${channels.length} | Providers: ${(capabilitySnapshot.providers || []).length} | Approvals: ${approvals.length} | Skills: ${skillItems.length} | API: /api/v1/admin`; + await loadAuditTrail(); await loadRevisions(); } catch (err) { summary.textContent = `加载失败: ${err.message}`; capabilitiesBody.innerHTML = "加载失败"; approvalsBody.innerHTML = "加载失败"; skillsBody.innerHTML = "加载失败"; + auditTrailSummary.textContent = `加载失败: ${err.message}`; + auditTrailBody.innerHTML = "加载失败"; + clearAuditDetail(); } } diff --git a/src/autoresearch/api/routers/autoresearch_plans.py b/src/autoresearch/api/routers/autoresearch_plans.py new file mode 100644 index 00000000..d3d7878e --- /dev/null +++ b/src/autoresearch/api/routers/autoresearch_plans.py @@ -0,0 +1,258 @@ +from __future__ import annotations + +from urllib.parse import urlparse + +from fastapi import APIRouter, Depends, HTTPException, status + +from autoresearch.api.dependencies import ( + get_autoresearch_planner_service, + get_housekeeper_service, + get_panel_access_service, + get_telegram_notifier_service, +) +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.core.services.panel_access import PanelAccessService +from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.autoresearch_planner_contract import ( + AutoResearchPlanRead, + AutoResearchPlannerRequest, + UpstreamWatchDecision, +) + + +router = APIRouter(prefix="/api/v1/autoresearch/plans", 
tags=["autoresearch-plans"]) + + +@router.post( + "", + response_model=AutoResearchPlanRead, + status_code=status.HTTP_202_ACCEPTED, +) +def create_autoresearch_plan( + payload: AutoResearchPlannerRequest, + service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), + housekeeper_service: HousekeeperService = Depends(get_housekeeper_service), + panel_access_service: PanelAccessService = Depends(get_panel_access_service), + notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), +) -> AutoResearchPlanRead: + prepared, _, _ = housekeeper_service.prepare_planner_request(payload, trigger_source="api") + telegram_uid = _select_plan_notification_uid(payload=prepared, panel_access_service=panel_access_service) + plan = service.create(prepared.model_copy(update={"telegram_uid": telegram_uid})) + panel_action_url = None + notification_sent = False + + if plan.selected_candidate is not None: + panel_action_url = _build_plan_panel_action_url( + panel_access_service=panel_access_service, + plan_id=plan.plan_id, + telegram_uid=telegram_uid, + ) + notification_sent = _send_plan_notification( + notifier=notifier, + panel_access_service=panel_access_service, + plan=plan, + panel_action_url=panel_action_url, + telegram_uid=telegram_uid, + ) + notification_sent = _send_upstream_watch_notification( + notifier=notifier, + plan=plan, + telegram_uid=telegram_uid, + ) or notification_sent + if plan.selected_candidate is None and not notification_sent and telegram_uid == plan.telegram_uid: + return plan + return service.update_delivery( + plan.plan_id, + telegram_uid=telegram_uid, + panel_action_url=panel_action_url, + notification_sent=notification_sent, + ) + + +@router.get("", response_model=list[AutoResearchPlanRead]) +def list_autoresearch_plans( + service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), +) -> list[AutoResearchPlanRead]: + return service.list() + + +@router.get("/{plan_id}", 
response_model=AutoResearchPlanRead) +def get_autoresearch_plan( + plan_id: str, + service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), +) -> AutoResearchPlanRead: + plan = service.get(plan_id) + if plan is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="AutoResearch plan not found") + return plan + + +def _select_plan_notification_uid( + *, + payload: AutoResearchPlannerRequest, + panel_access_service: PanelAccessService, +) -> str | None: + candidate = (payload.telegram_uid or "").strip() + allowed_uids = panel_access_service.allowed_uids + if candidate: + if allowed_uids and candidate not in allowed_uids: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail="telegram uid is not allowed for panel access", + ) + return candidate + if len(allowed_uids) == 1: + return allowed_uids[0] + return None + + +def _build_plan_panel_action_url( + *, + panel_access_service: PanelAccessService, + plan_id: str, + telegram_uid: str | None, +) -> str: + return panel_access_service.build_action_url( + query_params={"planId": plan_id}, + telegram_uid=telegram_uid, + prefer_mini_app=True, + ) + + +def _build_plan_notification_markup( + *, + panel_access_service: PanelAccessService, + panel_action_url: str, +) -> dict[str, object] | None: + markups = _build_plan_notification_markups( + panel_access_service=panel_access_service, + panel_action_url=panel_action_url, + ) + return markups[0] if markups else None + + +def _build_plan_notification_markups( + *, + panel_access_service: PanelAccessService, + panel_action_url: str, +) -> list[dict[str, object] | None]: + parsed = urlparse(panel_action_url) + if parsed.scheme != "https": + return [None] + url_markup = { + "inline_keyboard": [ + [ + { + "text": "打开 Panel 审批", + "url": panel_action_url, + } + ] + ] + } + if panel_access_service.mini_app_url: + return [ + { + "inline_keyboard": [ + [ + { + "text": "打开 Mini App 审批", + "web_app": {"url": panel_action_url}, 
+ } + ] + ] + }, + url_markup, + None, + ] + return [url_markup, None] + + +def _send_plan_notification( + *, + notifier: TelegramNotifierService, + panel_access_service: PanelAccessService, + plan: AutoResearchPlanRead, + panel_action_url: str, + telegram_uid: str | None, +) -> bool: + if not notifier.enabled or not telegram_uid: + return False + for reply_markup in _build_plan_notification_markups( + panel_access_service=panel_access_service, + panel_action_url=panel_action_url, + ): + if notifier.send_message( + chat_id=telegram_uid, + text=_build_plan_notification_text(plan, delivery=_plan_notification_delivery(reply_markup)), + reply_markup=reply_markup, + ): + return True + return False + + +def _plan_notification_delivery(reply_markup: dict[str, object] | None) -> str: + if reply_markup is None: + return "text" + try: + button = reply_markup["inline_keyboard"][0][0] + except (KeyError, IndexError, TypeError): + return "panel" + if isinstance(button, dict) and "web_app" in button: + return "mini_app" + return "panel" + + +def _build_plan_notification_text(plan: AutoResearchPlanRead, *, delivery: str) -> str: + candidate = plan.selected_candidate + if candidate is None: + return "🔍 AutoResearch 完成扫描,但暂时没有生成可执行的规划单。" + estimated_changes = max(1, len(candidate.allowed_paths)) + approval_hint = "请前往 Panel 审批执行。" + if delivery == "mini_app": + approval_hint = "请前往 Mini App 审批执行。" + return ( + f"🔍 AutoResearch 发现新优化点: {candidate.title}\n" + f"- target: {candidate.source_path}\n" + f"- category: {candidate.category}\n" + f"- estimated_changes: {estimated_changes}\n" + f"{approval_hint}" + ) + + +def _send_upstream_watch_notification( + *, + notifier: TelegramNotifierService, + plan: AutoResearchPlanRead, + telegram_uid: str | None, +) -> bool: + upstream_watch = plan.upstream_watch + if not notifier.enabled or not telegram_uid or upstream_watch is None: + return False + if upstream_watch.decision is not UpstreamWatchDecision.SKIP: + return False + return 
notifier.send_message( + chat_id=telegram_uid, + text=_build_upstream_watch_notification_text(plan), + ) + + +def _build_upstream_watch_notification_text(plan: AutoResearchPlanRead) -> str: + upstream_watch = plan.upstream_watch + if upstream_watch is None: + return "🛡️ 已完成上游巡检,当前没有需要同步的核心更新。" + focus_labels = [_format_upstream_focus_area(item) for item in upstream_watch.focus_areas if item != "repo-meta"] + focus_hint = "/".join(focus_labels[:3]) or "近期扩展" + return ( + f"🛡️ 已完成上游巡检,最新变更({focus_hint} 修复)与核心基建无关," + "已自动拦截跳过。" + ) + + +def _format_upstream_focus_area(focus_area: str) -> str: + if focus_area.startswith("extension:"): + name = focus_area.split(":", 1)[1] + if name.lower() == "line": + return "LINE" + return name.replace("-", " ").title() + return focus_area.replace("-", " ") diff --git a/src/autoresearch/api/routers/gateway_telegram.py b/src/autoresearch/api/routers/gateway_telegram.py index a94906fc..9591fdf5 100644 --- a/src/autoresearch/api/routers/gateway_telegram.py +++ b/src/autoresearch/api/routers/gateway_telegram.py @@ -13,6 +13,10 @@ get_approval_store_service, get_capability_provider_registry, get_claude_agent_service, + get_github_issue_service, + get_housekeeper_service, + get_manager_agent_service, + get_media_job_service, get_openclaw_compat_service, get_openclaw_memory_service, get_panel_access_service, @@ -23,7 +27,11 @@ from autoresearch.core.services.admin_config import AdminConfigService from autoresearch.core.services.approval_store import ApprovalStoreService from autoresearch.core.services.claude_agents import ClaudeAgentService +from autoresearch.core.services.github_issue_service import GitHubIssueRead, GitHubIssueService from autoresearch.core.services.group_access import GroupAccessManager +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.core.services.media_jobs import MediaJobService from 
autoresearch.core.services.openclaw_compat import OpenClawCompatService from autoresearch.core.services.openclaw_memory import OpenClawMemoryService from autoresearch.core.services.panel_access import PanelAccessService @@ -35,7 +43,10 @@ from autoresearch.shared.models import ( AdminChannelConfigCreateRequest, AdminChannelConfigUpdateRequest, + ActorRole, ApprovalDecisionRequest, + ApprovalRequestCreateRequest, + ApprovalRisk, ApprovalStatus, AssistantScope, ClaudeAgentCreateRequest, @@ -50,6 +61,8 @@ ChatType, TelegramWebhookAck, ) +from autoresearch.shared.manager_agent_contract import ManagerDispatchRead, ManagerDispatchRequest +from autoresearch.shared.media_job_contract import MediaJobRead, MediaJobStatus router = APIRouter(prefix="/api/v1/gateway/telegram", tags=["gateway", "telegram"]) @@ -80,6 +93,10 @@ def telegram_webhook( memory_service: OpenClawMemoryService = Depends(get_openclaw_memory_service), approval_service: ApprovalStoreService = Depends(get_approval_store_service), agent_service: ClaudeAgentService = Depends(get_claude_agent_service), + housekeeper_service: HousekeeperService = Depends(get_housekeeper_service), + manager_service: ManagerAgentService = Depends(get_manager_agent_service), + media_job_service: MediaJobService = Depends(get_media_job_service), + github_issue_service: GitHubIssueService = Depends(get_github_issue_service), capability_registry: CapabilityProviderRegistry = Depends(get_capability_provider_registry), panel_access_service: PanelAccessService = Depends(get_panel_access_service), notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), @@ -93,6 +110,10 @@ def telegram_webhook( memory_service=memory_service, approval_service=approval_service, agent_service=agent_service, + housekeeper_service=housekeeper_service, + manager_service=manager_service, + media_job_service=media_job_service, + github_issue_service=github_issue_service, capability_registry=capability_registry, 
panel_access_service=panel_access_service, notifier=notifier, @@ -113,6 +134,10 @@ def legacy_telegram_webhook( memory_service: OpenClawMemoryService = Depends(get_openclaw_memory_service), approval_service: ApprovalStoreService = Depends(get_approval_store_service), agent_service: ClaudeAgentService = Depends(get_claude_agent_service), + housekeeper_service: HousekeeperService = Depends(get_housekeeper_service), + manager_service: ManagerAgentService = Depends(get_manager_agent_service), + media_job_service: MediaJobService = Depends(get_media_job_service), + github_issue_service: GitHubIssueService = Depends(get_github_issue_service), capability_registry: CapabilityProviderRegistry = Depends(get_capability_provider_registry), panel_access_service: PanelAccessService = Depends(get_panel_access_service), notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), @@ -126,6 +151,10 @@ def legacy_telegram_webhook( memory_service=memory_service, approval_service=approval_service, agent_service=agent_service, + housekeeper_service=housekeeper_service, + manager_service=manager_service, + media_job_service=media_job_service, + github_issue_service=github_issue_service, capability_registry=capability_registry, panel_access_service=panel_access_service, notifier=notifier, @@ -142,6 +171,10 @@ def _handle_telegram_webhook( memory_service: OpenClawMemoryService, approval_service: ApprovalStoreService, agent_service: ClaudeAgentService, + housekeeper_service: HousekeeperService, + manager_service: ManagerAgentService, + media_job_service: MediaJobService, + github_issue_service: GitHubIssueService, capability_registry: CapabilityProviderRegistry, panel_access_service: PanelAccessService, notifier: TelegramNotifierService, @@ -228,6 +261,22 @@ def _handle_telegram_webhook( session_identity=session_identity, ) + if _is_task_command(text): + return _handle_task_command( + chat_id=chat_id, + update=update, + extracted=extracted, + 
background_tasks=background_tasks, + openclaw_service=openclaw_service, + approval_service=approval_service, + housekeeper_service=housekeeper_service, + manager_service=manager_service, + media_job_service=media_job_service, + github_issue_service=github_issue_service, + notifier=notifier, + session_identity=session_identity, + ) + if _is_approve_command(text): return _handle_approve_command( chat_id=chat_id, @@ -235,6 +284,7 @@ def _handle_telegram_webhook( extracted=extracted, background_tasks=background_tasks, approval_service=approval_service, + github_issue_service=github_issue_service, notifier=notifier, session_identity=session_identity, ) @@ -617,6 +667,241 @@ def _find_or_create_telegram_session( ) +def _handle_task_command( + *, + chat_id: str, + update: dict[str, Any], + extracted: dict[str, Any], + background_tasks: BackgroundTasks, + openclaw_service: OpenClawCompatService, + approval_service: ApprovalStoreService, + housekeeper_service: HousekeeperService, + manager_service: ManagerAgentService, + media_job_service: MediaJobService, + github_issue_service: GitHubIssueService, + notifier: TelegramNotifierService, + session_identity: TelegramSessionIdentityRead, +) -> TelegramWebhookAck: + task_query, approval_requested = _parse_task_command(extracted["text"]) + if not task_query: + message_text = ( + "用法:\n" + "/task <需求>\n" + "/task --approve <需求>\n" + "/task issue [补充说明]" + ) + if notifier.enabled: + background_tasks.add_task(notifier.send_message, chat_id=chat_id, text=message_text) + return TelegramWebhookAck( + accepted=False, + update_id=_safe_int(update.get("update_id")), + chat_id=chat_id, + reason="missing task payload", + metadata={"source": "telegram_manager_task", "scope": session_identity.scope.value}, + ) + + session = _find_or_create_telegram_session( + openclaw_service=openclaw_service, + chat_id=chat_id, + session_identity=session_identity, + ) + _append_user_event( + openclaw_service=openclaw_service, + session=session, + 
text=extracted["text"], + update=update, + extracted=extracted, + session_identity=session_identity, + ) + + issue: GitHubIssueRead | None = None + manager_prompt = task_query + task_source = "prompt" + operator_note = "" + issue_reference: str | None = None + issue_url: str | None = None + approval_granted = approval_requested and _can_telegram_task_self_approve( + session_identity=session_identity + ) + + if approval_requested and not approval_granted and notifier.enabled: + background_tasks.add_task( + notifier.send_message, + chat_id=chat_id, + text="`--approve` 仅对 owner/partner 生效;本次仍按常规审批流执行。", + ) + + media_request = media_job_service.parse_telegram_task(task_query) + if media_request is not None: + media_request = media_request.model_copy( + update={ + "metadata": { + **media_request.metadata, + "source": "telegram_media_task", + "telegram_chat_id": chat_id, + "telegram_user_id": session_identity.actor.user_id, + "telegram_session_id": session.session_id, + "telegram_scope": session_identity.scope.value, + "raw_task_query": task_query, + } + } + ) + media_job = media_job_service.create(media_request) + openclaw_service.append_event( + session_id=session.session_id, + request=OpenClawSessionEventAppendRequest( + role="status", + content=f"media job queued: {media_job.job_id}", + metadata={ + "source": "telegram_media_task", + "job_id": media_job.job_id, + "mode": media_job.mode.value, + "target_bucket": media_job.target_bucket.value, + }, + ), + ) + openclaw_service.set_status( + session_id=session.session_id, + status=JobStatus.QUEUED, + metadata_updates={"latest_media_job_id": media_job.job_id}, + ) + if notifier.enabled: + background_tasks.add_task( + notifier.send_message, + chat_id=chat_id, + text=_build_media_job_queued_message(media_job), + ) + background_tasks.add_task( + _execute_media_job_and_notify, + media_job_service=media_job_service, + housekeeper_service=housekeeper_service, + openclaw_service=openclaw_service, + notifier=notifier, + 
chat_id=chat_id, + session_id=session.session_id, + job_id=media_job.job_id, + ) + return TelegramWebhookAck( + accepted=True, + update_id=_safe_int(update.get("update_id")), + chat_id=chat_id, + session_id=session.session_id, + metadata={ + "source": "telegram_media_task", + "job_id": media_job.job_id, + "mode": media_job.mode.value, + "target_bucket": media_job.target_bucket.value, + "scope": session_identity.scope.value, + }, + ) + + if task_query.casefold().startswith("issue "): + issue_reference, operator_note = _extract_issue_task_parts(task_query) + try: + issue = github_issue_service.fetch_issue(issue_reference) + except Exception as exc: + return TelegramWebhookAck( + accepted=False, + update_id=_safe_int(update.get("update_id")), + chat_id=chat_id, + session_id=session.session_id, + reason=str(exc), + metadata={ + "source": "telegram_manager_task", + "task_source": "issue", + "scope": session_identity.scope.value, + }, + ) + manager_prompt = github_issue_service.build_manager_prompt(issue, operator_note=operator_note or None) + task_source = "issue" + issue_reference = issue.reference.display + issue_url = issue.url + + dispatch_request, _, _ = housekeeper_service.prepare_manager_request( + ManagerDispatchRequest( + prompt=manager_prompt, + approval_granted=approval_granted, + auto_dispatch=True, + metadata={ + "source": "telegram_manager_task", + "task_source": task_source, + "telegram_chat_id": chat_id, + "telegram_user_id": session_identity.actor.user_id, + "telegram_session_id": session.session_id, + "telegram_scope": session_identity.scope.value, + "raw_task_query": task_query, + "operator_note": operator_note, + "github_issue_reference": issue_reference, + "github_issue_url": issue_url, + "github_issue_title": issue.title if issue is not None else None, + "approval_requested": approval_requested, + "approval_granted": approval_granted, + "approval_source": "telegram_task_flag" if approval_granted else None, + }, + ), + 
manager_service=manager_service, + trigger_source="telegram", + ) + dispatch = manager_service.create_dispatch(dispatch_request) + openclaw_service.append_event( + session_id=session.session_id, + request=OpenClawSessionEventAppendRequest( + role="status", + content=f"manager dispatch queued: {dispatch.dispatch_id}", + metadata={ + "source": "telegram_manager_task", + "dispatch_id": dispatch.dispatch_id, + "task_source": task_source, + "issue_reference": issue_reference, + }, + ), + ) + openclaw_service.set_status( + session_id=session.session_id, + status=JobStatus.QUEUED if dispatch_request.auto_dispatch else JobStatus.CREATED, + metadata_updates={"latest_manager_dispatch_id": dispatch.dispatch_id}, + ) + + if notifier.enabled: + background_tasks.add_task( + notifier.send_message, + chat_id=chat_id, + text=_build_manager_dispatch_queued_message(dispatch, issue_reference=issue_reference), + ) + + if dispatch_request.auto_dispatch: + background_tasks.add_task( + _execute_manager_dispatch_and_notify, + manager_service=manager_service, + approval_service=approval_service, + openclaw_service=openclaw_service, + notifier=notifier, + chat_id=chat_id, + session_id=session.session_id, + approval_uid=session_identity.actor.user_id or chat_id, + assistant_scope=session_identity.scope, + dispatch_id=dispatch.dispatch_id, + issue_reference=issue_reference, + issue_url=issue_url, + issue_title=issue.title if issue is not None else None, + ) + + return TelegramWebhookAck( + accepted=True, + update_id=_safe_int(update.get("update_id")), + chat_id=chat_id, + session_id=session.session_id, + metadata={ + "source": "telegram_manager_task", + "dispatch_id": dispatch.dispatch_id, + "task_source": task_source, + "issue_reference": issue_reference, + "issue_url": issue_url, + "scope": session_identity.scope.value, + }, + ) + + def _handle_memory_command( *, chat_id: str, @@ -979,6 +1264,7 @@ def _handle_approve_command( extracted: dict[str, Any], background_tasks: BackgroundTasks, 
approval_service: ApprovalStoreService, + github_issue_service: GitHubIssueService, notifier: TelegramNotifierService, session_identity: TelegramSessionIdentityRead, ) -> TelegramWebhookAck: @@ -1009,9 +1295,35 @@ def _handle_approve_command( message_text = _build_approval_decision_message(approval) message_source = "telegram_approve_decision" approval_query = approval.approval_id + if decision == "approved" and approval.metadata.get("action_type") == "github_issue_comment": + comment_output = _post_github_issue_comment_for_approval( + approval=approval, + approval_service=approval_service, + github_issue_service=github_issue_service, + chat_id=chat_id, + scope=session_identity.scope.value, + ) + message_text = "\n\n".join( + [ + message_text, + _build_github_issue_comment_posted_message( + approval_id=approval.approval_id, + issue_reference=str(approval.metadata.get("issue_reference") or "unknown"), + output=comment_output or None, + ), + ] + ).strip() except ValueError as exc: message_text = str(exc) message_source = "telegram_approve_decision" + except RuntimeError as exc: + message_text = "\n\n".join( + [ + _build_approval_decision_message(approval), + f"[GitHub Reply Failed]\n{str(exc).strip()}", + ] + ).strip() + message_source = "telegram_approve_decision" elif approval_id: approval = approval_service.get_request(approval_id) if approval is None or approval.telegram_uid != approval_uid: @@ -1159,6 +1471,345 @@ def _build_agent_result_message(run: ClaudeAgentRunRead) -> str: return text +def _execute_manager_dispatch_and_notify( + *, + manager_service: ManagerAgentService, + approval_service: ApprovalStoreService, + openclaw_service: OpenClawCompatService, + notifier: TelegramNotifierService, + chat_id: str, + session_id: str, + approval_uid: str, + assistant_scope: AssistantScope, + dispatch_id: str, + issue_reference: str | None, + issue_url: str | None, + issue_title: str | None, +) -> None: + dispatch: ManagerDispatchRead | None = None + try: + dispatch = 
manager_service.execute_dispatch(dispatch_id) + except Exception as exc: + dispatch = manager_service.get_dispatch(dispatch_id) + error_text = str(exc).strip() or "manager dispatch failed" + if dispatch is None: + if notifier.enabled: + notifier.send_message( + chat_id=chat_id, + text=_truncate_telegram_text( + "\n".join( + [ + "[Manager Task]", + f"dispatch: {dispatch_id}", + "status: failed", + "", + error_text, + ] + ) + ), + ) + return + dispatch = dispatch.model_copy(update={"error": dispatch.error or error_text}) + + final_status = dispatch.status + openclaw_service.append_event( + session_id=session_id, + request=OpenClawSessionEventAppendRequest( + role="status", + content=f"manager dispatch finished: {dispatch.dispatch_id}", + metadata={ + "source": "telegram_manager_task", + "dispatch_id": dispatch.dispatch_id, + "status": final_status.value, + "issue_reference": issue_reference, + }, + ), + ) + openclaw_service.set_status( + session_id=session_id, + status=JobStatus.COMPLETED if final_status == JobStatus.COMPLETED else JobStatus.FAILED, + metadata_updates={"latest_manager_dispatch_status": final_status.value}, + ) + + if notifier.enabled: + notifier.send_message( + chat_id=chat_id, + text=_build_manager_dispatch_result_message( + dispatch, + issue_reference=issue_reference, + issue_url=issue_url, + ), + ) + + if not issue_reference: + return + + approval = approval_service.create_request( + ApprovalRequestCreateRequest( + title=f"Reply to GitHub issue {issue_reference}", + summary=f"Review and post the automated execution update for {issue_reference}.", + risk=ApprovalRisk.EXTERNAL, + source="github_issue_task", + telegram_uid=approval_uid, + session_id=session_id, + assistant_scope=assistant_scope, + metadata={ + "action_type": "github_issue_comment", + "issue_reference": issue_reference, + "issue_url": issue_url, + "issue_title": issue_title, + "dispatch_id": dispatch.dispatch_id, + "comment_body": _build_github_issue_comment_body( + dispatch, + 
issue_reference=issue_reference, + issue_url=issue_url, + ), + }, + ) + ) + openclaw_service.append_event( + session_id=session_id, + request=OpenClawSessionEventAppendRequest( + role="status", + content=f"github issue reply approval queued: {approval.approval_id}", + metadata={ + "source": "telegram_manager_task", + "approval_id": approval.approval_id, + "dispatch_id": dispatch.dispatch_id, + "issue_reference": issue_reference, + }, + ), + ) + if notifier.enabled: + notifier.send_message( + chat_id=chat_id, + text=_build_github_issue_reply_approval_message( + approval_id=approval.approval_id, + issue_reference=issue_reference, + issue_url=issue_url, + ), + ) + + +def _execute_media_job_and_notify( + *, + media_job_service: MediaJobService, + housekeeper_service: HousekeeperService, + openclaw_service: OpenClawCompatService, + notifier: TelegramNotifierService, + chat_id: str, + session_id: str, + job_id: str, +) -> None: + job = media_job_service.execute(job_id) + housekeeper_service.record_media_job_outcome( + job=job, + notifier=notifier, + media_jobs=media_job_service.list(), + ) + openclaw_service.append_event( + session_id=session_id, + request=OpenClawSessionEventAppendRequest( + role="status", + content=f"media job finished: {job.job_id}", + metadata={ + "source": "telegram_media_task", + "job_id": job.job_id, + "status": job.status.value, + }, + ), + ) + openclaw_service.set_status( + session_id=session_id, + status=JobStatus.COMPLETED if job.status is MediaJobStatus.COMPLETED else JobStatus.FAILED, + metadata_updates={"latest_media_job_status": job.status.value}, + ) + if notifier.enabled: + notifier.send_message(chat_id=chat_id, text=_build_media_job_result_message(job)) + + +def _build_manager_dispatch_queued_message( + dispatch: ManagerDispatchRead, + *, + issue_reference: str | None, +) -> str: + task_count = len(dispatch.execution_plan.tasks) if dispatch.execution_plan is not None else 0 + lines = [ + "[Manager Task]", + f"dispatch: 
{dispatch.dispatch_id}", + f"strategy: {dispatch.execution_plan.strategy.value if dispatch.execution_plan is not None else 'single_task'}", + f"tasks: {task_count}", + ] + if issue_reference: + lines.append(f"issue: {issue_reference}") + deferred_reason = str(dispatch.metadata.get("deferred_reason") or "").strip() + if deferred_reason: + lines.append(f"deferred: {deferred_reason}") + lines.append("已接收,当前不自动执行,等待夜间窗口或人工放行。") + else: + lines.append("已接收,开始拆解并执行。") + return _truncate_telegram_text("\n".join(lines)) + + +def _build_media_job_queued_message(job: MediaJobRead) -> str: + return _truncate_telegram_text( + "\n".join( + [ + "[Media Job]", + f"job: {job.job_id}", + f"status: {job.status.value}", + f"target_bucket: {job.target_bucket.value}", + f"mode: {job.mode.value}", + ] + ) + ) + + +def _build_media_job_result_message(job: MediaJobRead) -> str: + lines = [ + "[Media Job]", + f"job: {job.job_id}", + f"status: {job.status.value}", + f"target_bucket: {job.target_bucket.value}", + f"mode: {job.mode.value}", + ] + if job.output_files: + lines.extend(["", "output_files:"]) + lines.extend(f"- {path}" for path in job.output_files[:8]) + if job.error: + lines.extend(["", "error:", job.error.strip()]) + return _truncate_telegram_text("\n".join(lines)) + + +def _build_manager_dispatch_result_message( + dispatch: ManagerDispatchRead, + *, + issue_reference: str | None, + issue_url: str | None, +) -> str: + task_count = len(dispatch.execution_plan.tasks) if dispatch.execution_plan is not None else 0 + completed_count = ( + sum(1 for item in dispatch.execution_plan.tasks if item.status == JobStatus.COMPLETED) + if dispatch.execution_plan is not None + else 0 + ) + lines = [ + "[Manager Task]", + f"dispatch: {dispatch.dispatch_id}", + f"status: {dispatch.status.value}", + f"tasks: {completed_count}/{task_count}", + ] + if issue_reference: + lines.append(f"issue: {issue_reference}") + if issue_url: + lines.append(f"url: {issue_url}") + if dispatch.summary: + 
lines.extend(["", dispatch.summary]) + + promotion = dispatch.run_summary.promotion if dispatch.run_summary is not None else None + if promotion is not None and promotion.pr_url: + lines.append(f"draft_pr: {promotion.pr_url}") + elif dispatch.run_summary is not None and dispatch.run_summary.promotion_patch_uri: + lines.append(f"patch: {dispatch.run_summary.promotion_patch_uri}") + + error_text = ( + dispatch.error + or ( + dispatch.run_summary.driver_result.error + if dispatch.run_summary is not None and dispatch.run_summary.driver_result.error + else None + ) + ) + if error_text: + lines.extend(["", "error:", error_text.strip()]) + return _truncate_telegram_text("\n".join(lines)) + + +def _build_github_issue_comment_body( + dispatch: ManagerDispatchRead, + *, + issue_reference: str, + issue_url: str | None, +) -> str: + lines = [ + "Automated progress update from the local autonomous agent stack.", + "", + f"- Issue: {issue_reference}", + f"- Dispatch: {dispatch.dispatch_id}", + f"- Status: {dispatch.status.value}", + ] + if issue_url: + lines.append(f"- Issue URL: {issue_url}") + if dispatch.summary: + lines.append(f"- Summary: {dispatch.summary}") + promotion = dispatch.run_summary.promotion if dispatch.run_summary is not None else None + if promotion is not None and promotion.pr_url: + lines.append(f"- Draft PR: {promotion.pr_url}") + error_text = ( + dispatch.error + or ( + dispatch.run_summary.driver_result.error + if dispatch.run_summary is not None and dispatch.run_summary.driver_result.error + else None + ) + ) + if error_text: + lines.append(f"- Error: {error_text.strip()}") + lines.extend( + [ + "", + "This update was prepared automatically from Telegram `/task issue` and still expects human review before merge.", + ] + ) + return "\n".join(lines).strip() + + +def _build_github_issue_reply_approval_message( + *, + approval_id: str, + issue_reference: str, + issue_url: str | None, +) -> str: + lines = [ + "[GitHub Reply Pending]", + f"approval: 
{approval_id}", + f"issue: {issue_reference}", + ] + if issue_url: + lines.append(f"url: {issue_url}") + lines.extend( + [ + "", + f"/approve {approval_id} approve 发布执行结果到 GitHub issue", + f"/approve {approval_id} reject 保留结果,仅在 Telegram 查看", + ] + ) + return _truncate_telegram_text("\n".join(lines)) + + +def _build_github_issue_comment_posted_message( + *, + approval_id: str, + issue_reference: str, + output: str | None, +) -> str: + lines = [ + "[GitHub Reply Posted]", + f"approval: {approval_id}", + f"issue: {issue_reference}", + ] + if output: + lines.extend(["", output.strip()]) + return _truncate_telegram_text("\n".join(lines)) + + +def _truncate_telegram_text(text: str) -> str: + normalized = text.strip() + if len(normalized) > 3900: + return normalized[:3900] + "\n...[truncated]" + return normalized + + def _handle_status_query( *, chat_id: str, @@ -1288,7 +1939,12 @@ def _is_status_query(text: str) -> bool: def _is_help_command(text: str) -> bool: normalized = text.strip().lower() - return normalized in {"/help", "help", "帮助"} + return normalized in {"/help", "/start", "help", "帮助"} + + +def _is_task_command(text: str) -> bool: + normalized = text.strip().lower() + return normalized == "/task" or normalized.startswith("/task ") def _is_approve_command(text: str) -> bool: @@ -1366,6 +2022,43 @@ def _extract_approve_query(text: str) -> str: return "" +def _extract_task_query(text: str) -> str: + return _parse_task_command(text)[0] + + +def _parse_task_command(text: str) -> tuple[str, bool]: + normalized = text.strip() + lowered = normalized.lower() + if lowered == "/task": + return "", False + if lowered.startswith("/task "): + payload = normalized.split(" ", 1)[1].strip() + approval_requested = False + if payload.startswith("--approve"): + approval_requested = True + payload = payload[len("--approve") :].strip() + return payload, approval_requested + return "", False + + +def _can_telegram_task_self_approve( + *, + session_identity: 
TelegramSessionIdentityRead, +) -> bool: + return session_identity.actor.role in {ActorRole.OWNER, ActorRole.PARTNER} + + +def _extract_issue_task_parts(task_query: str) -> tuple[str, str]: + normalized = task_query.strip() + if not normalized.casefold().startswith("issue "): + raise ValueError("issue task must start with `issue `") + remainder = normalized[6:].strip() + if not remainder: + raise ValueError("missing GitHub issue reference") + issue_reference, _, operator_note = remainder.partition(" ") + return issue_reference.strip(), operator_note.strip() + + def _parse_approve_query(query: str) -> tuple[str, str | None, str]: normalized = query.strip() if not normalized: @@ -1570,7 +2263,11 @@ def _build_help_message(*, session_identity: TelegramSessionIdentityRead) -> str chat_type = session_identity.chat_context.chat_type lines = [ "[Telegram Commands]", + "/start 查看欢迎信息和命令列表", "/status 查看当前会话、任务和能力摘要", + "/task <需求> 走 Manager Agent DAG 执行任务", + "/task --approve <需求> owner/partner 直通 Draft PR 审批上下文", + "/task issue [补充说明] 读取 GitHub issue 后派发修复", "/approve 查看待审批列表", "/approve 查看待审批详情", "/approve approve [备注] 批准待审批事项", @@ -1595,6 +2292,32 @@ def _build_help_message(*, session_identity: TelegramSessionIdentityRead) -> str return "\n".join(lines) +def _post_github_issue_comment_for_approval( + *, + approval: Any, + approval_service: ApprovalStoreService, + github_issue_service: GitHubIssueService, + chat_id: str, + scope: str, +) -> str: + issue_reference = str(approval.metadata.get("issue_reference") or "").strip() + comment_body = str(approval.metadata.get("comment_body") or "").strip() + if not issue_reference or not comment_body: + raise RuntimeError("approval is missing GitHub issue comment payload") + output = github_issue_service.post_comment(issue_reference, comment_body) + approval_service.update_request_metadata( + approval.approval_id, + { + "comment_posted": True, + "comment_posted_at": _utc_now(), + "comment_post_result": output, + 
"resolved_via_chat_id": chat_id, + "resolved_scope": scope, + }, + ) + return output + + def _build_approval_list_message(approvals: list[Any]) -> str: if not approvals: return "当前没有待审批事项。" diff --git a/src/autoresearch/api/routers/housekeeper.py b/src/autoresearch/api/routers/housekeeper.py new file mode 100644 index 00000000..b3973567 --- /dev/null +++ b/src/autoresearch/api/routers/housekeeper.py @@ -0,0 +1,76 @@ +from __future__ import annotations + +from fastapi import APIRouter, Depends, status + +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.api.dependencies import ( + get_approval_store_service, + get_autoresearch_planner_service, + get_housekeeper_service, + get_manager_agent_service, + get_media_job_service, + get_telegram_notifier_service, +) +from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.core.services.media_jobs import MediaJobService +from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.housekeeper_contract import ( + HousekeeperModeUpdateRequest, + HousekeeperMorningSummaryRead, + HousekeeperStateRead, + HousekeeperTickRead, +) + + +router = APIRouter(prefix="/api/v1/housekeeper", tags=["housekeeper"]) + + +@router.get("/state", response_model=HousekeeperStateRead) +def get_housekeeper_state( + service: HousekeeperService = Depends(get_housekeeper_service), +) -> HousekeeperStateRead: + return service.get_state() + + +@router.post("/mode", response_model=HousekeeperStateRead) +def update_housekeeper_mode( + payload: HousekeeperModeUpdateRequest, + service: HousekeeperService = Depends(get_housekeeper_service), +) -> HousekeeperStateRead: + return service.update_mode(payload) + + +@router.post("/night-explore/tick", response_model=HousekeeperTickRead, 
status_code=status.HTTP_200_OK) +def execute_night_explore_tick( + service: HousekeeperService = Depends(get_housekeeper_service), + manager_service: ManagerAgentService = Depends(get_manager_agent_service), + planner_service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), + media_service: MediaJobService = Depends(get_media_job_service), + notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), +) -> HousekeeperTickRead: + return service.execute_night_explore_tick( + manager_service=manager_service, + planner_service=planner_service, + notifier=notifier, + media_jobs=media_service.list(), + ) + + +@router.post("/summaries/morning", response_model=HousekeeperMorningSummaryRead, status_code=status.HTTP_200_OK) +def generate_morning_summary( + service: HousekeeperService = Depends(get_housekeeper_service), + manager_service: ManagerAgentService = Depends(get_manager_agent_service), + planner_service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), + approval_service: ApprovalStoreService = Depends(get_approval_store_service), + media_service: MediaJobService = Depends(get_media_job_service), + notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), +) -> HousekeeperMorningSummaryRead: + return service.create_morning_summary( + manager_service=manager_service, + planner_service=planner_service, + approval_service=approval_service, + notifier=notifier, + media_jobs=media_service.list(), + ) diff --git a/src/autoresearch/api/routers/manager_agent.py b/src/autoresearch/api/routers/manager_agent.py new file mode 100644 index 00000000..e1f0a937 --- /dev/null +++ b/src/autoresearch/api/routers/manager_agent.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status + +from autoresearch.api.dependencies import get_housekeeper_service, get_manager_agent_service +from autoresearch.agents.manager_agent import 
ManagerAgentService +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.shared.manager_agent_contract import ManagerDispatchRead, ManagerDispatchRequest + + +router = APIRouter(prefix="/api/v1/agents/manager", tags=["manager-agent"]) + + +@router.post( + "/dispatch", + response_model=ManagerDispatchRead, + status_code=status.HTTP_202_ACCEPTED, +) +def dispatch_manager_agent( + payload: ManagerDispatchRequest, + background_tasks: BackgroundTasks, + service: ManagerAgentService = Depends(get_manager_agent_service), + housekeeper_service: HousekeeperService = Depends(get_housekeeper_service), +) -> ManagerDispatchRead: + prepared, _, _ = housekeeper_service.prepare_manager_request( + payload, + manager_service=service, + trigger_source="api", + ) + dispatch = service.create_dispatch(prepared) + if prepared.auto_dispatch: + background_tasks.add_task(service.execute_dispatch, dispatch.dispatch_id) + return dispatch + + +@router.get("/dispatches", response_model=list[ManagerDispatchRead]) +def list_manager_dispatches( + service: ManagerAgentService = Depends(get_manager_agent_service), +) -> list[ManagerDispatchRead]: + return service.list_dispatches() + + +@router.get("/dispatches/{dispatch_id}", response_model=ManagerDispatchRead) +def get_manager_dispatch( + dispatch_id: str, + service: ManagerAgentService = Depends(get_manager_agent_service), +) -> ManagerDispatchRead: + dispatch = service.get_dispatch(dispatch_id) + if dispatch is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Manager dispatch not found") + return dispatch diff --git a/src/autoresearch/api/routers/media_jobs.py b/src/autoresearch/api/routers/media_jobs.py new file mode 100644 index 00000000..6d469b92 --- /dev/null +++ b/src/autoresearch/api/routers/media_jobs.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status + +from autoresearch.api.dependencies 
import get_housekeeper_service, get_media_job_service, get_telegram_notifier_service +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.core.services.media_jobs import MediaJobService +from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.media_job_contract import MediaJobRead, MediaJobRequest + + +router = APIRouter(prefix="/api/v1/media/jobs", tags=["media-jobs"]) + + +@router.post("", response_model=MediaJobRead, status_code=status.HTTP_202_ACCEPTED) +def create_media_job( + payload: MediaJobRequest, + background_tasks: BackgroundTasks, + service: MediaJobService = Depends(get_media_job_service), + housekeeper_service: HousekeeperService = Depends(get_housekeeper_service), + notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), +) -> MediaJobRead: + job = service.create(payload) + background_tasks.add_task( + _execute_media_job, + service=service, + housekeeper_service=housekeeper_service, + notifier=notifier, + job_id=job.job_id, + ) + return job + + +@router.get("", response_model=list[MediaJobRead]) +def list_media_jobs( + service: MediaJobService = Depends(get_media_job_service), +) -> list[MediaJobRead]: + return service.list() + + +@router.get("/{job_id}", response_model=MediaJobRead) +def get_media_job( + job_id: str, + service: MediaJobService = Depends(get_media_job_service), +) -> MediaJobRead: + job = service.get(job_id) + if job is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="Media job not found") + return job + + +def _execute_media_job( + *, + service: MediaJobService, + housekeeper_service: HousekeeperService, + notifier: TelegramNotifierService, + job_id: str, +) -> None: + completed = service.execute(job_id) + housekeeper_service.record_media_job_outcome( + job=completed, + notifier=notifier, + media_jobs=service.list(), + ) diff --git a/src/autoresearch/api/routers/panel.py 
b/src/autoresearch/api/routers/panel.py index b62acfe0..30302fc5 100644 --- a/src/autoresearch/api/routers/panel.py +++ b/src/autoresearch/api/routers/panel.py @@ -7,6 +7,7 @@ from autoresearch.api.dependencies import ( get_approval_store_service, + get_autoresearch_planner_service, get_capability_provider_registry, get_claude_agent_service, get_openclaw_compat_service, @@ -16,11 +17,13 @@ ) from autoresearch.core.adapters import CapabilityProviderRegistry from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService from autoresearch.core.services.claude_agents import ClaudeAgentService from autoresearch.core.services.openclaw_compat import OpenClawCompatService from autoresearch.core.services.panel_access import PanelAccessService from autoresearch.core.services.panel_audit import PanelAuditService from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.autoresearch_planner_contract import AutoResearchPlanRead from autoresearch.shared.models import ( ApprovalDecisionRequest, ApprovalNoteRequest, @@ -126,6 +129,7 @@ def get_panel_state( agent_service: ClaudeAgentService = Depends(get_claude_agent_service), audit_service: PanelAuditService = Depends(get_panel_audit_service), approval_service: ApprovalStoreService = Depends(get_approval_store_service), + planner_service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), capability_registry: CapabilityProviderRegistry = Depends(get_capability_provider_registry), ) -> PanelStateRead: sessions = _sessions_for_uid(openclaw_service=openclaw_service, telegram_uid=access.telegram_uid) @@ -142,6 +146,10 @@ def get_panel_state( CapabilityProviderSummaryRead(**descriptor.model_dump()) for descriptor in capability_registry.list_descriptors() ] + pending_autoresearch_plans = [ + plan.model_dump(mode="json") + for plan in 
planner_service.list_pending(telegram_uid=access.telegram_uid, limit=20) + ] return PanelStateRead( telegram_uid=access.telegram_uid, sessions=sessions, @@ -149,6 +157,7 @@ def get_panel_state( audit_logs=audit_logs, capability_providers=capability_providers, pending_approvals=pending_approvals, + pending_autoresearch_plans=pending_autoresearch_plans, issued_at=utc_now(), ) @@ -287,6 +296,54 @@ def reject_panel_approval( return resolved +@router.post("/autoresearch/plans/{plan_id}/dispatch", response_model=AutoResearchPlanRead) +def dispatch_panel_autoresearch_plan( + plan_id: str, + payload: ApprovalNoteRequest, + request: Request, + background_tasks: BackgroundTasks, + access: PanelAccessContext = Depends(_require_panel_access), + planner_service: AutoResearchPlannerService = Depends(get_autoresearch_planner_service), + audit_service: PanelAuditService = Depends(get_panel_audit_service), + notifier: TelegramNotifierService = Depends(get_telegram_notifier_service), +) -> AutoResearchPlanRead: + plan = _authorized_plan( + plan_id=plan_id, + telegram_uid=access.telegram_uid, + planner_service=planner_service, + ) + try: + queued = planner_service.request_dispatch(plan.plan_id, requested_by=access.telegram_uid) + except ValueError as exc: + raise HTTPException(status_code=status.HTTP_409_CONFLICT, detail=str(exc)) from exc + + entry = audit_service.log_action( + telegram_uid=access.telegram_uid, + action="dispatch", + target_id=plan_id, + target_type="autoresearch_plan", + status="accepted", + reason=payload.note, + request_ip=_request_ip(request), + user_agent=request.headers.get("user-agent"), + metadata={ + "plan_title": plan.selected_candidate.title if plan.selected_candidate is not None else None, + "plan_source_path": plan.selected_candidate.source_path if plan.selected_candidate is not None else None, + "auth_method": access.auth_method, + "token_id": access.token_id, + }, + ) + background_tasks.add_task( + _execute_autoresearch_plan_and_notify, + 
planner_service=planner_service, + notifier=notifier, + plan_id=plan.plan_id, + telegram_uid=access.telegram_uid, + audit_entry_id=entry.audit_id, + ) + return queued + + @router.get("/agents/{agent_run_id}", response_model=ClaudeAgentRunRead) def get_panel_agent( agent_run_id: str, @@ -443,6 +500,54 @@ def _authorized_approval( return approval +def _authorized_plan( + *, + plan_id: str, + telegram_uid: str, + planner_service: AutoResearchPlannerService, +) -> AutoResearchPlanRead: + plan = planner_service.get(plan_id) + if plan is None: + raise HTTPException(status_code=status.HTTP_404_NOT_FOUND, detail="autoresearch plan not found") + if plan.telegram_uid not in {None, telegram_uid}: + raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="forbidden") + return plan + + +def _execute_autoresearch_plan_and_notify( + *, + planner_service: AutoResearchPlannerService, + notifier: TelegramNotifierService, + plan_id: str, + telegram_uid: str, + audit_entry_id: str, +) -> None: + result = planner_service.execute_dispatch(plan_id) + if not notifier.enabled: + return + candidate = result.selected_candidate + title = candidate.title if candidate is not None else plan_id + status_value = result.dispatch_status.value + lines = [ + f"[AutoResearch Dispatch] {title}", + f"- status: {status_value}", + f"- plan: {result.plan_id}", + f"- audit: {audit_entry_id}", + ] + if result.dispatch_run is not None: + lines.append(f"- lane: {result.dispatch_run.lane.value}") + lines.append(f"- remote_status: {result.dispatch_run.status.value}") + if result.dispatch_run.failure_class is not None: + lines.append(f"- failure_class: {result.dispatch_run.failure_class.value}") + if result.run_summary is not None: + lines.append(f"- final_status: {result.run_summary.final_status}") + if result.run_summary.promotion_patch_uri: + lines.append(f"- patch: {result.run_summary.promotion_patch_uri}") + if result.dispatch_error: + lines.append(f"- error: {result.dispatch_error}") + 
notifier.send_message(chat_id=telegram_uid, text="\n".join(lines)) + + def _request_ip(request: Request) -> str | None: if request.client is None: return None @@ -499,6 +604,10 @@ def _request_ip(request: Request) -> str | None:

待审批

+
+

AutoResearch Plans

+
+

审计日志


@@ -525,6 +634,7 @@ def _request_ip(request: Request) -> str | None:
 const runsEl = document.getElementById("runs");
 const capabilitiesEl = document.getElementById("capabilities");
 const approvalsEl = document.getElementById("approvals");
+const autoresearchPlansEl = document.getElementById("autoresearch-plans");
 const auditEl = document.getElementById("audit");
 
 if (tgWebApp) {
@@ -592,6 +702,23 @@ def _request_ip(request: Request) -> str | None:
   `;
 }
 
+function autoresearchPlanRow(item) {
+  const candidate = item.selected_candidate || {};
+  const estimatedChanges = (candidate.allowed_paths || []).length || 0;
+  return `
+    
+      ${item.plan_id}
+      ${candidate.source_path || "-"}
+      ${candidate.category || "-"}
+      ${estimatedChanges}
+      ${item.dispatch_status || "-"}
+      
+        
+      
+    
+  `;
+}
+
 function renderPromotionCard() {
   const isSkillPromotion = pendingPromotionAction.action === "managed-skill-promote"
     && pendingPromotionAction.installId
@@ -657,18 +784,21 @@ def _request_ip(request: Request) -> str | None:
   try {
     const state = await callApi("/api/v1/panel/state?limit_runs=60&limit_audit=40");
     const mode = token ? "JWT" : "Telegram Mini App";
-    summary.textContent = `UID: ${state.telegram_uid} | mode: ${mode} | sessions: ${state.sessions.length} | runs: ${state.agent_runs.length} | approvals: ${(state.pending_approvals || []).length} | providers: ${(state.capability_providers || []).length}`;
+    summary.textContent = `UID: ${state.telegram_uid} | mode: ${mode} | sessions: ${state.sessions.length} | runs: ${state.agent_runs.length} | approvals: ${(state.pending_approvals || []).length} | plans: ${(state.pending_autoresearch_plans || []).length} | providers: ${(state.capability_providers || []).length}`;
     const rows = state.agent_runs.map(runRow).join("");
     runsEl.innerHTML = `${rows}
AgentStatusTaskUpdatedActionHint
`; const capabilityRows = (state.capability_providers || []).map(capabilityRow).join(""); capabilitiesEl.innerHTML = `${capabilityRows || ""}
ProviderDomainStatusCapabilities
暂无
`; const approvalRows = (state.pending_approvals || []).map(approvalRow).join(""); approvalsEl.innerHTML = `${approvalRows || ""}
IDRiskTitleSourceExpiresDecision
暂无
`; + const planRows = (state.pending_autoresearch_plans || []).map(autoresearchPlanRow).join(""); + autoresearchPlansEl.innerHTML = `${planRows || ""}
PlanTargetHotspotEstimatedStatusAction
暂无
`; auditEl.textContent = JSON.stringify(state.audit_logs, null, 2); } catch (err) { summary.textContent = `加载失败: ${err.message}`; capabilitiesEl.innerHTML = "

加载失败

"; approvalsEl.innerHTML = "

加载失败

"; + autoresearchPlansEl.innerHTML = "

加载失败

"; } } @@ -705,6 +835,19 @@ def _request_ip(request: Request) -> str | None: } }); +autoresearchPlansEl.addEventListener("click", async (event) => { + const target = event.target; + if (!target || !target.dataset || !target.dataset.planId) return; + const planId = target.dataset.planId; + const note = prompt("输入 dispatch 备注", "approved via panel") || ""; + try { + await callApi(`/api/v1/panel/autoresearch/plans/${planId}/dispatch`, "POST", {note, metadata: {}}); + await refresh(); + } catch (err) { + alert(`派发失败: ${err.message}`); + } +}); + promotionApproveBtn.addEventListener("click", async () => { try { await approvePromotionAndExecute(); diff --git a/src/autoresearch/api/settings.py b/src/autoresearch/api/settings.py index b1f27bd5..107fd9b2 100644 --- a/src/autoresearch/api/settings.py +++ b/src/autoresearch/api/settings.py @@ -331,6 +331,57 @@ def _normalize_allowed_roles(cls, value: Any) -> set[str]: return roles or {"viewer", "editor", "admin", "owner"} +class UpstreamWatcherSettings(_BaseApiSettings): + upstream_url: str = Field( + default="https://github.com/openclaw/openclaw.git", + validation_alias="AUTORESEARCH_UPSTREAM_WATCH_URL", + ) + workspace_root: Path = Field( + default=Path("/Volumes/AI_LAB/ai_lab/workspace"), + validation_alias="AUTORESEARCH_UPSTREAM_WATCH_WORKSPACE_ROOT", + ) + max_commits: int = Field(default=5, validation_alias="AUTORESEARCH_UPSTREAM_WATCH_MAX_COMMITS") + + @field_validator("workspace_root", mode="before") + @classmethod + def _normalize_workspace_root(cls, value: Any) -> Path: + path = _parse_path(value) + return path or Path("/Volumes/AI_LAB/ai_lab/workspace") + + +class HousekeeperSettings(_BaseApiSettings): + timezone_name: str = Field(default="Asia/Shanghai", validation_alias="AUTORESEARCH_HOUSEKEEPER_TIMEZONE") + summary_chat_id: str | None = Field(default=None, validation_alias="AUTORESEARCH_HOUSEKEEPER_SUMMARY_CHAT_ID") + + +class MediaSettings(_BaseApiSettings): + media_root: Path = 
Field(default=Path("/home/lisa/media"), validation_alias="AUTORESEARCH_MEDIA_ROOT") + allowed_domains: set[str] = Field( + default_factory=lambda: { + "youtube.com", + "youtu.be", + "bilibili.com", + "vimeo.com", + "tiktok.com", + "douyin.com", + }, + validation_alias="AUTORESEARCH_MEDIA_ALLOWED_DOMAINS", + ) + yt_dlp_bin: str = Field(default="yt-dlp", validation_alias="AUTORESEARCH_MEDIA_YT_DLP_BIN") + ffmpeg_bin: str = Field(default="ffmpeg", validation_alias="AUTORESEARCH_MEDIA_FFMPEG_BIN") + + @field_validator("media_root", mode="before") + @classmethod + def _normalize_media_root(cls, value: Any) -> Path: + path = _parse_path(value) + return path or Path("/home/lisa/media") + + @field_validator("allowed_domains", mode="before") + @classmethod + def _normalize_allowed_domains(cls, value: Any) -> set[str]: + return _parse_csv_set(value) + + def load_runtime_settings() -> RuntimeSettings: return RuntimeSettings() @@ -353,6 +404,18 @@ def load_admin_settings() -> AdminSettings: return AdminSettings() +def load_upstream_watcher_settings() -> UpstreamWatcherSettings: + return UpstreamWatcherSettings() + + +def load_housekeeper_settings() -> HousekeeperSettings: + return HousekeeperSettings() + + +def load_media_settings() -> MediaSettings: + return MediaSettings() + + @lru_cache(maxsize=1) def get_runtime_settings() -> RuntimeSettings: return load_runtime_settings() @@ -378,10 +441,28 @@ def get_admin_settings() -> AdminSettings: return load_admin_settings() +@lru_cache(maxsize=1) +def get_upstream_watcher_settings() -> UpstreamWatcherSettings: + return load_upstream_watcher_settings() + + +@lru_cache(maxsize=1) +def get_housekeeper_settings() -> HousekeeperSettings: + return load_housekeeper_settings() + + +@lru_cache(maxsize=1) +def get_media_settings() -> MediaSettings: + return load_media_settings() + + def clear_settings_caches() -> None: get_runtime_settings.cache_clear() get_telegram_settings.cache_clear() get_panel_settings.cache_clear() 
get_feature_settings.cache_clear() get_admin_settings.cache_clear() + get_upstream_watcher_settings.cache_clear() + get_housekeeper_settings.cache_clear() + get_media_settings.cache_clear() _WARNED_DEPRECATED_ALIASES.clear() diff --git a/src/autoresearch/core/dispatch/__init__.py b/src/autoresearch/core/dispatch/__init__.py new file mode 100644 index 00000000..d3174a44 --- /dev/null +++ b/src/autoresearch/core/dispatch/__init__.py @@ -0,0 +1 @@ +"""Dispatch abstractions for local and future remote execution lanes.""" diff --git a/src/autoresearch/core/dispatch/failure_classifier.py b/src/autoresearch/core/dispatch/failure_classifier.py new file mode 100644 index 00000000..aedf184a --- /dev/null +++ b/src/autoresearch/core/dispatch/failure_classifier.py @@ -0,0 +1,132 @@ +from __future__ import annotations + +from dataclasses import dataclass +from typing import Literal + +from autoresearch.agent_protocol.models import RunSummary +from autoresearch.shared.remote_run_contract import FailureClass, RecoveryAction, RemoteRunStatus + + +_FAILURE_ACTIONS: dict[FailureClass, RecoveryAction] = { + FailureClass.PLANNER_STALLED: RecoveryAction.REQUIRE_HUMAN_REVIEW, + FailureClass.EXECUTOR_STALLED: RecoveryAction.RETRY, + FailureClass.TOOL_TIMEOUT: RecoveryAction.RETRY, + FailureClass.MODEL_FALLBACK: RecoveryAction.DOWNGRADE_TO_DRAFT, + FailureClass.ASSERTION_FAILED_AFTER_FALLBACK: RecoveryAction.REQUIRE_HUMAN_REVIEW, + FailureClass.ENV_MISSING: RecoveryAction.ABORT, + FailureClass.WORKSPACE_DIRTY: RecoveryAction.ABORT, + FailureClass.TRANSIENT_NETWORK: RecoveryAction.RETRY, + FailureClass.UNKNOWN: RecoveryAction.QUARANTINE, +} + +_ENV_MISSING_MARKERS = ( + "environmentcheckfailed:", + "launch_ai_lab.sh not found", + "no such file or directory", + "command not found", + "docker socket is stale", +) +_WORKSPACE_DIRTY_MARKERS = ( + "repository worktree is not clean", + "repo root has uncommitted changes", + "clean git checkout", + "clean base repository", +) 
+_TRANSIENT_NETWORK_MARKERS = ( + "connection reset", + "connection refused", + "temporary failure", + "network is unreachable", + "timed out while connecting", + "ssh:", +) + + +@dataclass(frozen=True, slots=True) +class FailureDisposition: + failure_class: FailureClass | None + recovery_action: RecoveryAction | None + + +def recovery_action_for_failure_class(failure_class: FailureClass | None) -> RecoveryAction | None: + if failure_class is None: + return None + return _FAILURE_ACTIONS[failure_class] + + +def classify_failure_class(failure_class: FailureClass | None) -> FailureDisposition: + return FailureDisposition( + failure_class=failure_class, + recovery_action=recovery_action_for_failure_class(failure_class), + ) + + +def infer_failure_class_from_error(error_text: str | None) -> FailureClass | None: + normalized = (error_text or "").strip().lower() + if not normalized: + return None + if any(marker in normalized for marker in _WORKSPACE_DIRTY_MARKERS): + return FailureClass.WORKSPACE_DIRTY + if any(marker in normalized for marker in _ENV_MISSING_MARKERS): + return FailureClass.ENV_MISSING + if any(marker in normalized for marker in _TRANSIENT_NETWORK_MARKERS): + return FailureClass.TRANSIENT_NETWORK + return None + + +def classify_remote_status( + status: RemoteRunStatus, + *, + stage: Literal["planner", "executor"] = "executor", + error_text: str | None = None, +) -> FailureDisposition: + if status is RemoteRunStatus.STALLED: + if stage == "planner": + return classify_failure_class(FailureClass.PLANNER_STALLED) + return classify_failure_class(FailureClass.EXECUTOR_STALLED) + if status is RemoteRunStatus.TIMED_OUT: + return classify_failure_class(FailureClass.TOOL_TIMEOUT) + if status is RemoteRunStatus.FAILED: + inferred = infer_failure_class_from_error(error_text) + return classify_failure_class(inferred or FailureClass.UNKNOWN) + return classify_failure_class(None) + + +def classify_run_summary(summary: RunSummary) -> FailureDisposition: + driver_result 
= summary.driver_result + error_text = str(driver_result.error or "").strip() + + if driver_result.status == "stalled_no_progress": + return classify_failure_class(FailureClass.EXECUTOR_STALLED) + if driver_result.status == "timed_out": + return classify_failure_class(FailureClass.TOOL_TIMEOUT) + + inferred = infer_failure_class_from_error(error_text) + if inferred is not None: + return classify_failure_class(inferred) + + if driver_result.agent_id == "mock" and summary.validation.passed: + return classify_failure_class(FailureClass.MODEL_FALLBACK) + if driver_result.agent_id == "mock" and not summary.validation.passed: + return classify_failure_class(FailureClass.ASSERTION_FAILED_AFTER_FALLBACK) + if driver_result.recommended_action == "fallback" and not summary.validation.passed: + return classify_failure_class(FailureClass.ASSERTION_FAILED_AFTER_FALLBACK) + + if summary.final_status == "failed": + return classify_failure_class(FailureClass.UNKNOWN) + return classify_failure_class(None) + + +def classify_remote_terminal( + *, + status: RemoteRunStatus, + stage: Literal["planner", "executor"] = "executor", + error_text: str | None = None, + run_summary: RunSummary | None = None, +) -> FailureDisposition: + if run_summary is not None: + disposition = classify_run_summary(run_summary) + if disposition.failure_class is not None or status is not RemoteRunStatus.SUCCEEDED: + return disposition + return disposition + return classify_remote_status(status, stage=stage, error_text=error_text) diff --git a/src/autoresearch/core/dispatch/fake_remote_adapter.py b/src/autoresearch/core/dispatch/fake_remote_adapter.py new file mode 100644 index 00000000..be4dce6d --- /dev/null +++ b/src/autoresearch/core/dispatch/fake_remote_adapter.py @@ -0,0 +1,433 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +import json +from pathlib import Path +from typing import Any, Callable + +from autoresearch.agent_protocol.models import RunSummary +from 
autoresearch.core.dispatch.failure_classifier import ( + FailureDisposition, + classify_remote_terminal, + classify_run_summary, +) +from autoresearch.core.dispatch.remote_adapter import RemoteDispatchAdapter +from autoresearch.shared.models import utc_now +from autoresearch.shared.remote_run_contract import ( + DispatchLane, + FailureClass, + RemoteHeartbeat, + RemoteRunRecord, + RemoteRunStatus, + RemoteRunSummary, + RemoteTaskSpec, +) + + +@dataclass(slots=True) +class _FakeRunState: + task_spec: RemoteTaskSpec + scenario: str + record: RemoteRunRecord + poll_count: int = 0 + heartbeats: list[RemoteHeartbeat] = field(default_factory=list) + events: list[dict[str, Any]] = field(default_factory=list) + summary: RemoteRunSummary | None = None + + +class FakeRemoteAdapter(RemoteDispatchAdapter): + TERMINAL_STATUSES = { + RemoteRunStatus.SUCCEEDED, + RemoteRunStatus.FAILED, + RemoteRunStatus.STALLED, + RemoteRunStatus.TIMED_OUT, + } + + def __init__( + self, + *, + repo_root: Path, + local_runner: Callable[[Any], RunSummary], + runtime_root: Path | None = None, + ) -> None: + self._repo_root = repo_root.resolve() + self._runtime_root = self._resolve_runtime_root(runtime_root) + self._local_runner = local_runner + self._states: dict[str, _FakeRunState] = {} + + def _resolve_runtime_root(self, runtime_root: Path | None) -> Path: + candidate = (runtime_root or (self._repo_root / ".masfactory_runtime" / "runs")).resolve() + try: + candidate.relative_to(self._repo_root) + except ValueError as exc: + raise ValueError( + "fake remote runtime_root must live under repo_root so artifact_paths remain repo-relative" + ) from exc + return candidate + + def dispatch(self, spec: RemoteTaskSpec) -> RemoteRunRecord: + now = utc_now() + scenario = self._scenario_for(spec) + record = RemoteRunRecord( + run_id=spec.run_id, + requested_lane=spec.requested_lane, + lane=spec.lane, + status=RemoteRunStatus.QUEUED, + summary=f"dispatch queued for {spec.lane.value} lane", + updated_at=now, + 
fallback_reason=self._fallback_reason_for(spec), + metadata={ + "runtime_mode": spec.runtime_mode, + "scenario": scenario, + "planner_plan_id": spec.planner_plan_id, + "planner_candidate_id": spec.planner_candidate_id, + }, + ) + state = _FakeRunState(task_spec=spec, scenario=scenario, record=record) + state.events.append( + { + "type": "queued", + "recorded_at": now.isoformat(), + "requested_lane": spec.requested_lane.value, + "lane": spec.lane.value, + "scenario": scenario, + } + ) + self._states[spec.run_id] = state + self._persist_state(state) + return record + + def poll(self, run_id: str) -> RemoteRunRecord: + state = self._require_state(run_id) + if state.record.status in self.TERMINAL_STATUSES: + self._persist_state(state) + return state.record + + state.poll_count += 1 + scenario = state.scenario + if scenario in {"local_execute", "fallback_to_local"}: + if state.poll_count == 1: + state.record = self._running_record(state, summary="local execution in progress") + state.heartbeats.append(self._heartbeat(state, sequence=1, summary="local execution heartbeat")) + else: + run_summary = self._local_runner(state.task_spec.job) + state.summary = self._summary_from_run_summary(state, run_summary) + state.record = self._record_from_summary(state.summary) + elif scenario == "success": + if state.poll_count == 1: + state.record = self._running_record(state, summary="remote execution in progress") + state.heartbeats.append(self._heartbeat(state, sequence=1, summary="remote execution heartbeat")) + else: + state.summary = self._terminal_summary( + state=state, + status=RemoteRunStatus.SUCCEEDED, + summary_text="remote execution completed successfully", + ) + state.record = self._record_from_summary(state.summary) + elif scenario == "stalled": + if state.poll_count == 1: + state.record = self._running_record(state, summary="remote execution started without progress heartbeat") + else: + state.summary = self._terminal_summary( + state=state, + 
status=RemoteRunStatus.STALLED, + summary_text="remote execution stalled without progress heartbeat", + ) + state.record = self._record_from_summary(state.summary) + elif scenario == "timed_out": + if state.poll_count == 1: + state.record = self._running_record(state, summary="remote execution running toward timeout") + state.heartbeats.append(self._heartbeat(state, sequence=1, summary="remote execution heartbeat")) + else: + state.summary = self._terminal_summary( + state=state, + status=RemoteRunStatus.TIMED_OUT, + summary_text="remote execution timed out", + ) + state.record = self._record_from_summary(state.summary) + elif scenario == "env_missing": + state.summary = self._terminal_summary( + state=state, + status=RemoteRunStatus.FAILED, + summary_text="remote environment is missing required runtime dependencies", + error_text="EnvironmentCheckFailed: missing remote runtime dependencies", + ) + state.record = self._record_from_summary(state.summary) + elif scenario == "transient_network": + state.summary = self._terminal_summary( + state=state, + status=RemoteRunStatus.FAILED, + summary_text="remote dispatch failed because the connection was interrupted", + error_text="ssh: connection reset by peer", + ) + state.record = self._record_from_summary(state.summary) + elif scenario == "result_fetch_failure": + if state.poll_count == 1: + state.record = self._running_record(state, summary="remote execution in progress") + state.heartbeats.append(self._heartbeat(state, sequence=1, summary="remote execution heartbeat")) + else: + state.record = RemoteRunRecord( + run_id=state.task_spec.run_id, + requested_lane=state.task_spec.requested_lane, + lane=state.task_spec.lane, + status=RemoteRunStatus.SUCCEEDED, + summary="remote execution completed but summary artifact was lost", + started_at=state.record.started_at or utc_now(), + updated_at=utc_now(), + finished_at=utc_now(), + fallback_reason=state.record.fallback_reason, + metadata=state.record.metadata, + ) + 
state.events.append( + { + "type": "summary_missing", + "recorded_at": utc_now().isoformat(), + } + ) + else: + state.summary = self._terminal_summary( + state=state, + status=RemoteRunStatus.FAILED, + summary_text=f"unsupported fake remote scenario: {scenario}", + error_text=f"unsupported fake remote scenario: {scenario}", + ) + state.record = self._record_from_summary(state.summary) + + self._persist_state(state) + return state.record + + def heartbeat(self, run_id: str) -> RemoteHeartbeat | None: + state = self._require_state(run_id) + return state.heartbeats[-1] if state.heartbeats else None + + def fetch_summary(self, run_id: str) -> RemoteRunSummary: + state = self._require_state(run_id) + if state.summary is None: + raise FileNotFoundError(f"remote summary is not available for run: {run_id}") + return state.summary + + def _scenario_for(self, spec: RemoteTaskSpec) -> str: + explicit = str(spec.metadata.get("remote_scenario") or "").strip() + if explicit: + return explicit + if spec.requested_lane is DispatchLane.REMOTE and spec.lane is DispatchLane.LOCAL: + return "fallback_to_local" + if spec.lane is DispatchLane.LOCAL: + return "local_execute" + return "success" + + @staticmethod + def _fallback_reason_for(spec: RemoteTaskSpec) -> str | None: + raw_reason = str(spec.metadata.get("fallback_reason") or "").strip() + if raw_reason: + return raw_reason + if spec.requested_lane is DispatchLane.REMOTE and spec.lane is DispatchLane.LOCAL: + return "remote lane unavailable; downgraded to local" + return None + + def _running_record(self, state: _FakeRunState, *, summary: str) -> RemoteRunRecord: + now = utc_now() + record = state.record.model_copy( + update={ + "status": RemoteRunStatus.RUNNING, + "summary": summary, + "started_at": state.record.started_at or now, + "updated_at": now, + } + ) + state.events.append( + { + "type": "running", + "recorded_at": now.isoformat(), + "summary": summary, + } + ) + return record + + def _heartbeat(self, state: _FakeRunState, 
*, sequence: int, summary: str) -> RemoteHeartbeat: + heartbeat = RemoteHeartbeat( + run_id=state.task_spec.run_id, + lane=state.task_spec.lane, + status=RemoteRunStatus.RUNNING, + sequence=sequence, + summary=summary, + artifact_paths=self._artifact_paths(state, include_summary=False), + ) + state.events.append( + { + "type": "heartbeat", + "recorded_at": heartbeat.recorded_at.isoformat(), + "sequence": sequence, + "summary": summary, + } + ) + return heartbeat + + def _summary_from_run_summary(self, state: _FakeRunState, run_summary: RunSummary) -> RemoteRunSummary: + disposition = classify_run_summary(run_summary) + final_status = ( + RemoteRunStatus.SUCCEEDED + if run_summary.final_status in {"ready_for_promotion", "promoted"} + else RemoteRunStatus.FAILED + ) + now = utc_now() + summary = RemoteRunSummary( + run_id=state.task_spec.run_id, + requested_lane=state.task_spec.requested_lane, + lane=state.task_spec.lane, + status=final_status, + failure_class=disposition.failure_class, + recovery_action=disposition.recovery_action, + artifact_paths=self._artifact_paths(state), + summary=f"local lane completed with final_status={run_summary.final_status}", + started_at=state.record.started_at or now, + updated_at=now, + finished_at=now, + fallback_reason=state.record.fallback_reason, + metadata=state.record.metadata, + run_summary=run_summary, + ) + state.events.append( + { + "type": "completed", + "recorded_at": now.isoformat(), + "status": summary.status.value, + "final_status": run_summary.final_status, + } + ) + return summary + + def _terminal_summary( + self, + *, + state: _FakeRunState, + status: RemoteRunStatus, + summary_text: str, + error_text: str | None = None, + ) -> RemoteRunSummary: + now = utc_now() + disposition = classify_remote_terminal(status=status, error_text=error_text) + summary = RemoteRunSummary( + run_id=state.task_spec.run_id, + requested_lane=state.task_spec.requested_lane, + lane=state.task_spec.lane, + status=status, + 
failure_class=disposition.failure_class, + recovery_action=disposition.recovery_action, + artifact_paths=self._artifact_paths(state), + summary=summary_text, + started_at=state.record.started_at or now, + updated_at=now, + finished_at=now, + fallback_reason=state.record.fallback_reason, + metadata={ + **state.record.metadata, + **({"error": error_text} if error_text else {}), + }, + ) + state.events.append( + { + "type": "completed", + "recorded_at": now.isoformat(), + "status": status.value, + "summary": summary_text, + "error": error_text, + } + ) + return summary + + @staticmethod + def _record_from_summary(summary: RemoteRunSummary) -> RemoteRunRecord: + return RemoteRunRecord.model_validate(summary.model_dump(mode="json", exclude={"run_summary"})) + + def _artifact_paths(self, state: _FakeRunState, *, include_summary: bool = True) -> dict[str, str]: + run_dir = self._runtime_root / state.task_spec.run_id + control_dir = run_dir / "remote_control" + paths = { + "task_spec": self._relpath(control_dir / "task_spec.json"), + "record": self._relpath(control_dir / "record.json"), + "events": self._relpath(control_dir / "events.ndjson"), + } + if state.heartbeats: + paths["heartbeat"] = self._relpath(control_dir / "heartbeat.json") + if include_summary and state.summary is not None: + paths["summary"] = self._relpath(control_dir / "summary.json") + if state.summary is not None and state.summary.run_summary is not None: + legacy_summary = run_dir / "summary.json" + if legacy_summary.exists(): + paths["legacy_run_summary"] = self._relpath(legacy_summary) + patch_uri = str(state.summary.run_summary.promotion_patch_uri or "").strip() + if patch_uri: + patch_path = Path(patch_uri) + if not patch_path.is_absolute(): + paths["promotion_patch"] = patch_path.as_posix() + return paths + + def _persist_state(self, state: _FakeRunState) -> None: + run_dir = self._runtime_root / state.task_spec.run_id + control_dir = run_dir / "remote_control" + control_dir.mkdir(parents=True, 
exist_ok=True) + + task_spec_path = control_dir / "task_spec.json" + task_spec_path.write_text( + json.dumps(state.task_spec.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + record = RemoteRunRecord.model_validate( + { + **state.record.model_dump(mode="json"), + "artifact_paths": self._artifact_paths(state), + } + ) + state.record = record + (control_dir / "record.json").write_text( + json.dumps(record.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + (control_dir / "events.ndjson").write_text( + "".join(json.dumps(event, ensure_ascii=False) + "\n" for event in state.events), + encoding="utf-8", + ) + + if state.heartbeats: + heartbeat = RemoteHeartbeat.model_validate( + { + **state.heartbeats[-1].model_dump(mode="json"), + "artifact_paths": self._artifact_paths(state, include_summary=False), + } + ) + state.heartbeats[-1] = heartbeat + (control_dir / "heartbeat.json").write_text( + json.dumps(heartbeat.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + if state.summary is not None: + summary = RemoteRunSummary.model_validate( + { + **state.summary.model_dump(mode="json"), + "artifact_paths": self._artifact_paths(state), + } + ) + state.summary = summary + if state.scenario != "result_fetch_failure": + (control_dir / "summary.json").write_text( + json.dumps(summary.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) + + def _require_state(self, run_id: str) -> _FakeRunState: + normalized = run_id.strip() + if normalized not in self._states: + raise KeyError(f"unknown fake remote run: {normalized}") + return self._states[normalized] + + def _relpath(self, path: Path) -> str: + try: + return path.resolve().relative_to(self._repo_root).as_posix() + except ValueError as exc: + raise ValueError( + "fake remote artifact path escaped repo_root; refusing to emit an absolute artifact_paths value" + ) from exc diff --git 
a/src/autoresearch/core/dispatch/remote_adapter.py b/src/autoresearch/core/dispatch/remote_adapter.py new file mode 100644 index 00000000..9f0cab62 --- /dev/null +++ b/src/autoresearch/core/dispatch/remote_adapter.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod + +from autoresearch.shared.remote_run_contract import ( + RemoteHeartbeat, + RemoteRunRecord, + RemoteRunSummary, + RemoteTaskSpec, +) + + +class RemoteDispatchAdapter(ABC): + @abstractmethod + def dispatch(self, spec: RemoteTaskSpec) -> RemoteRunRecord: + raise NotImplementedError + + @abstractmethod + def poll(self, run_id: str) -> RemoteRunRecord: + raise NotImplementedError + + @abstractmethod + def heartbeat(self, run_id: str) -> RemoteHeartbeat | None: + raise NotImplementedError + + @abstractmethod + def fetch_summary(self, run_id: str) -> RemoteRunSummary: + raise NotImplementedError diff --git a/src/autoresearch/core/runtime/__init__.py b/src/autoresearch/core/runtime/__init__.py new file mode 100644 index 00000000..20bc0177 --- /dev/null +++ b/src/autoresearch/core/runtime/__init__.py @@ -0,0 +1 @@ +"""Runtime mode selection and config helpers.""" diff --git a/src/autoresearch/core/runtime/select_mode.py b/src/autoresearch/core/runtime/select_mode.py new file mode 100644 index 00000000..3c47d402 --- /dev/null +++ b/src/autoresearch/core/runtime/select_mode.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import os +from pathlib import Path +from typing import Any + +import yaml +from pydantic import Field, field_validator + +from autoresearch.shared.models import StrictModel +from autoresearch.shared.remote_run_contract import DispatchLane + + +class RuntimeModePolicy(StrictModel): + name: str = Field(..., min_length=1) + preferred_lane: DispatchLane = DispatchLane.LOCAL + max_workers: int = Field(default=1, ge=1) + max_concurrency: int = Field(default=1, ge=1) + allow_exploration: bool = False + allow_patch: bool = True + allow_draft_pr: 
bool = False + require_high_risk_approval: bool = True + step_budget: int = Field(default=8, ge=1) + token_budget: int = Field(default=20_000, ge=1) + timeout_sec: int = Field(default=900, ge=1) + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("name") + @classmethod + def _normalize_name(cls, value: str) -> str: + normalized = value.strip().lower() + if not normalized: + raise ValueError("mode name is required") + return normalized + + +class SelectedRuntimeMode(StrictModel): + name: str = Field(..., min_length=1) + requested_lane: DispatchLane + lane: DispatchLane + fallback_reason: str | None = None + policy: RuntimeModePolicy + + @field_validator("name") + @classmethod + def _normalize_selected_name(cls, value: str) -> str: + normalized = value.strip().lower() + if not normalized: + raise ValueError("selected mode name is required") + return normalized + + @field_validator("fallback_reason") + @classmethod + def _normalize_reason(cls, value: str | None) -> str | None: + if value is None: + return None + normalized = value.strip() + return normalized or None + + +def _repo_root(repo_root: Path | None) -> Path: + if repo_root is not None: + return repo_root.resolve() + return Path(__file__).resolve().parents[4] + + +def _config_path(repo_root: Path, mode_name: str) -> Path: + return repo_root / "configs" / "runtime" / f"{mode_name}.yaml" + + +def load_mode_policy(repo_root: Path | None = None, mode_name: str = "day") -> RuntimeModePolicy: + root = _repo_root(repo_root) + normalized_mode = mode_name.strip().lower() + config_path = _config_path(root, normalized_mode) + if not config_path.exists(): + config_path = _config_path(_repo_root(None), normalized_mode) + payload = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {} + if not isinstance(payload, dict): + raise ValueError(f"runtime mode config must be a mapping: {config_path}") + return RuntimeModePolicy.model_validate({"name": normalized_mode, **payload}) + + +def 
select_mode( + repo_root: Path | None = None, + *, + requested_mode: str | None = None, + remote_available: bool | None = None, +) -> SelectedRuntimeMode: + mode_name = (requested_mode or os.environ.get("AUTORESEARCH_RUNTIME_MODE") or "day").strip().lower() + policy = load_mode_policy(repo_root, mode_name) + lane = policy.preferred_lane + fallback_reason = None + if policy.preferred_lane is DispatchLane.REMOTE and remote_available is False: + lane = DispatchLane.LOCAL + fallback_reason = "remote lane unavailable; downgraded to local" + return SelectedRuntimeMode( + name=policy.name, + requested_lane=policy.preferred_lane, + lane=lane, + fallback_reason=fallback_reason, + policy=policy, + ) diff --git a/src/autoresearch/core/services/agent_audit_trail.py b/src/autoresearch/core/services/agent_audit_trail.py new file mode 100644 index 00000000..bf814730 --- /dev/null +++ b/src/autoresearch/core/services/agent_audit_trail.py @@ -0,0 +1,679 @@ +from __future__ import annotations + +from dataclasses import dataclass, field +from datetime import datetime +import json +from pathlib import Path +from typing import Any + +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from autoresearch.core.services.claude_agents import ClaudeAgentService +from autoresearch.shared.models import ( + AdminAgentAuditRole, + AdminAgentAuditTrailDetailRead, + AdminAgentAuditTrailEntryRead, + AdminAgentAuditTrailSnapshotRead, + AdminAgentAuditTrailStatsRead, + JobStatus, + utc_now, +) + +_MAX_PATCH_CHARS = 120_000 +_SUCCESS_STATUSES = {"completed", "ready_for_promotion", "promoted", "succeeded"} +_FAILED_STATUSES = { + "failed", + "blocked", + "interrupted", + "timed_out", + "stalled_no_progress", + "policy_blocked", + "contract_error", + "rejected", +} +_PENDING_STATUSES = {"queued", "created", "pending", "dispatching"} +_RUNNING_STATUSES = {"running"} +_REVIEW_STATUSES = {"human_review", 
"needs_human_review"} + + +@dataclass(slots=True) +class _AuditEntryContext: + entry: AdminAgentAuditTrailEntryRead + input_prompt: str | None = None + job_spec: dict[str, Any] = field(default_factory=dict) + worker_spec: dict[str, Any] = field(default_factory=dict) + controlled_request: dict[str, Any] = field(default_factory=dict) + patch_text: str = "" + patch_truncated: bool = False + error_reason: str | None = None + traceback: str | None = None + raw_record: dict[str, Any] = field(default_factory=dict) + + +class AgentAuditTrailService: + """Aggregate worker execution footprints into an admin-friendly audit timeline.""" + + def __init__( + self, + *, + repo_root: Path, + planner_service: AutoResearchPlannerService, + manager_service: ManagerAgentService, + agent_service: ClaudeAgentService, + ) -> None: + self._repo_root = repo_root.resolve() + self._planner_service = planner_service + self._manager_service = manager_service + self._agent_service = agent_service + + def snapshot( + self, + *, + limit: int = 20, + status_filter: str | None = None, + agent_role: str | None = None, + ) -> AdminAgentAuditTrailSnapshotRead: + contexts = self._filter_contexts( + self._collect_entry_contexts(), + status_filter=status_filter, + agent_role=agent_role, + ) + items = [context.entry for context in contexts[: max(1, limit)]] + return AdminAgentAuditTrailSnapshotRead( + items=items, + stats=self._build_stats(items), + issued_at=utc_now(), + ) + + def detail(self, entry_id: str) -> AdminAgentAuditTrailDetailRead: + normalized_entry_id = entry_id.strip() + if not normalized_entry_id: + raise KeyError("audit trail entry id is required") + for context in self._collect_entry_contexts(): + if context.entry.entry_id != normalized_entry_id: + continue + return AdminAgentAuditTrailDetailRead( + entry=context.entry, + input_prompt=context.input_prompt, + job_spec=dict(context.job_spec), + worker_spec=dict(context.worker_spec), + controlled_request=dict(context.controlled_request), + 
patch_text=context.patch_text, + patch_truncated=context.patch_truncated, + error_reason=context.error_reason, + traceback=context.traceback, + raw_record=dict(context.raw_record), + ) + raise KeyError(f"audit trail entry not found: {entry_id}") + + def _collect_entry_contexts(self) -> list[_AuditEntryContext]: + contexts_by_run: dict[str, _AuditEntryContext] = {} + for context in self._collect_manager_contexts(): + contexts_by_run[context.entry.run_id] = context + for context in self._collect_planner_contexts(): + contexts_by_run.setdefault(context.entry.run_id, context) + for context in self._collect_claude_contexts(): + contexts_by_run.setdefault(context.entry.run_id, context) + for context in self._collect_runtime_contexts(): + existing = contexts_by_run.get(context.entry.run_id) + if existing is None: + contexts_by_run[context.entry.run_id] = context + continue + contexts_by_run[context.entry.run_id] = self._merge_contexts( + existing=existing, + incoming=context, + ) + return sorted( + contexts_by_run.values(), + key=lambda item: item.entry.recorded_at, + reverse=True, + ) + + def _collect_manager_contexts(self) -> list[_AuditEntryContext]: + contexts: list[_AuditEntryContext] = [] + for dispatch in self._manager_service.list_dispatches(): + if dispatch.execution_plan is None: + continue + for task in dispatch.execution_plan.tasks: + run_summary = task.run_summary + patch_uri = run_summary.promotion_patch_uri if run_summary is not None else None + patch_text, patch_truncated = self._load_patch_text(patch_uri) + changed_paths = self._extract_changed_paths(run_summary) + error_reason = ( + self._normalize_text(task.error) + or self._extract_run_summary_error(run_summary) + or self._normalize_text(dispatch.error) + ) + contexts.append( + _AuditEntryContext( + entry=AdminAgentAuditTrailEntryRead( + entry_id=f"manager:{task.task_id}", + source="manager_task", + agent_role=AdminAgentAuditRole.MANAGER, + run_id=self._extract_task_run_id(task, fallback=task.task_id), 
+ agent_id=run_summary.driver_result.agent_id if run_summary is not None else "openhands", + title=task.title, + status=task.status.value, + final_status=run_summary.final_status if run_summary is not None else None, + recorded_at=dispatch.updated_at, + duration_ms=self._extract_duration_ms(run_summary), + first_progress_ms=self._extract_metric(run_summary, "first_progress_ms"), + first_scoped_write_ms=self._extract_metric(run_summary, "first_scoped_write_ms"), + first_state_heartbeat_ms=self._extract_metric(run_summary, "first_state_heartbeat_ms"), + files_changed=len(changed_paths), + changed_paths=changed_paths, + scope_paths=list(task.worker_spec.allowed_paths) if task.worker_spec is not None else [], + patch_uri=patch_uri, + summary=task.summary, + metadata={ + "dispatch_id": dispatch.dispatch_id, + "intent": dispatch.selected_intent.label if dispatch.selected_intent is not None else None, + "stage": task.stage.value, + "depends_on": list(task.depends_on), + }, + ), + input_prompt=dispatch.prompt, + job_spec=self._model_payload(task.agent_job), + worker_spec=self._model_payload(task.worker_spec), + controlled_request=self._model_payload(task.controlled_request), + patch_text=patch_text, + patch_truncated=patch_truncated, + error_reason=error_reason, + traceback=self._multiline_or_none(task.error), + raw_record={"manager_dispatch": dispatch.model_dump(mode="json")}, + ) + ) + return contexts + + def _collect_planner_contexts(self) -> list[_AuditEntryContext]: + contexts: list[_AuditEntryContext] = [] + for plan in self._planner_service.list(): + run_summary = plan.run_summary + dispatch_run = plan.dispatch_run + patch_uri = run_summary.promotion_patch_uri if run_summary is not None else None + patch_text, patch_truncated = self._load_patch_text(patch_uri) + changed_paths = self._extract_changed_paths(run_summary) + title = plan.selected_candidate.title if plan.selected_candidate is not None else plan.goal + contexts.append( + _AuditEntryContext( + 
entry=AdminAgentAuditTrailEntryRead( + entry_id=f"plan:{plan.plan_id}", + source="autoresearch_plan", + agent_role=AdminAgentAuditRole.PLANNER, + run_id=run_summary.run_id if run_summary is not None else (plan.agent_job.run_id if plan.agent_job else plan.plan_id), + agent_id=run_summary.driver_result.agent_id if run_summary is not None else "openhands", + title=title, + status=plan.dispatch_status.value, + final_status=run_summary.final_status if run_summary is not None else None, + recorded_at=plan.dispatch_completed_at or plan.updated_at, + duration_ms=self._extract_duration_ms(run_summary), + first_progress_ms=self._extract_metric(run_summary, "first_progress_ms"), + first_scoped_write_ms=self._extract_metric(run_summary, "first_scoped_write_ms"), + first_state_heartbeat_ms=self._extract_metric(run_summary, "first_state_heartbeat_ms"), + files_changed=len(changed_paths), + changed_paths=changed_paths, + scope_paths=list(plan.worker_spec.allowed_paths) if plan.worker_spec is not None else [], + patch_uri=patch_uri, + summary=plan.summary, + metadata={ + "plan_id": plan.plan_id, + "candidate_category": ( + plan.selected_candidate.category if plan.selected_candidate is not None else None + ), + "source_path": ( + plan.selected_candidate.source_path if plan.selected_candidate is not None else None + ), + "dispatch_requested_lane": ( + dispatch_run.requested_lane.value if dispatch_run is not None else None + ), + "dispatch_lane": dispatch_run.lane.value if dispatch_run is not None else None, + "dispatch_remote_status": ( + dispatch_run.status.value if dispatch_run is not None else None + ), + "dispatch_failure_class": ( + dispatch_run.failure_class.value + if dispatch_run is not None and dispatch_run.failure_class is not None + else None + ), + "dispatch_recovery_action": ( + dispatch_run.recovery_action.value + if dispatch_run is not None and dispatch_run.recovery_action is not None + else None + ), + "dispatch_fallback_reason": ( + dispatch_run.fallback_reason if 
dispatch_run is not None else None + ), + }, + ), + input_prompt=plan.goal, + job_spec=self._model_payload(plan.agent_job), + worker_spec=self._model_payload(plan.worker_spec), + controlled_request=self._model_payload(plan.controlled_request), + patch_text=patch_text, + patch_truncated=patch_truncated, + error_reason=( + self._normalize_text(plan.dispatch_error) + or self._normalize_text(plan.error) + or self._extract_run_summary_error(run_summary) + ), + traceback=self._multiline_or_none(plan.error), + raw_record={"autoresearch_plan": plan.model_dump(mode="json")}, + ) + ) + return contexts + + def _collect_claude_contexts(self) -> list[_AuditEntryContext]: + contexts: list[_AuditEntryContext] = [] + for run in self._agent_service.list(): + contexts.append( + _AuditEntryContext( + entry=AdminAgentAuditTrailEntryRead( + entry_id=f"claude:{run.agent_run_id}", + source="claude_agent", + agent_role=AdminAgentAuditRole.WORKER, + run_id=run.agent_run_id, + agent_id=run.agent_name or "claude_cli", + title=run.task_name, + status=run.status.value, + final_status=( + run.status.value + if run.status in {JobStatus.COMPLETED, JobStatus.FAILED, JobStatus.INTERRUPTED} + else None + ), + recorded_at=run.updated_at, + duration_ms=int(run.duration_seconds * 1000) if run.duration_seconds is not None else None, + first_progress_ms=None, + first_scoped_write_ms=None, + first_state_heartbeat_ms=None, + files_changed=0, + changed_paths=[], + scope_paths=[], + patch_uri=self._normalize_text(run.metadata.get("patch_uri")), + summary=run.stderr_preview or run.stdout_preview or run.prompt[:160], + metadata={"session_id": run.session_id, "parent_agent_id": run.parent_agent_id}, + ), + input_prompt=run.prompt, + patch_text=self._load_patch_text(self._normalize_text(run.metadata.get("patch_uri")))[0] + if self._normalize_text(run.metadata.get("patch_uri")) + else "", + patch_truncated=self._load_patch_text(self._normalize_text(run.metadata.get("patch_uri")))[1] + if 
self._normalize_text(run.metadata.get("patch_uri")) + else False, + error_reason=self._normalize_text(run.error), + traceback=self._normalize_text(run.stderr_preview), + raw_record={"claude_agent": run.model_dump(mode="json")}, + ) + ) + return contexts + + def _collect_runtime_contexts(self) -> list[_AuditEntryContext]: + contexts: list[_AuditEntryContext] = [] + for path in self._runtime_summary_files(): + payload = self._load_json(path) + if not isinstance(payload, dict): + continue + run_id = str(payload.get("run_id", "")).strip() + if not run_id: + continue + changed_paths = self._runtime_changed_paths(payload) + patch_uri = self._runtime_patch_uri(payload) + patch_text, patch_truncated = self._load_patch_text(patch_uri) + contexts.append( + _AuditEntryContext( + entry=AdminAgentAuditTrailEntryRead( + entry_id=f"runtime:{run_id}", + source="runtime_artifact", + agent_role=AdminAgentAuditRole.WORKER, + run_id=run_id, + agent_id=self._runtime_agent_id(payload), + title=str(payload.get("task") or payload.get("run_id") or path.parent.name), + status=self._runtime_status(payload), + final_status=str(payload.get("final_status") or payload.get("status") or "").strip() or None, + recorded_at=datetime.fromtimestamp(path.stat().st_mtime, tz=utc_now().tzinfo), + duration_ms=self._runtime_duration_ms(payload), + first_progress_ms=self._runtime_metric_ms(payload, "first_progress_ms"), + first_scoped_write_ms=self._runtime_metric_ms(payload, "first_scoped_write_ms"), + first_state_heartbeat_ms=self._runtime_metric_ms(payload, "first_state_heartbeat_ms"), + files_changed=self._runtime_files_changed(payload, changed_paths), + changed_paths=changed_paths, + scope_paths=[], + patch_uri=patch_uri, + isolated_workspace=str(payload.get("isolated_workspace", "")).strip() or None, + summary=self._runtime_summary_text(payload), + metadata={"artifact_path": str(path)}, + ), + input_prompt=self._normalize_text(payload.get("task")), + 
job_spec=self._dict_payload(payload.get("job_spec")), + worker_spec=self._dict_payload(payload.get("worker_spec")), + controlled_request=self._dict_payload(payload.get("controlled_request")), + patch_text=patch_text, + patch_truncated=patch_truncated, + error_reason=self._runtime_error_reason(payload), + traceback=self._runtime_traceback(payload), + raw_record={"runtime_artifact": payload}, + ) + ) + return contexts + + def _filter_contexts( + self, + contexts: list[_AuditEntryContext], + *, + status_filter: str | None, + agent_role: str | None, + ) -> list[_AuditEntryContext]: + normalized_status = self._normalize_filter(status_filter) + normalized_role = self._normalize_filter(agent_role) + filtered: list[_AuditEntryContext] = [] + for context in contexts: + if not self._matches_status_filter(context.entry, normalized_status): + continue + if not self._matches_role_filter(context.entry, normalized_role): + continue + filtered.append(context) + return filtered + + def _merge_contexts( + self, + *, + existing: _AuditEntryContext, + incoming: _AuditEntryContext, + ) -> _AuditEntryContext: + return _AuditEntryContext( + entry=existing.entry.model_copy( + update={ + "duration_ms": self._prefer_metric( + existing.entry.duration_ms, + incoming.entry.duration_ms, + ), + "first_progress_ms": self._prefer_metric( + existing.entry.first_progress_ms, + incoming.entry.first_progress_ms, + ), + "first_scoped_write_ms": self._prefer_metric( + existing.entry.first_scoped_write_ms, + incoming.entry.first_scoped_write_ms, + ), + "first_state_heartbeat_ms": self._prefer_metric( + existing.entry.first_state_heartbeat_ms, + incoming.entry.first_state_heartbeat_ms, + ), + "files_changed": max(existing.entry.files_changed, incoming.entry.files_changed), + "changed_paths": existing.entry.changed_paths or incoming.entry.changed_paths, + "patch_uri": existing.entry.patch_uri or incoming.entry.patch_uri, + "isolated_workspace": existing.entry.isolated_workspace or 
incoming.entry.isolated_workspace, + "summary": existing.entry.summary or incoming.entry.summary, + "metadata": {**incoming.entry.metadata, **existing.entry.metadata}, + } + ), + input_prompt=existing.input_prompt or incoming.input_prompt, + job_spec=existing.job_spec or incoming.job_spec, + worker_spec=existing.worker_spec or incoming.worker_spec, + controlled_request=existing.controlled_request or incoming.controlled_request, + patch_text=existing.patch_text or incoming.patch_text, + patch_truncated=existing.patch_truncated or incoming.patch_truncated, + error_reason=existing.error_reason or incoming.error_reason, + traceback=existing.traceback or incoming.traceback, + raw_record={**incoming.raw_record, **existing.raw_record}, + ) + + def _build_stats(self, items: list[AdminAgentAuditTrailEntryRead]) -> AdminAgentAuditTrailStatsRead: + stats = AdminAgentAuditTrailStatsRead(total=len(items)) + for item in items: + normalized = self._status_bucket(item) + if normalized == "success": + stats.succeeded += 1 + elif normalized == "failed": + stats.failed += 1 + elif normalized == "running": + stats.running += 1 + elif normalized == "pending": + stats.queued += 1 + elif normalized == "review": + stats.review_required += 1 + return stats + + def _runtime_summary_files(self) -> list[Path]: + files: list[Path] = [] + for pattern in ( + ".masfactory_runtime/runs/*/summary.json", + ".masfactory_runtime/smokes/*/artifacts/chain_summary.json", + "logs/audit/openhands/jobs/*/chain_summary.json", + ): + files.extend(self._repo_root.glob(pattern)) + files.sort(key=lambda item: item.stat().st_mtime, reverse=True) + return files[:80] + + def _load_patch_text(self, patch_uri: str | None) -> tuple[str, bool]: + patch_path = self._resolve_repo_path(patch_uri) + if patch_path is None or not patch_path.exists() or not patch_path.is_file(): + return "", False + try: + patch_text = patch_path.read_text(encoding="utf-8", errors="replace") + except OSError: + return "", False + if 
len(patch_text) <= _MAX_PATCH_CHARS: + return patch_text, False + truncated = patch_text[:_MAX_PATCH_CHARS].rstrip() + return f"{truncated}\n\n... [patch truncated]", True + + def _resolve_repo_path(self, candidate: str | None) -> Path | None: + normalized = self._normalize_text(candidate) + if not normalized: + return None + path = Path(normalized) + if path.is_absolute(): + return path + return (self._repo_root / path).resolve() + + @staticmethod + def _normalize_filter(value: str | None) -> str | None: + normalized = str(value or "").strip().lower() + return None if normalized in {"", "all"} else normalized + + @staticmethod + def _prefer_metric(primary: int | None, secondary: int | None) -> int | None: + return primary if primary is not None else secondary + + @staticmethod + def _matches_role_filter(entry: AdminAgentAuditTrailEntryRead, agent_role: str | None) -> bool: + if agent_role is None: + return True + return entry.agent_role.value == agent_role + + def _matches_status_filter(self, entry: AdminAgentAuditTrailEntryRead, status_filter: str | None) -> bool: + if status_filter is None: + return True + return self._status_bucket(entry) == status_filter + + def _status_bucket(self, entry: AdminAgentAuditTrailEntryRead) -> str: + normalized = (entry.final_status or entry.status).strip().lower() + if normalized in _SUCCESS_STATUSES: + return "success" + if normalized in _FAILED_STATUSES: + return "failed" + if normalized in _RUNNING_STATUSES: + return "running" + if normalized in _PENDING_STATUSES: + return "pending" + if normalized in _REVIEW_STATUSES: + return "review" + return normalized or "pending" + + @staticmethod + def _extract_task_run_id(task: Any, *, fallback: str) -> str: + if getattr(task, "run_summary", None) is not None: + return task.run_summary.run_id + if getattr(task, "agent_job", None) is not None: + return task.agent_job.run_id + return fallback + + @staticmethod + def _extract_changed_paths(run_summary: Any) -> list[str]: + if run_summary 
is None: + return [] + return list(run_summary.driver_result.changed_paths) + + @staticmethod + def _extract_duration_ms(run_summary: Any) -> int | None: + if run_summary is None: + return None + return run_summary.driver_result.metrics.duration_ms + + @staticmethod + def _extract_metric(run_summary: Any, metric_name: str) -> int | None: + if run_summary is None: + return None + value = getattr(run_summary.driver_result.metrics, metric_name, None) + return int(value) if isinstance(value, (int, float)) else None + + @staticmethod + def _extract_run_summary_error(run_summary: Any) -> str | None: + if run_summary is None: + return None + return str(run_summary.driver_result.error or "").strip() or None + + @staticmethod + def _model_payload(model: Any) -> dict[str, Any]: + if model is None: + return {} + if hasattr(model, "model_dump"): + return model.model_dump(mode="json") + if isinstance(model, dict): + return dict(model) + return {"value": model} + + @staticmethod + def _dict_payload(value: Any) -> dict[str, Any]: + if isinstance(value, dict): + return dict(value) + return {} + + @staticmethod + def _load_json(path: Path) -> Any: + try: + return json.loads(path.read_text(encoding="utf-8")) + except (OSError, json.JSONDecodeError, UnicodeDecodeError): + return None + + @staticmethod + def _normalize_text(value: Any) -> str | None: + normalized = str(value or "").strip() + return normalized or None + + @staticmethod + def _multiline_or_none(value: Any) -> str | None: + normalized = str(value or "").strip() + if "\n" not in normalized: + return None + return normalized + + @staticmethod + def _runtime_status(payload: dict[str, Any]) -> str: + if "promotion_ready" in payload: + return "ready_for_promotion" if bool(payload.get("promotion_ready")) else "failed" + return str(payload.get("final_status") or payload.get("status") or "unknown") + + @staticmethod + def _runtime_agent_id(payload: dict[str, Any]) -> str | None: + driver_result = payload.get("driver_result") + 
if isinstance(driver_result, dict): + agent_id = str(driver_result.get("agent_id", "")).strip() + if agent_id: + return agent_id + return "openhands" + + @staticmethod + def _runtime_duration_ms(payload: dict[str, Any]) -> int | None: + return AgentAuditTrailService._runtime_metric_ms(payload, "duration_ms") + + @staticmethod + def _runtime_metric_ms(payload: dict[str, Any], metric_name: str) -> int | None: + driver_result = payload.get("driver_result") + if not isinstance(driver_result, dict): + return None + metrics = driver_result.get("metrics") + if not isinstance(metrics, dict): + return None + value = metrics.get(metric_name) + return int(value) if isinstance(value, (int, float)) else None + + @staticmethod + def _runtime_changed_paths(payload: dict[str, Any]) -> list[str]: + promotion = payload.get("promotion") + if isinstance(promotion, dict): + changed = promotion.get("changed_files") + if isinstance(changed, list): + return [str(item) for item in changed] + driver_result = payload.get("driver_result") + if isinstance(driver_result, dict): + changed = driver_result.get("changed_paths") + if isinstance(changed, list): + return [str(item) for item in changed] + return [] + + @staticmethod + def _runtime_files_changed(payload: dict[str, Any], changed_paths: list[str]) -> int: + promotion = payload.get("promotion") + if isinstance(promotion, dict): + diff_stats = promotion.get("diff_stats") + if isinstance(diff_stats, dict): + value = diff_stats.get("files_changed") + if isinstance(value, (int, float)): + return int(value) + return len(changed_paths) + + @staticmethod + def _runtime_patch_uri(payload: dict[str, Any]) -> str | None: + for candidate in ( + payload.get("promotion_patch_uri"), + (payload.get("artifacts") or {}).get("promotion_patch") + if isinstance(payload.get("artifacts"), dict) + else None, + (payload.get("promotion") or {}).get("patch_uri") + if isinstance(payload.get("promotion"), dict) + else None, + ): + normalized = str(candidate or 
@staticmethod
def _runtime_summary_text(payload: dict[str, Any]) -> str:
    """Pick a short human-readable summary for a runtime payload.

    Prefers ``payload["driver_result"]["summary"]`` when it is non-blank;
    otherwise falls back to ``payload["task"]``, then ``payload["status"]``,
    then the empty string.

    Args:
        payload: Runtime payload dictionary.

    Returns:
        The stripped summary text (possibly empty).
    """
    driver_result = payload.get("driver_result")
    if isinstance(driver_result, dict):
        raw_summary = driver_result.get("summary")
        # An explicit None must not be stringified to the literal text "None",
        # which would otherwise hide the task/status fallback below.
        summary = "" if raw_summary is None else str(raw_summary).strip()
        if summary:
            return summary
    return str(payload.get("task") or payload.get("status") or "").strip()
@dataclass(slots=True)
class _MarkerOccurrence:
    """A single backlog marker match recorded by ``_find_markers``."""

    # Marker keyword that matched (one of the keys of _MARKER_WEIGHTS).
    marker: str
    # 1-based line number of the match within the scanned file.
    line: int
    # Text captured after the marker, or the stripped source line when nothing
    # follows it; capped at 180 characters by the scanner.
    detail: str
    # Base priority contribution looked up in _MARKER_WEIGHTS for this marker.
    weight: float
def __init__(
    self,
    repository: Repository[AutoResearchPlanRead],
    *,
    repo_root: Path | None = None,
    worker_service: OpenHandsWorkerService | None = None,
    dispatch_runner: Callable[[JobSpec], RunSummary] | None = None,
    remote_adapter: RemoteDispatchAdapter | None = None,
    writer_lease: WriterLeaseService | None = None,
    upstream_watcher: UpstreamWatcherService | None = None,
) -> None:
    """Wire the planner to its storage and collaborators.

    Every collaborator except ``repository`` is optional; a falsy value is
    replaced by a default built here.

    Args:
        repository: Store used to save and load plans.
        repo_root: Directory to scan; defaults to
            ``Path(__file__).resolve().parents[4]``.
        worker_service: Defaults to a new ``OpenHandsWorkerService``.
        dispatch_runner: Callable that runs a job; defaults to
            ``self._default_dispatch_runner``.
        remote_adapter: Defaults to a ``FakeRemoteAdapter`` that runs jobs
            through ``dispatch_runner`` against ``repo_root``.
        writer_lease: Defaults to a new ``WriterLeaseService``.
        upstream_watcher: Optional watcher; stored as given (may be None).
    """
    if not repo_root:
        repo_root = Path(__file__).resolve().parents[4]
    self._repository = repository
    self._repo_root = repo_root.resolve()

    if not worker_service:
        worker_service = OpenHandsWorkerService()
    self._worker_service = worker_service

    if not dispatch_runner:
        dispatch_runner = self._default_dispatch_runner
    self._dispatch_runner = dispatch_runner

    # The fallback adapter needs the resolved root and the runner chosen above.
    if not remote_adapter:
        remote_adapter = FakeRemoteAdapter(
            repo_root=self._repo_root,
            local_runner=self._dispatch_runner,
        )
    self._remote_adapter = remote_adapter

    if not writer_lease:
        writer_lease = WriterLeaseService()
    self._writer_lease = writer_lease

    self._upstream_watcher = upstream_watcher
def get(self, plan_id: str) -> AutoResearchPlanRead | None:
    """Look up a stored plan by id.

    Args:
        plan_id: Identifier the plan was saved under.

    Returns:
        Whatever the repository returns for ``plan_id`` (None when it has no
        such entry, per the return annotation).
    """
    stored_plan = self._repository.get(plan_id)
    return stored_plan
def update_delivery(
    self,
    plan_id: str,
    *,
    telegram_uid: str | None,
    panel_action_url: str | None,
    notification_sent: bool,
) -> AutoResearchPlanRead:
    """Record delivery details on a plan while holding its writer lease.

    Args:
        plan_id: Plan to update.
        telegram_uid: New recipient id; a blank or None value keeps the
            plan's current ``telegram_uid``.
        panel_action_url: Stored as given, including None.
        notification_sent: Stored as given.

    Returns:
        The plan as returned by the repository after saving.

    Raises:
        KeyError: Propagated from ``_require_plan`` when the plan is unknown.
    """
    requested_uid = (telegram_uid or "").strip()
    lease_key = f"autoresearch-plan:{plan_id}"
    with self._writer_lease.acquire(lease_key):
        existing = self._require_plan(plan_id)
        changes = {
            "telegram_uid": requested_uid if requested_uid else existing.telegram_uid,
            "panel_action_url": panel_action_url,
            "notification_sent": notification_sent,
            "updated_at": utc_now(),
        }
        refreshed = existing.model_copy(update=changes)
        return self._repository.save(refreshed.plan_id, refreshed)
def execute_dispatch(self, plan_id: str) -> AutoResearchPlanRead:
    """Run a plan's dispatch through the remote adapter and persist the outcome.

    The adapter call and the polling run outside the writer lease; the lease
    is only taken to re-read the plan and save the result.

    Args:
        plan_id: Plan to dispatch.

    Returns:
        The plan as returned by the repository after saving the outcome.

    Raises:
        KeyError: Propagated from ``_require_plan`` when the plan is unknown.
        ValueError: When the plan has no worker spec.
    """
    plan = self._require_plan(plan_id)
    if plan.worker_spec is None:
        raise ValueError("plan does not have a worker spec")

    dispatch_job, _, task_spec = self._prepare_dispatch(plan)
    try:
        self._remote_adapter.dispatch(task_spec)
        remote_summary = self._await_remote_summary(task_spec)
    except Exception as exc:
        # Deliberate boundary: any adapter failure is converted into a failed
        # summary so the plan is still updated instead of left dispatching.
        remote_summary = self._dispatch_exception_summary(task_spec, exc)

    # Success and failure share one persistence path.
    with self._writer_lease.acquire(f"autoresearch-plan:{plan_id}"):
        current = self._require_plan(plan_id)
        # One clock reading keeps completion and update timestamps identical,
        # matching how request_dispatch stamps its fields.
        completed_at = utc_now()
        updated = current.model_copy(
            update={
                "agent_job": dispatch_job,
                "dispatch_status": self._derive_dispatch_status(remote_summary),
                "dispatch_completed_at": completed_at,
                "dispatch_run": self._record_from_summary(remote_summary),
                "run_summary": remote_summary.run_summary,
                "dispatch_error": self._dispatch_error_from_summary(remote_summary),
                "updated_at": completed_at,
            }
        )
        return self._repository.save(updated.plan_id, updated)
def _marker_candidates(self) -> list[AutoResearchPlannerCandidateRead]:
    """Build one "marker_backlog" candidate per file that contains backlog markers.

    The score is the highest marker weight in the file, plus 6.0 per
    additional occurrence (capped at 18.0), plus the criticality bonus for
    the path, plus 15.0 when none of the suggested test files exists.

    Returns:
        Candidates in file-iteration order; sorting happens in the caller.
    """
    candidates: list[AutoResearchPlannerCandidateRead] = []
    for rel_path in self._iter_python_files():
        occurrences = self._find_markers(rel_path)
        if not occurrences:
            continue
        suggested_tests = self._infer_test_paths(rel_path)
        # Evaluate each signal once so the score and the evidence list cannot
        # disagree (the test lookup touches the filesystem).
        criticality_bonus = self._criticality_bonus(rel_path)
        missing_test = not self._has_existing_test(suggested_tests)

        score = max(item.weight for item in occurrences)
        score += min(18.0, (len(occurrences) - 1) * 6.0)
        score += criticality_bonus
        if missing_test:
            score += 15.0

        first = occurrences[0]
        marker_list = ", ".join(item.marker for item in occurrences[:3])
        # Only the first five occurrences are reported as evidence.
        evidence = [
            AutoResearchPlannerEvidenceRead(
                kind="marker",
                path=rel_path,
                line=item.line,
                detail=f"{item.marker}: {item.detail}".strip(),
                weight=item.weight,
            )
            for item in occurrences[:5]
        ]
        if criticality_bonus > 0:
            evidence.append(
                AutoResearchPlannerEvidenceRead(
                    kind="hotspot",
                    path=rel_path,
                    detail="critical control-plane hotspot",
                    weight=criticality_bonus,
                )
            )
        if missing_test:
            evidence.append(
                AutoResearchPlannerEvidenceRead(
                    kind="test_gap",
                    path=suggested_tests[0],
                    detail="direct regression test is missing and should be added",
                    weight=15.0,
                )
            )

        candidates.append(
            AutoResearchPlannerCandidateRead(
                candidate_id=create_resource_id("candidate"),
                title=f"Resolve {first.marker} backlog in {rel_path}",
                summary=(
                    f"Address {marker_list} markers in {rel_path} and keep the patch focused "
                    f"to the source file plus a targeted regression test."
                ),
                category="marker_backlog",
                priority_score=round(score, 1),
                source_path=rel_path,
                allowed_paths=[rel_path, *suggested_tests],
                suggested_test_paths=suggested_tests,
                test_command=self._build_test_command(rel_path, suggested_tests),
                evidence=evidence,
                metadata={
                    "marker_count": len(occurrences),
                    "primary_marker": first.marker,
                },
            )
        )
    return candidates
def _build_worker_spec(
    self,
    *,
    plan_id: str,
    candidate: AutoResearchPlannerCandidateRead,
    request: AutoResearchPlannerRequest,
) -> OpenHandsWorkerJobSpec:
    """Translate the selected candidate into a worker job spec.

    Args:
        plan_id: Id of the plan being created; used in the job id and metadata.
        candidate: Candidate chosen by the scan.
        request: Planner request supplying goal, pipeline target, base branch,
            iteration limit, approval flag and caller metadata.

    Returns:
        A job spec whose metadata is the request metadata overlaid with the
        planner, branch, commit and pull-request fields built here.
    """
    # Text after the last underscore of the candidate id.
    branch_suffix = candidate.candidate_id.rpartition("_")[2]
    path_slug = self._slugify(candidate.source_path)
    primary_evidence = candidate.evidence[0].detail if candidate.evidence else "n/a"
    headline = f"AutoResearch: {candidate.title}"

    problem_statement = (
        f"{candidate.summary}\n\n"
        f"Goal: {request.goal}\n"
        f"Selected source: {candidate.source_path}\n"
        f"Primary evidence: {primary_evidence}"
    )
    planner_metadata = {
        "planner_plan_id": plan_id,
        "planner_candidate_id": candidate.candidate_id,
        "planner_score": candidate.priority_score,
        "planner_category": candidate.category,
        "approval_granted": request.approval_granted,
        "branch_name": f"codex/autoresearch/{path_slug}-{branch_suffix[:6]}",
        "commit_message": headline,
        "pr_title": headline,
        "pr_body": candidate.summary,
        "base_branch": request.target_base_branch,
    }
    return OpenHandsWorkerJobSpec(
        job_id=f"{plan_id}-{branch_suffix}",
        problem_statement=problem_statement,
        allowed_paths=list(candidate.allowed_paths),
        test_command=candidate.test_command,
        pipeline_target=request.pipeline_target,
        target_base_branch=request.target_base_branch,
        max_iterations=request.max_iterations,
        # Planner keys win over caller-supplied keys of the same name.
        metadata={**request.metadata, **planner_metadata},
    )
def _select_dispatch_mode(self, plan: AutoResearchPlanRead) -> SelectedRuntimeMode:
    """Choose the runtime mode for dispatching ``plan``.

    Reads ``runtime_mode`` and ``remote_available`` from the plan metadata.
    When the metadata has no ``remote_available`` key, the
    ``AUTORESEARCH_REMOTE_AVAILABLE`` environment variable is used instead.

    Args:
        plan: Plan whose metadata carries the dispatch preferences.

    Returns:
        The mode returned by ``select_mode`` for this repository root.
    """
    plan_metadata = plan.metadata
    mode_text = str(plan_metadata.get("runtime_mode") or "").strip()
    requested_mode = mode_text if mode_text else None

    env_value = os.environ.get("AUTORESEARCH_REMOTE_AVAILABLE")
    raw_remote_flag = plan_metadata.get("remote_available", env_value)
    remote_available = self._coerce_bool(raw_remote_flag, default=False)

    return select_mode(
        self._repo_root,
        requested_mode=requested_mode,
        remote_available=remote_available,
    )
def _queued_dispatch_run(
    self,
    *,
    task_spec: RemoteTaskSpec,
    selected_mode: SelectedRuntimeMode,
) -> RemoteRunRecord:
    """Build the placeholder QUEUED run record stored when dispatch is requested.

    Args:
        task_spec: Task about to be dispatched; supplies run id and lanes.
        selected_mode: Chosen runtime mode; supplies the fallback reason and
            the policy values copied into the record metadata.

    Returns:
        A ``RemoteRunRecord`` with status ``QUEUED``.
    """
    mode_policy = selected_mode.policy
    record_metadata = {
        "runtime_mode": selected_mode.name,
        "dispatch_max_concurrency": mode_policy.max_concurrency,
        "dispatch_token_budget": mode_policy.token_budget,
    }
    queued_message = f"dispatch queued for {task_spec.lane.value} lane"
    return RemoteRunRecord(
        run_id=task_spec.run_id,
        requested_lane=task_spec.requested_lane,
        lane=task_spec.lane,
        status=RemoteRunStatus.QUEUED,
        summary=queued_message,
        fallback_reason=selected_mode.fallback_reason,
        metadata=record_metadata,
    )
def _planner_stalled_summary(self, task_spec: RemoteTaskSpec, *, detail: str) -> RemoteRunSummary:
    """Synthesize a STALLED summary when the planner gives up waiting on a run.

    Args:
        task_spec: Task that was dispatched.
        detail: Human-readable explanation stored as the summary text.

    Returns:
        A ``RemoteRunSummary`` with status ``STALLED`` whose start, update and
        finish timestamps are all the same instant.
    """
    outcome = classify_remote_terminal(status=RemoteRunStatus.STALLED, stage="planner")
    stamp = utc_now()
    # Blank or missing fallback reasons are normalised to None.
    fallback_text = str(task_spec.metadata.get("fallback_reason") or "").strip()
    return RemoteRunSummary(
        run_id=task_spec.run_id,
        requested_lane=task_spec.requested_lane,
        lane=task_spec.lane,
        status=RemoteRunStatus.STALLED,
        failure_class=outcome.failure_class,
        recovery_action=outcome.recovery_action,
        summary=detail,
        started_at=stamp,
        updated_at=stamp,
        finished_at=stamp,
        fallback_reason=fallback_text if fallback_text else None,
        metadata={"runtime_mode": task_spec.runtime_mode},
    )
@classmethod
def _derive_dispatch_status(cls, summary: RemoteRunSummary) -> AutoResearchPlanDispatchStatus:
    """Map a remote run summary onto the plan's dispatch status.

    Args:
        summary: Terminal summary of the remote run.

    Returns:
        ``DISPATCHED`` when the remote status is ``SUCCEEDED`` and the run
        summary is either absent or has a final status of
        "ready_for_promotion" or "promoted"; ``FAILED`` in every other case.
    """
    if summary.status is not RemoteRunStatus.SUCCEEDED:
        return AutoResearchPlanDispatchStatus.FAILED
    run_summary = summary.run_summary
    accepted = run_summary is None or run_summary.final_status in {
        "ready_for_promotion",
        "promoted",
    }
    if accepted:
        return AutoResearchPlanDispatchStatus.DISPATCHED
    return AutoResearchPlanDispatchStatus.FAILED
def _iter_python_files(self) -> list[str]:
    """List the repository's Python files that the planner may consider.

    Searches the ``src``, ``scripts`` and ``tests`` directories under the
    repository root recursively for ``*.py`` and drops every path that
    ``_is_ignored`` rejects. Missing top-level directories are skipped.

    Returns:
        Sorted, de-duplicated POSIX-style paths relative to the repository root.
    """
    discovered: set[str] = set()
    for top_level in ("src", "scripts", "tests"):
        base_dir = self._repo_root / top_level
        if base_dir.exists():
            relative_paths = (
                found.relative_to(self._repo_root).as_posix()
                for found in base_dir.rglob("*.py")
            )
            discovered.update(
                rel_path for rel_path in relative_paths if not self._is_ignored(rel_path)
            )
    return sorted(discovered)
def _count_lines(self, rel_path: str) -> int:
    """Count the lines of a repository file, treating unreadable files as empty.

    Args:
        rel_path: Path relative to the repository root.

    Returns:
        Number of lines in the UTF-8 decoded file, or 0 when the file cannot
        be read or decoded.
    """
    try:
        return len((self._repo_root / rel_path).read_text(encoding="utf-8").splitlines())
    except (OSError, UnicodeDecodeError):
        # A file that vanished, a broken symlink or a permission error should
        # not abort the whole scan; such a file simply never reaches the
        # size threshold.
        return 0
# Named groups (owner / repo / number) are required: the service reads matches
# with group("owner"), group("repo") and group("number").

# Full issue URL, optionally followed by a path, query string or fragment.
_ISSUE_URL_RE = re.compile(
    r"^https://github\.com/(?P<owner>[A-Za-z0-9_.-]+)/(?P<repo>[A-Za-z0-9_.-]+)/issues/(?P<number>\d+)(?:[/?#].*)?$"
)
# Short form: owner/repo#123.
_ISSUE_REF_RE = re.compile(r"^(?P<owner>[A-Za-z0-9_.-]+)/(?P<repo>[A-Za-z0-9_.-]+)#(?P<number>\d+)$")
# Bare issue number with an optional leading "#".
_ISSUE_NUMBER_RE = re.compile(r"^#?(?P<number>\d+)$")
# HTTPS or SSH GitHub remote; a trailing ".git" is not part of the repo name.
_REMOTE_RE = re.compile(
    r"^(?:https://github\.com/|git@github\.com:)(?P<owner>[A-Za-z0-9_.-]+)/(?P<repo>[A-Za-z0-9_.-]+?)(?:\.git)?$"
)
) + + raise ValueError("unsupported GitHub issue reference; use URL, owner/repo#123, or #123") + + def fetch_issue(self, raw_reference: str) -> GitHubIssueRead: + reference = self.resolve_issue_reference(raw_reference) + payload = self._run_gh_json( + [ + "issue", + "view", + str(reference.number), + "--repo", + reference.repo_full_name, + "--json", + "number,title,body,url,state,author,labels,comments", + ] + ) + comments = tuple( + GitHubIssueCommentRead( + author=str((item.get("author") or {}).get("login") or "unknown"), + body=str(item.get("body") or ""), + created_at=str(item.get("createdAt") or "") or None, + ) + for item in payload.get("comments", []) + ) + labels = tuple( + str(item.get("name") or "").strip() + for item in payload.get("labels", []) + if str(item.get("name") or "").strip() + ) + return GitHubIssueRead( + reference=reference, + title=str(payload.get("title") or "").strip(), + body=str(payload.get("body") or ""), + url=str(payload.get("url") or reference.url).strip(), + state=str(payload.get("state") or "UNKNOWN").strip(), + author=str((payload.get("author") or {}).get("login") or "unknown"), + labels=labels, + comments=comments, + ) + + def build_manager_prompt(self, issue: GitHubIssueRead, *, operator_note: str | None = None) -> str: + lines = [ + "Resolve the following GitHub issue in the current repository through the existing patch-only manager pipeline.", + "", + f"Issue: {issue.reference.display}", + f"URL: {issue.url}", + f"Title: {issue.title or '(untitled)'}", + f"State: {issue.state}", + f"Author: {issue.author}", + ] + if issue.labels: + lines.append(f"Labels: {', '.join(issue.labels)}") + if operator_note: + lines.extend(["", "Operator note:", operator_note.strip()]) + lines.extend(["", "Issue body:", issue.body.strip() or "(empty)"]) + + recent_comments = [item for item in issue.comments if item.body.strip()][-3:] + if recent_comments: + lines.extend(["", "Recent comments:"]) + for item in recent_comments: + lines.append(f"- 
{item.author}: {item.body.strip()}") + + lines.extend( + [ + "", + "Deliver the smallest useful fix, stay within scoped files, update tests when needed, and prepare a draft PR when possible.", + ] + ) + return "\n".join(lines).strip() + + def post_comment(self, raw_reference: str, body: str) -> str: + reference = self.resolve_issue_reference(raw_reference) + completed = self._run_gh( + [ + "issue", + "comment", + str(reference.number), + "--repo", + reference.repo_full_name, + "--body", + body.strip(), + ] + ) + return (completed.stdout or "").strip() + + def _resolve_current_repo(self) -> tuple[str, str]: + completed = subprocess.run( + ["git", "remote", "get-url", "origin"], + cwd=self._repo_root, + capture_output=True, + text=True, + check=False, + ) + if completed.returncode != 0: + raise ValueError("cannot resolve current GitHub repo from git origin") + remote = (completed.stdout or "").strip() + matched = _REMOTE_RE.match(remote) + if not matched: + raise ValueError("current git origin is not a supported GitHub remote") + return matched.group("owner"), matched.group("repo") + + def _run_gh_json(self, args: list[str]) -> dict[str, object]: + completed = self._run_gh(args) + try: + payload = json.loads((completed.stdout or "").strip() or "{}") + except json.JSONDecodeError as exc: + raise RuntimeError("gh returned invalid JSON for issue query") from exc + if not isinstance(payload, dict): + raise RuntimeError("gh returned an unexpected payload for issue query") + return payload + + def _run_gh(self, args: list[str]) -> subprocess.CompletedProcess[str]: + completed = subprocess.run( + [self._gh_binary, *args], + cwd=self._repo_root, + capture_output=True, + text=True, + check=False, + ) + if completed.returncode != 0: + detail = (completed.stderr or completed.stdout or "gh command failed").strip() + raise RuntimeError(detail) + return completed diff --git a/src/autoresearch/core/services/housekeeper.py b/src/autoresearch/core/services/housekeeper.py new file 
mode 100644 index 00000000..e6735c0e --- /dev/null +++ b/src/autoresearch/core/services/housekeeper.py @@ -0,0 +1,842 @@ +from __future__ import annotations + +from dataclasses import dataclass +from datetime import datetime, time, timedelta +import hashlib +import math +from typing import Any +from zoneinfo import ZoneInfo + +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.autoresearch_planner_contract import AutoResearchPlannerRequest +from autoresearch.shared.housekeeper_contract import ( + AdmissionRiskLevel, + CircuitBreakerStateRead, + CircuitBreakerStatus, + DeferredReason, + ExecutionProfileRead, + ExplorationBlockerReason, + ExplorationDedupKeyRead, + ExplorationRecordRead, + HousekeeperChangeReason, + HousekeeperMode, + HousekeeperModeUpdateRequest, + HousekeeperMorningSummaryRead, + HousekeeperStateRead, + HousekeeperTickRead, + NightBudgetStateRead, + TaskAdmissionAssessmentRead, +) +from autoresearch.shared.manager_agent_contract import ManagerDispatchRead, ManagerDispatchRequest, ManagerPlanStrategy +from autoresearch.shared.media_job_contract import MediaJobRead, MediaJobStatus +from autoresearch.shared.models import ApprovalStatus, JobStatus, utc_now +from autoresearch.shared.store import Repository, create_resource_id + + +@dataclass(frozen=True, slots=True) +class _NightWindow: + start: datetime + end: datetime + + +class ExecutionProfileResolver: + _PROFILES = { + HousekeeperMode.DAY_SAFE: ExecutionProfileRead( + profile_name=HousekeeperMode.DAY_SAFE, + pipeline_target="patch", + max_iterations=1, + auto_dispatch_allowed=False, + parallelism=1, + allow_draft_pr=False, + allow_repo_write=True, + allow_network=False, + allow_long_task_minutes=15, + ), + 
HousekeeperMode.NIGHT_READONLY_EXPLORE: ExecutionProfileRead( + profile_name=HousekeeperMode.NIGHT_READONLY_EXPLORE, + pipeline_target="patch", + max_iterations=2, + auto_dispatch_allowed=True, + parallelism=2, + allow_draft_pr=False, + allow_repo_write=True, + allow_network=False, + allow_long_task_minutes=90, + ), + HousekeeperMode.NIGHT_EXPLORE: ExecutionProfileRead( + profile_name=HousekeeperMode.NIGHT_EXPLORE, + pipeline_target="draft_pr", + max_iterations=2, + auto_dispatch_allowed=True, + parallelism=3, + allow_draft_pr=True, + allow_repo_write=True, + allow_network=False, + allow_long_task_minutes=120, + ), + } + + def resolve(self, mode: HousekeeperMode) -> ExecutionProfileRead: + return self._PROFILES[mode].model_copy(deep=True) + + +class HousekeeperService: + def __init__( + self, + *, + state_repository: Repository[HousekeeperStateRead], + budget_repository: Repository[NightBudgetStateRead], + exploration_repository: Repository[ExplorationRecordRead], + timezone_name: str = "Asia/Shanghai", + summary_chat_id: str | None = None, + profile_resolver: ExecutionProfileResolver | None = None, + ) -> None: + self._state_repository = state_repository + self._budget_repository = budget_repository + self._exploration_repository = exploration_repository + self._timezone = ZoneInfo(timezone_name) + self._summary_chat_id = (summary_chat_id or "").strip() or None + self._profile_resolver = profile_resolver or ExecutionProfileResolver() + + def get_state(self, *, now: datetime | None = None) -> HousekeeperStateRead: + current = self._state_repository.get("housekeeper") + resolved_now = self._normalize_now(now) + if current is None: + scheduled = self._scheduled_mode_for(resolved_now) + current = HousekeeperStateRead( + scheduled_mode=scheduled, + effective_mode=scheduled, + reason=HousekeeperChangeReason.SCHEDULE, + changed_by="system", + last_changed_at=resolved_now, + ) + return self._state_repository.save(current.state_id, current) + + updated = 
self._refresh_state(current=current, now=resolved_now) + if updated != current: + return self._state_repository.save(updated.state_id, updated) + return updated + + def update_mode(self, request: HousekeeperModeUpdateRequest, *, now: datetime | None = None) -> HousekeeperStateRead: + current = self.get_state(now=now) + resolved_now = self._normalize_now(now) + if request.action == "set_manual_override": + if request.target_mode is None: + raise ValueError("target_mode is required for set_manual_override") + updated = current.model_copy( + update={ + "manual_override_mode": request.target_mode, + "effective_until": request.effective_until or self._next_boundary(resolved_now), + "reason": request.reason, + "changed_by": request.changed_by.strip(), + "last_changed_at": resolved_now, + "metadata": {**current.metadata, **request.metadata}, + } + ) + elif request.action == "clear_manual_override": + updated = current.model_copy( + update={ + "manual_override_mode": None, + "effective_until": None, + "reason": request.reason, + "changed_by": request.changed_by.strip(), + "last_changed_at": resolved_now, + "metadata": {**current.metadata, **request.metadata}, + } + ) + elif request.action == "ack_circuit_breaker": + breaker = current.circuit_breaker_state.model_copy( + update={ + "status": CircuitBreakerStatus.CLOSED, + "acknowledged_at": resolved_now, + "reason": None, + "metadata": {**current.circuit_breaker_state.metadata, **request.metadata}, + } + ) + updated = current.model_copy( + update={ + "circuit_breaker_state": breaker, + "reason": request.reason, + "changed_by": request.changed_by.strip(), + "last_changed_at": resolved_now, + } + ) + else: + if request.target_mode is None: + raise ValueError("target_mode is required for apply_schedule") + updated = current.model_copy( + update={ + "scheduled_mode": request.target_mode, + "reason": request.reason, + "changed_by": request.changed_by.strip(), + "last_changed_at": resolved_now, + "metadata": {**current.metadata, 
**request.metadata}, + } + ) + + refreshed = self._refresh_state(current=updated, now=resolved_now) + return self._state_repository.save(refreshed.state_id, refreshed) + + def prepare_manager_request( + self, + request: ManagerDispatchRequest, + *, + manager_service: ManagerAgentService, + trigger_source: str, + now: datetime | None = None, + ) -> tuple[ManagerDispatchRequest, TaskAdmissionAssessmentRead, HousekeeperStateRead]: + state = self.get_state(now=now) + profile = self._profile_resolver.resolve(state.effective_mode) + assessment = manager_service.assess_request(request) + auto_dispatch = bool(request.auto_dispatch and profile.auto_dispatch_allowed) + deferred_reason: DeferredReason | None = None + + if state.circuit_breaker_state.status is CircuitBreakerStatus.OPEN: + auto_dispatch = False + deferred_reason = DeferredReason.CIRCUIT_BREAKER_OPEN + elif request.auto_dispatch and not profile.auto_dispatch_allowed: + auto_dispatch = False + deferred_reason = DeferredReason.DEFERRED_TO_NIGHT + elif auto_dispatch and not self._admission_allows(profile=profile, assessment=assessment): + auto_dispatch = False + deferred_reason = ( + DeferredReason.APPROVAL_REQUIRED + if assessment.risk_level is AdmissionRiskLevel.HIGH + else DeferredReason.DEFERRED_TO_NIGHT + ) + + updated = request.model_copy( + update={ + "pipeline_target": profile.pipeline_target, + "max_iterations": profile.max_iterations, + "auto_dispatch": auto_dispatch, + "metadata": { + **request.metadata, + "execution_profile": profile.model_dump(mode="json"), + "trigger_source": trigger_source, + "scheduled_window": state.effective_mode.value, + "admission_assessment": assessment.model_dump(mode="json"), + "deferred_reason": deferred_reason.value if deferred_reason is not None else None, + }, + } + ) + return updated, assessment, state + + def prepare_planner_request( + self, + request: AutoResearchPlannerRequest, + *, + trigger_source: str, + now: datetime | None = None, + ) -> 
tuple[AutoResearchPlannerRequest, TaskAdmissionAssessmentRead, HousekeeperStateRead]: + state = self.get_state(now=now) + profile = self._profile_resolver.resolve(state.effective_mode) + assessment = self.assess_planner_request(request) + deferred_reason: DeferredReason | None = None + if state.circuit_breaker_state.status is CircuitBreakerStatus.OPEN: + deferred_reason = DeferredReason.CIRCUIT_BREAKER_OPEN + + updated = request.model_copy( + update={ + "pipeline_target": profile.pipeline_target, + "max_iterations": profile.max_iterations, + "metadata": { + **request.metadata, + "execution_profile": profile.model_dump(mode="json"), + "trigger_source": trigger_source, + "scheduled_window": state.effective_mode.value, + "admission_assessment": assessment.model_dump(mode="json"), + "deferred_reason": deferred_reason.value if deferred_reason is not None else None, + }, + } + ) + return updated, assessment, state + + def assess_planner_request(self, request: AutoResearchPlannerRequest) -> TaskAdmissionAssessmentRead: + estimated_runtime = 12 if request.pipeline_target == "patch" else 20 + if request.max_candidates > 5: + estimated_runtime += 10 + if request.include_upstream_watch: + estimated_runtime += 10 + risk = AdmissionRiskLevel.LOW if estimated_runtime <= 15 else AdmissionRiskLevel.MEDIUM + return TaskAdmissionAssessmentRead( + plan_shape="planner_candidate", + estimated_runtime_minutes=estimated_runtime, + requires_repo_write=True, + requires_network=False, + fanout_count=1, + risk_level=risk, + ) + + def create_morning_summary( + self, + *, + manager_service: ManagerAgentService, + planner_service: AutoResearchPlannerService, + approval_service: ApprovalStoreService, + notifier: TelegramNotifierService, + media_jobs: list[MediaJobRead] | None = None, + now: datetime | None = None, + ) -> HousekeeperMorningSummaryRead: + state = self.get_state(now=now) + resolved_now = self._normalize_now(now) + window = self._previous_night_window(resolved_now) + + 
completed_items: list[str] = [] + blocked_items: list[str] = [] + decision_items: list[str] = [] + queue_items: list[str] = [] + + for dispatch in manager_service.list_dispatches(): + if not self._within_window(dispatch.updated_at, window): + continue + line = f"{dispatch.dispatch_id}: {dispatch.summary}" + if dispatch.status is JobStatus.COMPLETED: + completed_items.append(line) + elif dispatch.status is JobStatus.FAILED: + blocked_items.append(line) + + for plan in planner_service.list(): + if not self._within_window(plan.updated_at, window): + continue + line = f"{plan.plan_id}: {plan.summary}" + if plan.dispatch_status.value == "dispatched" and plan.run_summary is not None: + completed_items.append(line) + elif plan.dispatch_status.value == "failed": + blocked_items.append(line) + + for job in media_jobs or []: + if not self._within_window(job.updated_at, window): + continue + line = f"{job.job_id}: {job.mode.value} -> {job.status.value}" + if job.status is MediaJobStatus.COMPLETED: + completed_items.append(line) + elif job.status is MediaJobStatus.FAILED: + blocked_items.append(line) + + for approval in approval_service.list_requests(status=ApprovalStatus.PENDING, limit=20): + decision_items.append(f"{approval.approval_id}: {approval.title}") + + for dispatch in manager_service.list_dispatches(): + if dispatch.status in {JobStatus.CREATED, JobStatus.QUEUED}: + queue_items.append(f"manager {dispatch.dispatch_id}: {dispatch.summary}") + for plan in planner_service.list_pending(limit=20): + queue_items.append(f"plan {plan.plan_id}: {plan.summary}") + + summary_text = "\n".join( + [ + "昨夜完成了什么", + *(f"- {item}" for item in (completed_items or ["无"])), + "", + "失败/阻塞了什么", + *(f"- {item}" for item in (blocked_items or ["无"])), + "", + "今天需要你决定什么", + *(f"- {item}" for item in (decision_items or ["无"])), + "", + "系统当前模式与待执行队列", + f"- mode: {state.effective_mode.value}", + *(f"- {item}" for item in (queue_items or ["无"])), + ] + ) + + sent = bool(self._summary_chat_id 
and notifier.enabled and notifier.send_message(chat_id=self._summary_chat_id, text=summary_text)) + updated_state = state.model_copy(update={"last_summary_at": resolved_now}) + self._state_repository.save(updated_state.state_id, updated_state) + return HousekeeperMorningSummaryRead( + sent=sent, + summary_text=summary_text, + completed_items=completed_items, + blocked_items=blocked_items, + decision_items=decision_items, + queue_items=queue_items, + state=updated_state, + ) + + def execute_night_explore_tick( + self, + *, + manager_service: ManagerAgentService, + planner_service: AutoResearchPlannerService, + notifier: TelegramNotifierService, + media_jobs: list[MediaJobRead] | None = None, + now: datetime | None = None, + ) -> HousekeeperTickRead: + state = self.get_state(now=now) + resolved_now = self._normalize_now(now) + budget = self._get_or_create_budget(resolved_now) + + if state.circuit_breaker_state.status is CircuitBreakerStatus.OPEN: + return HousekeeperTickRead( + executed=False, + skipped_reason="circuit_breaker_open", + blocker_reason=ExplorationBlockerReason.CIRCUIT_BREAKER_OPEN, + summary="Night explore skipped because the circuit breaker is open.", + state=state, + budget=budget, + ) + if state.effective_mode not in {HousekeeperMode.NIGHT_READONLY_EXPLORE, HousekeeperMode.NIGHT_EXPLORE}: + return HousekeeperTickRead( + executed=False, + skipped_reason="not_in_night_mode", + summary="Night explore skipped because the effective mode is not a night mode.", + state=state, + budget=budget, + ) + if self._budget_exhausted(budget): + self._record_budget_blocker(now=resolved_now) + return HousekeeperTickRead( + executed=False, + skipped_reason="budget_exhausted", + blocker_reason=ExplorationBlockerReason.BUDGET_EXHAUSTED, + summary="Night explore skipped because the nightly budget is exhausted.", + state=state, + budget=budget, + ) + + pending_dispatch = self._pick_pending_manager_dispatch(manager_service=manager_service) + if pending_dispatch is not None: 
+ dedup_key = self._build_dispatch_dedup_key(pending_dispatch) + if self._is_dedup_blocked(dedup_key=dedup_key, blocker_reason=None, now=resolved_now): + return HousekeeperTickRead( + executed=False, + skipped_reason="dedup_blocked", + target_kind="manager_dispatch", + target_id=pending_dispatch.dispatch_id, + blocker_reason=ExplorationBlockerReason.UNKNOWN, + summary="Night explore skipped a deferred manager dispatch because an equivalent attempt ran recently.", + state=state, + budget=budget, + ) + result = manager_service.execute_dispatch(pending_dispatch.dispatch_id) + blocker = self._blocker_from_dispatch(result) + budget = self._consume_budget(budget=budget, dispatch=result) + self._record_exploration_attempt( + dedup_key=dedup_key, + target_kind="manager_dispatch", + target_id=result.dispatch_id, + blocker_reason=blocker, + final_status=result.status.value, + metadata={"summary": result.summary}, + now=resolved_now, + ) + state = self._update_circuit_breaker_state(notifier=notifier, media_jobs=media_jobs, now=resolved_now) + return HousekeeperTickRead( + executed=True, + target_kind="manager_dispatch", + target_id=result.dispatch_id, + blocker_reason=blocker, + summary=result.summary, + state=state, + budget=budget, + ) + + profile = self._profile_resolver.resolve(state.effective_mode) + planner_request = AutoResearchPlannerRequest( + goal="Scan the repo for the next safe patch-only improvement.", + pipeline_target=profile.pipeline_target, + max_iterations=profile.max_iterations, + include_upstream_watch=True, + metadata={"trigger_source": "night_explore_tick"}, + ) + planner_request, _, _ = self.prepare_planner_request( + planner_request, + trigger_source="night_explore_tick", + now=resolved_now, + ) + plan = planner_service.create(planner_request) + if plan.selected_candidate is None: + return HousekeeperTickRead( + executed=False, + skipped_reason="no_candidate", + summary="Night explore did not find a new planner candidate.", + state=state, + 
budget=budget, + ) + + dedup_key = self._build_plan_dedup_key(plan) + if self._is_dedup_blocked(dedup_key=dedup_key, blocker_reason=None, now=resolved_now): + return HousekeeperTickRead( + executed=False, + skipped_reason="dedup_blocked", + target_kind="planner_dispatch", + target_id=plan.plan_id, + blocker_reason=ExplorationBlockerReason.UNKNOWN, + summary="Night explore skipped a planner candidate because an equivalent attempt ran recently.", + state=state, + budget=budget, + ) + + queued = planner_service.request_dispatch(plan.plan_id, requested_by="housekeeper") + result = planner_service.execute_dispatch(queued.plan_id) + blocker = self._blocker_from_plan(result) + budget = self._consume_budget(budget=budget, plan=result) + self._record_exploration_attempt( + dedup_key=dedup_key, + target_kind="planner_dispatch", + target_id=result.plan_id, + blocker_reason=blocker, + final_status=result.dispatch_status.value, + metadata={"summary": result.summary}, + now=resolved_now, + ) + state = self._update_circuit_breaker_state(notifier=notifier, media_jobs=media_jobs, now=resolved_now) + return HousekeeperTickRead( + executed=True, + target_kind="planner_dispatch", + target_id=result.plan_id, + blocker_reason=blocker, + summary=result.summary, + state=state, + budget=budget, + ) + + def record_media_job_outcome( + self, + *, + job: MediaJobRead, + notifier: TelegramNotifierService, + media_jobs: list[MediaJobRead] | None = None, + now: datetime | None = None, + ) -> HousekeeperStateRead: + resolved_now = self._normalize_now(now) + blocker = None if job.status is MediaJobStatus.COMPLETED else ExplorationBlockerReason.UNKNOWN + dedup = ExplorationDedupKeyRead( + repo_id="media", + target_scope_hash=self._hash_text(job.target_bucket.value), + intent_id=job.mode.value, + normalized_goal_hash=self._hash_text(job.url), + ) + self._record_exploration_attempt( + dedup_key=dedup, + target_kind="media_job", + target_id=job.job_id, + blocker_reason=blocker, + 
final_status=job.status.value, + metadata={"url": job.url}, + now=resolved_now, + ) + return self._update_circuit_breaker_state(notifier=notifier, media_jobs=media_jobs, now=resolved_now) + + def _refresh_state(self, *, current: HousekeeperStateRead, now: datetime) -> HousekeeperStateRead: + scheduled_mode = self._scheduled_mode_for(now) + manual_mode = current.manual_override_mode + effective_until = current.effective_until + if manual_mode is not None and effective_until is not None and effective_until <= now: + manual_mode = None + effective_until = None + + effective_mode = scheduled_mode + if current.circuit_breaker_state.status is CircuitBreakerStatus.OPEN: + effective_mode = HousekeeperMode.DAY_SAFE + elif manual_mode is not None and effective_until is not None and effective_until > now: + effective_mode = manual_mode + + return current.model_copy( + update={ + "scheduled_mode": scheduled_mode, + "manual_override_mode": manual_mode, + "effective_until": effective_until, + "effective_mode": effective_mode, + } + ) + + def _admission_allows(self, *, profile: ExecutionProfileRead, assessment: TaskAdmissionAssessmentRead) -> bool: + if not profile.auto_dispatch_allowed: + return False + if profile.profile_name is not HousekeeperMode.DAY_SAFE: + return True + return ( + assessment.estimated_runtime_minutes <= 15 + and assessment.fanout_count <= 1 + and assessment.risk_level in {AdmissionRiskLevel.LOW, AdmissionRiskLevel.MEDIUM} + and not profile.allow_draft_pr + ) + + def _pick_pending_manager_dispatch(self, *, manager_service: ManagerAgentService) -> ManagerDispatchRead | None: + candidates: list[ManagerDispatchRead] = [] + for dispatch in manager_service.list_dispatches(): + deferred = str(dispatch.metadata.get("deferred_reason") or "").strip() + if dispatch.status in {JobStatus.CREATED, JobStatus.QUEUED} and deferred in {"", DeferredReason.DEFERRED_TO_NIGHT.value}: + candidates.append(dispatch) + candidates.sort(key=lambda item: item.updated_at) + return 
candidates[0] if candidates else None + + def _build_dispatch_dedup_key(self, dispatch: ManagerDispatchRead) -> ExplorationDedupKeyRead: + scope = "|".join(dispatch.selected_intent.allowed_paths if dispatch.selected_intent is not None else []) + return ExplorationDedupKeyRead( + repo_id="repo", + target_scope_hash=self._hash_text(scope), + intent_id=dispatch.selected_intent.intent_id if dispatch.selected_intent is not None else "unknown", + normalized_goal_hash=self._hash_text(dispatch.normalized_goal), + ) + + def _build_plan_dedup_key(self, plan) -> ExplorationDedupKeyRead: + candidate = plan.selected_candidate + scope = "|".join(candidate.allowed_paths if candidate is not None else []) + return ExplorationDedupKeyRead( + repo_id="repo", + target_scope_hash=self._hash_text(scope), + intent_id=(candidate.category if candidate is not None else "planner"), + normalized_goal_hash=self._hash_text(plan.goal), + ) + + def _is_dedup_blocked( + self, + *, + dedup_key: ExplorationDedupKeyRead, + blocker_reason: ExplorationBlockerReason | None, + now: datetime, + ) -> bool: + cutoff = now - timedelta(hours=24) + for record in self._exploration_repository.list(): + if record.created_at < cutoff: + continue + if record.dedup_key != dedup_key: + continue + if record.blocker_reason == blocker_reason: + return True + return False + + def _record_exploration_attempt( + self, + *, + dedup_key: ExplorationDedupKeyRead, + target_kind: str, + target_id: str, + blocker_reason: ExplorationBlockerReason | None, + final_status: str | None, + metadata: dict[str, Any], + now: datetime, + ) -> None: + record = ExplorationRecordRead( + record_id=create_resource_id("explore"), + dedup_key=dedup_key, + target_kind=target_kind, + target_id=target_id, + blocker_reason=blocker_reason, + final_status=final_status, + created_at=now, + updated_at=now, + metadata=metadata, + ) + self._exploration_repository.save(record.record_id, record) + + def _update_circuit_breaker_state( + self, + *, + notifier: 
TelegramNotifierService, + media_jobs: list[MediaJobRead] | None, + now: datetime, + ) -> HousekeeperStateRead: + state = self.get_state(now=now) + recent_records = [ + record + for record in self._exploration_repository.list() + if record.updated_at >= now - timedelta(hours=2) + ] + recent_records.sort(key=lambda item: item.updated_at, reverse=True) + failures = 0 + consecutive_failures = 0 + for index, record in enumerate(recent_records): + failed = record.blocker_reason is not None or str(record.final_status or "").lower() in {"failed", "human_review"} + if failed: + failures += 1 + if index == consecutive_failures and failed: + consecutive_failures += 1 + elif index == consecutive_failures: + break + media_failures = 0 + for job in sorted(media_jobs or [], key=lambda item: item.updated_at, reverse=True): + if job.updated_at < now - timedelta(hours=2): + continue + if job.status is MediaJobStatus.FAILED: + media_failures += 1 + else: + break + + total = len(recent_records) + failure_rate = failures / total if total else 0.0 + should_open = consecutive_failures >= 3 or (total >= 3 and failure_rate >= 0.7) or media_failures >= 3 + if not should_open: + return state + if state.circuit_breaker_state.status is CircuitBreakerStatus.OPEN: + return state + + breaker = CircuitBreakerStateRead( + status=CircuitBreakerStatus.OPEN, + triggered_at=now, + reason="automatic failure threshold exceeded", + consecutive_failures=consecutive_failures, + recent_failure_rate=failure_rate, + metadata={"media_consecutive_failures": media_failures}, + ) + updated = state.model_copy( + update={ + "circuit_breaker_state": breaker, + "effective_mode": HousekeeperMode.DAY_SAFE, + "reason": HousekeeperChangeReason.CIRCUIT_BREAKER, + "changed_by": "system", + "last_changed_at": now, + } + ) + self._state_repository.save(updated.state_id, updated) + if self._summary_chat_id and notifier.enabled: + notifier.send_message( + chat_id=self._summary_chat_id, + text=( + "[housekeeper] circuit breaker 
opened\n" + f"- consecutive_failures: {consecutive_failures}\n" + f"- failure_rate_2h: {failure_rate:.2f}\n" + f"- media_consecutive_failures: {media_failures}" + ), + ) + return updated + + def _get_or_create_budget(self, now: datetime) -> NightBudgetStateRead: + current = self._budget_repository.get("night_budget") + window = self._current_night_window(now) + if current is not None and current.window_start == window.start and current.window_end == window.end: + return current + budget = NightBudgetStateRead( + window_start=window.start, + window_end=window.end, + updated_at=now, + ) + return self._budget_repository.save(budget.budget_id, budget) + + def _consume_budget(self, *, budget: NightBudgetStateRead, dispatch=None, plan=None) -> NightBudgetStateRead: + duration_ms = 0 + draft_prs = budget.draft_prs_used + if dispatch is not None and dispatch.run_summary is not None: + duration_ms = dispatch.run_summary.driver_result.metrics.duration_ms or 0 + if dispatch.run_summary.promotion is not None and dispatch.run_summary.promotion.pr_url: + draft_prs += 1 + if plan is not None and plan.run_summary is not None: + duration_ms = plan.run_summary.driver_result.metrics.duration_ms or 0 + if plan.run_summary.promotion is not None and plan.run_summary.promotion.pr_url: + draft_prs += 1 + updated = budget.model_copy( + update={ + "dispatches_used": budget.dispatches_used + 1, + "draft_prs_used": draft_prs, + "worker_minutes_used": budget.worker_minutes_used + math.ceil(duration_ms / 60000) if duration_ms else budget.worker_minutes_used, + "updated_at": utc_now(), + } + ) + return self._budget_repository.save(updated.budget_id, updated) + + def _budget_exhausted(self, budget: NightBudgetStateRead) -> bool: + return ( + budget.dispatches_used >= budget.max_dispatches_per_night + or budget.draft_prs_used >= budget.max_draft_pr_per_night + or budget.worker_minutes_used >= budget.max_worker_minutes_per_night + ) + + def _record_budget_blocker(self, *, now: datetime) -> None: + 
dedup_key = ExplorationDedupKeyRead( + repo_id="repo", + target_scope_hash=self._hash_text("night_budget"), + intent_id="night_budget", + normalized_goal_hash=self._hash_text("night_budget"), + ) + self._record_exploration_attempt( + dedup_key=dedup_key, + target_kind="planner_dispatch", + target_id="night_budget", + blocker_reason=ExplorationBlockerReason.BUDGET_EXHAUSTED, + final_status="skipped", + metadata={}, + now=now, + ) + + def _blocker_from_dispatch(self, dispatch: ManagerDispatchRead) -> ExplorationBlockerReason | None: + if dispatch.run_summary is None: + return ExplorationBlockerReason.UNKNOWN + return self._blocker_from_run_summary( + final_status=dispatch.run_summary.final_status, + error=dispatch.run_summary.driver_result.error, + ) + + def _blocker_from_plan(self, plan) -> ExplorationBlockerReason | None: + if plan.run_summary is None: + if plan.dispatch_status.value == "failed": + return ExplorationBlockerReason.UNKNOWN + return None + return self._blocker_from_run_summary( + final_status=plan.run_summary.final_status, + error=plan.run_summary.driver_result.error, + ) + + def _blocker_from_run_summary(self, *, final_status: str, error: str | None) -> ExplorationBlockerReason | None: + message = str(error or "").lower() + if not message and final_status in {"ready_for_promotion", "promoted"}: + return None + if "permission" in message: + return ExplorationBlockerReason.PERMISSION_DENIED + if "environmentcheckfailed" in message or "missing" in message: + return ExplorationBlockerReason.ENV_MISSING + if "dirty" in message: + return ExplorationBlockerReason.DIRTY_REPO + if final_status == "human_review": + return ExplorationBlockerReason.APPROVAL_PENDING + if "stalled" in message or "stalled_no_progress" in message: + return ExplorationBlockerReason.STALLED_NO_PROGRESS + if "validation" in message: + return ExplorationBlockerReason.VALIDATION_FAILED + return ExplorationBlockerReason.UNKNOWN + + def _scheduled_mode_for(self, now: datetime) -> 
HousekeeperMode: + local_now = now.astimezone(self._timezone) + local_time = local_now.timetz().replace(tzinfo=None) + if time(9, 0) <= local_time < time(23, 0): + return HousekeeperMode.DAY_SAFE + return HousekeeperMode.NIGHT_READONLY_EXPLORE + + def _next_boundary(self, now: datetime) -> datetime: + local_now = now.astimezone(self._timezone) + day_boundary = datetime.combine(local_now.date(), time(9, 0), tzinfo=self._timezone) + night_boundary = datetime.combine(local_now.date(), time(23, 0), tzinfo=self._timezone) + if local_now < day_boundary: + return day_boundary.astimezone(now.tzinfo) + if local_now < night_boundary: + return night_boundary.astimezone(now.tzinfo) + next_day = local_now.date() + timedelta(days=1) + return datetime.combine(next_day, time(9, 0), tzinfo=self._timezone).astimezone(now.tzinfo) + + def _current_night_window(self, now: datetime) -> _NightWindow: + local_now = now.astimezone(self._timezone) + today_23 = datetime.combine(local_now.date(), time(23, 0), tzinfo=self._timezone) + today_9 = datetime.combine(local_now.date(), time(9, 0), tzinfo=self._timezone) + if local_now < today_9: + start = today_23 - timedelta(days=1) + end = today_9 + elif local_now >= today_23: + start = today_23 + end = today_9 + timedelta(days=1) + else: + start = today_23 + end = today_9 + timedelta(days=1) + return _NightWindow(start=start.astimezone(now.tzinfo), end=end.astimezone(now.tzinfo)) + + def _previous_night_window(self, now: datetime) -> _NightWindow: + local_now = now.astimezone(self._timezone) + today_9 = datetime.combine(local_now.date(), time(9, 0), tzinfo=self._timezone) + end = today_9 if local_now >= today_9 else today_9 - timedelta(days=1) + start = end - timedelta(hours=10) + return _NightWindow(start=start.astimezone(now.tzinfo), end=end.astimezone(now.tzinfo)) + + @staticmethod + def _within_window(value: datetime, window: _NightWindow) -> bool: + return window.start <= value <= window.end + + @staticmethod + def _hash_text(value: str) -> 
str: + return hashlib.sha256(value.encode("utf-8")).hexdigest() + + @staticmethod + def _normalize_now(now: datetime | None) -> datetime: + return now or utc_now() diff --git a/src/autoresearch/core/services/media_jobs.py b/src/autoresearch/core/services/media_jobs.py new file mode 100644 index 00000000..553c7446 --- /dev/null +++ b/src/autoresearch/core/services/media_jobs.py @@ -0,0 +1,243 @@ +from __future__ import annotations + +from datetime import datetime +import json +from pathlib import Path +import re +import subprocess +from typing import Callable +from urllib.parse import urlparse + +from autoresearch.shared.media_job_contract import ( + MediaJobEventRead, + MediaJobMode, + MediaJobPostprocess, + MediaJobRead, + MediaJobRequest, + MediaJobStatus, + MediaTargetBucket, +) +from autoresearch.shared.models import utc_now +from autoresearch.shared.store import Repository, create_resource_id + +_TOKEN_TO_YTDLP = { + "{title}": "%(title)s", + "{id}": "%(id)s", + "{uploader}": "%(uploader)s", + "{upload_date}": "%(upload_date)s", +} +_URL_RE = re.compile(r"^https?://\S+$", re.IGNORECASE) + + +class MediaJobService: + def __init__( + self, + *, + repository: Repository[MediaJobRead], + event_repository: Repository[MediaJobEventRead], + media_root: Path, + allowed_domains: set[str], + yt_dlp_bin: str = "yt-dlp", + ffmpeg_bin: str = "ffmpeg", + command_runner: Callable[[list[str]], subprocess.CompletedProcess[str]] | None = None, + ) -> None: + self._repository = repository + self._event_repository = event_repository + self._media_root = media_root + self._allowed_domains = {item.lower().strip() for item in allowed_domains if item.strip()} + self._yt_dlp_bin = yt_dlp_bin + self._ffmpeg_bin = ffmpeg_bin + self._command_runner = command_runner or self._run_command + + def create(self, request: MediaJobRequest) -> MediaJobRead: + now = utc_now() + job = MediaJobRead( + job_id=create_resource_id("mediajob"), + url=request.url, + mode=request.mode, + 
target_bucket=request.target_bucket, + filename_template=request.filename_template, + postprocess=request.postprocess, + status=MediaJobStatus.QUEUED, + created_at=now, + updated_at=now, + metadata=request.metadata, + ) + self._ensure_directories() + self._record_event(job_id=job.job_id, stage="created", status=job.status.value, detail=request.url) + return self._repository.save(job.job_id, job) + + def get(self, job_id: str) -> MediaJobRead | None: + return self._repository.get(job_id) + + def list(self) -> list[MediaJobRead]: + return self._repository.list() + + def execute(self, job_id: str) -> MediaJobRead: + job = self._require_job(job_id) + running = job.model_copy(update={"status": MediaJobStatus.RUNNING, "updated_at": utc_now(), "error": None}) + self._repository.save(running.job_id, running) + self._record_event(job_id=job.job_id, stage="running", status="running", detail=job.mode.value) + + try: + metadata = self._probe_metadata(running.url) + output_files = self._execute_job(running=running, metadata=metadata) + completed = running.model_copy( + update={ + "status": MediaJobStatus.COMPLETED, + "updated_at": utc_now(), + "output_files": output_files, + "title": metadata.get("title"), + "duration_seconds": self._coerce_int(metadata.get("duration")), + "uploader": metadata.get("uploader"), + "subtitle_path": self._find_suffix(output_files, {".srt", ".vtt"}), + "metadata_path": self._metadata_path_for_job(running).as_posix(), + "error": None, + } + ) + self._write_metadata_file(completed=completed, metadata=metadata) + self._record_event(job_id=job.job_id, stage="completed", status="completed", detail="ok") + return self._repository.save(completed.job_id, completed) + except Exception as exc: + failed = running.model_copy( + update={"status": MediaJobStatus.FAILED, "updated_at": utc_now(), "error": str(exc)} + ) + self._record_event(job_id=job.job_id, stage="failed", status="failed", detail=str(exc)) + return self._repository.save(failed.job_id, failed) + + 
def parse_telegram_task(self, text: str) -> MediaJobRequest | None: + normalized = text.strip() + if not normalized: + return None + parts = normalized.split(maxsplit=1) + explicit_mode = None + url = normalized + if len(parts) == 2 and parts[0].lower() in {"video", "audio", "subtitle", "metadata"}: + explicit_mode = MediaJobMode(parts[0].lower()) + url = parts[1].strip() + if not _URL_RE.match(url): + return None + if not self.is_supported_url(url): + return None + mode = explicit_mode or MediaJobMode.VIDEO + bucket = { + MediaJobMode.AUDIO: MediaTargetBucket.AUDIO, + MediaJobMode.VIDEO: MediaTargetBucket.VIDEO, + MediaJobMode.SUBTITLE: MediaTargetBucket.SUBTITLES, + MediaJobMode.METADATA: MediaTargetBucket.META, + }[mode] + postprocess = { + MediaJobMode.AUDIO: MediaJobPostprocess.MP3, + MediaJobMode.VIDEO: MediaJobPostprocess.MP4, + MediaJobMode.SUBTITLE: MediaJobPostprocess.NONE, + MediaJobMode.METADATA: MediaJobPostprocess.NONE, + }[mode] + return MediaJobRequest( + url=url, + mode=mode, + target_bucket=bucket, + filename_template="{title}-{id}", + postprocess=postprocess, + ) + + def is_supported_url(self, url: str) -> bool: + host = (urlparse(url).hostname or "").lower() + return any(host == domain or host.endswith(f".{domain}") for domain in self._allowed_domains) + + def _execute_job(self, *, running: MediaJobRead, metadata: dict[str, object]) -> list[str]: + output_dir = self._job_bucket_dir(running) + output_template = output_dir / f"{self._translate_template(running.filename_template)}.%(ext)s" + commands = self._build_commands(running=running, output_template=output_template) + for command in commands: + result = self._command_runner(command) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or result.stdout.strip() or "media command failed") + + output_files = sorted(path.as_posix() for path in output_dir.rglob("*") if path.is_file()) + if running.mode is MediaJobMode.METADATA: + output_files = [] + return output_files + + def 
_build_commands(self, *, running: MediaJobRead, output_template: Path) -> list[list[str]]: + url = running.url + template = output_template.as_posix() + if running.mode is MediaJobMode.AUDIO: + return [[self._yt_dlp_bin, "-x", "--audio-format", "mp3", "-o", template, url]] + if running.mode is MediaJobMode.VIDEO: + return [[self._yt_dlp_bin, "-f", "mp4/best", "-o", template, url]] + if running.mode is MediaJobMode.SUBTITLE: + return [[self._yt_dlp_bin, "--skip-download", "--write-auto-sub", "--write-sub", "--sub-langs", "all", "-o", template, url]] + if running.mode is MediaJobMode.METADATA: + return [] + raise ValueError(f"unsupported media mode: {running.mode}") + + def _probe_metadata(self, url: str) -> dict[str, object]: + result = self._command_runner([self._yt_dlp_bin, "--dump-single-json", "--skip-download", url]) + if result.returncode != 0: + raise RuntimeError(result.stderr.strip() or "failed to probe media metadata") + try: + payload = json.loads(result.stdout or "{}") + except json.JSONDecodeError as exc: + raise RuntimeError("invalid media metadata payload") from exc + if not isinstance(payload, dict): + raise RuntimeError("invalid media metadata payload") + return payload + + def _write_metadata_file(self, *, completed: MediaJobRead, metadata: dict[str, object]) -> None: + metadata_path = self._metadata_path_for_job(completed) + metadata_path.parent.mkdir(parents=True, exist_ok=True) + metadata_path.write_text(json.dumps(metadata, ensure_ascii=False, indent=2), encoding="utf-8") + + def _metadata_path_for_job(self, job: MediaJobRead) -> Path: + return self._bucket_dir(MediaTargetBucket.META) / f"{job.job_id}.json" + + def _job_bucket_dir(self, job: MediaJobRead) -> Path: + return self._bucket_dir(job.target_bucket) / job.job_id + + def _record_event(self, *, job_id: str, stage: str, status: str, detail: str) -> None: + event = MediaJobEventRead( + event_id=create_resource_id("mediaevt"), + job_id=job_id, + stage=stage, + status=status, + 
detail=detail, + created_at=utc_now(), + ) + self._event_repository.save(event.event_id, event) + + def _bucket_dir(self, bucket: MediaTargetBucket) -> Path: + return self._media_root / bucket.value + + def _ensure_directories(self) -> None: + for bucket in MediaTargetBucket: + (self._media_root / bucket.value).mkdir(parents=True, exist_ok=True) + (self._media_root / "jobs").mkdir(parents=True, exist_ok=True) + + def _translate_template(self, template: str) -> str: + return "-".join(_TOKEN_TO_YTDLP[token] for token in template.split("-")) + + @staticmethod + def _find_suffix(paths: list[str], suffixes: set[str]) -> str | None: + for path in paths: + if Path(path).suffix.lower() in suffixes: + return path + return None + + @staticmethod + def _coerce_int(value: object) -> int | None: + try: + if value is None: + return None + return int(value) + except (TypeError, ValueError): + return None + + def _require_job(self, job_id: str) -> MediaJobRead: + job = self.get(job_id) + if job is None: + raise KeyError(f"media job not found: {job_id}") + return job + + @staticmethod + def _run_command(command: list[str]) -> subprocess.CompletedProcess[str]: + return subprocess.run(command, text=True, capture_output=True, check=False) diff --git a/src/autoresearch/core/services/openhands_controlled_backend.py b/src/autoresearch/core/services/openhands_controlled_backend.py index dd23f7f5..16e3c845 100644 --- a/src/autoresearch/core/services/openhands_controlled_backend.py +++ b/src/autoresearch/core/services/openhands_controlled_backend.py @@ -1,12 +1,15 @@ from __future__ import annotations +import ast from dataclasses import dataclass import fnmatch import json import os +import re import shlex import shutil import subprocess +import sys from pathlib import Path from autoresearch.core.services.git_promotion_gate import GitPromotionGateService @@ -71,6 +74,12 @@ class OpenHandsControlledBackendService: "SSH_ASKPASS", } _GIT_ENV_PREFIXES = ("GITHUB_", "GH_", "GIT_AUTHOR_", 
"GIT_COMMITTER_", "SSH_") + _FAST_FAIL_PATTERNS = ( + re.compile(r"\bSyntaxError\b"), + re.compile(r"\bModuleNotFoundError\b"), + re.compile(r"\bImportError\b"), + re.compile(r"\bPermission denied\b", re.IGNORECASE), + ) def __init__( self, @@ -95,6 +104,7 @@ def run(self, request: ControlledExecutionRequest) -> ControlledExecutionRead: log_file = artifacts_dir / "execution.log" patch_file = artifacts_dir / "promotion.patch" summary_file = artifacts_dir / "summary.json" + overlay = run_dir / "overlay" artifacts_dir.mkdir(parents=True, exist_ok=True) created_at = utc_now() @@ -154,6 +164,11 @@ def run(self, request: ControlledExecutionRequest) -> ControlledExecutionRead: current_backend_attempts += 1 total_attempts += 1 self._sync_directory(source=baseline, target=workspace, apply_excludes=False) + self._prepare_strict_workspace( + workspace=workspace, + overlay_root=overlay, + allowed_paths=request.allowed_paths, + ) self._append_log( log_file, f"\n=== attempt {total_attempts} backend={backend.value} iteration={current_backend_attempts}/{backend_limit} ===\n", @@ -208,6 +223,20 @@ def run(self, request: ControlledExecutionRequest) -> ControlledExecutionRead: else f"test_command failed with status={validation_status.value}" ) + fail_fast_reason = self._detect_fail_fast_reason( + execution_outcome=execution_outcome, + changed_files=changed_files, + workspace=workspace, + log_file=log_file, + ) + if fail_fast_reason is not None: + status = ControlledRunStatus.FAILED + error = fail_fast_reason + validation_status = ValidationStatus.FAILED + if validation_exit_code is None: + validation_exit_code = 2 + break + if exit_code == 0 and validation_status is ValidationStatus.PASSED and changed_files: status = ControlledRunStatus.READY_FOR_PROMOTION error = None @@ -399,6 +428,7 @@ def _finalize_promotion( def _sync_directory(self, *, source: Path, target: Path, apply_excludes: bool) -> None: if target.exists(): + self._make_tree_writable(target) shutil.rmtree(target) 
target.mkdir(parents=True, exist_ok=True) @@ -429,21 +459,30 @@ def _execute_backend( log_file: Path, allowed_paths: list[str], ) -> _BackendExecutionOutcome: - if backend is ControlledBackend.MOCK: - return self._run_mock_backend( + try: + if backend is ControlledBackend.MOCK: + return self._run_mock_backend( + prompt=prompt, + workspace=workspace, + log_file=log_file, + allowed_paths=allowed_paths, + ) + + return self._run_openhands_cli( prompt=prompt, workspace=workspace, + artifacts_dir=artifacts_dir, log_file=log_file, allowed_paths=allowed_paths, ) - - return self._run_openhands_cli( - prompt=prompt, - workspace=workspace, - artifacts_dir=artifacts_dir, - log_file=log_file, - allowed_paths=allowed_paths, - ) + except Exception as exc: + message = str(exc).strip() or exc.__class__.__name__ + self._append_log(log_file, f"[backend-exception] {exc.__class__.__name__}: {message}\n") + return _BackendExecutionOutcome( + exit_code=1, + error=message, + stderr=f"{exc.__class__.__name__}: {message}\n", + ) def _run_mock_backend( self, @@ -484,7 +523,7 @@ def _run_openhands_cli( "Execution contract:\n" "- Single task execution only. 
Do not start autonomous loops.\n" "- Do not commit, push, or edit git config.\n" - "- Modify files only under /opt/workspace.\n" + "- Modify files only inside the provided workspace root.\n" "- Return changed files and executed commands in final summary.\n" ) @@ -537,6 +576,8 @@ def _build_openhands_env(self, *, workspace: Path, artifacts_dir: Path) -> dict[ env["OPENHANDS_WORKSPACE"] = str(workspace) env["OPENHANDS_AUDIT_DIR"] = str(artifacts_dir) env["OPENHANDS_AUDIT_FILE"] = str(artifacts_dir / "openhands_compliance.json") + env["PYTHONDONTWRITEBYTECODE"] = "1" + env["PYTHONPYCACHEPREFIX"] = str(artifacts_dir / "pycache") if env.get("OPENHANDS_DRY_RUN") == "1" and "OPENHANDS_RUNTIME" not in env: env["OPENHANDS_RUNTIME"] = "host" return env @@ -588,6 +629,11 @@ def _run_validation( completed = subprocess.run( command, cwd=workspace, + env={ + **os.environ, + "PYTHONDONTWRITEBYTECODE": "1", + "PYTHONPYCACHEPREFIX": str(validation_dir / "pycache"), + }, capture_output=True, text=True, check=False, @@ -611,6 +657,171 @@ def _run_validation( return 0, ValidationStatus.PASSED return completed.returncode, ValidationStatus.FAILED + def _prepare_strict_workspace( + self, + *, + workspace: Path, + overlay_root: Path, + allowed_paths: list[str], + ) -> None: + if overlay_root.exists(): + shutil.rmtree(overlay_root) + overlay_root.mkdir(parents=True, exist_ok=True) + + file_paths: list[str] = [] + directory_paths: list[str] = [] + wildcard_paths: list[str] = [] + + for pattern in allowed_paths: + normalized = pattern.strip().replace("\\", "/").rstrip("/") + if not normalized: + continue + if any(char in normalized for char in "*?["): + wildcard_paths.append(normalized) + continue + target = workspace / normalized + if target.exists() and target.is_dir(): + directory_paths.append(normalized) + continue + if "." 
in Path(normalized).name: + file_paths.append(normalized) + else: + directory_paths.append(normalized) + + for rel_path in file_paths: + self._materialize_overlay_file( + workspace=workspace, + overlay_root=overlay_root, + relative_path=rel_path, + ) + + self._apply_readonly_tree(workspace=workspace) + + for rel_path in directory_paths: + self._make_path_tree_writable(workspace / rel_path) + + for pattern in wildcard_paths: + self._make_matching_paths_writable(workspace=workspace, pattern=pattern) + + for rel_path in file_paths: + self._make_overlay_target_writable(overlay_root=overlay_root, relative_path=rel_path) + + def _materialize_overlay_file( + self, + *, + workspace: Path, + overlay_root: Path, + relative_path: str, + ) -> None: + source = workspace / relative_path + overlay_target = overlay_root / relative_path + overlay_target.parent.mkdir(parents=True, exist_ok=True) + if source.exists(): + if source.is_dir(): + return + shutil.copy2(source, overlay_target) + source.unlink() + else: + overlay_target.touch() + source.parent.mkdir(parents=True, exist_ok=True) + source.symlink_to(overlay_target) + + def _apply_readonly_tree(self, *, workspace: Path) -> None: + for root, dirnames, filenames in os.walk(workspace): + root_path = Path(root) + os.chmod(root_path, 0o555) + for dirname in dirnames: + os.chmod(root_path / dirname, 0o555) + for filename in filenames: + path = root_path / filename + if path.is_symlink(): + continue + os.chmod(path, 0o444) + + def _make_path_tree_writable(self, path: Path) -> None: + if not path.exists(): + return + if path.is_file(): + os.chmod(path, 0o644) + return + for root, dirnames, filenames in os.walk(path): + root_path = Path(root) + os.chmod(root_path, 0o755) + for dirname in dirnames: + os.chmod(root_path / dirname, 0o755) + for filename in filenames: + candidate = root_path / filename + if candidate.is_symlink(): + continue + os.chmod(candidate, 0o644) + + def _make_matching_paths_writable(self, *, workspace: Path, 
pattern: str) -> None: + for candidate in workspace.rglob("*"): + relative = candidate.relative_to(workspace).as_posix() + if fnmatch.fnmatch(relative, pattern): + self._make_path_tree_writable(candidate) + + def _make_tree_writable(self, path: Path) -> None: + if not path.exists(): + return + for root, dirnames, filenames in os.walk(path): + root_path = Path(root) + os.chmod(root_path, 0o755) + for dirname in dirnames: + os.chmod(root_path / dirname, 0o755) + for filename in filenames: + candidate = root_path / filename + if candidate.is_symlink(): + continue + os.chmod(candidate, 0o644) + + def _make_overlay_target_writable(self, *, overlay_root: Path, relative_path: str) -> None: + target = overlay_root / relative_path + parent = target.parent + while True: + os.chmod(parent, 0o755) + if parent == overlay_root: + break + parent = parent.parent + if target.exists(): + os.chmod(target, 0o644) + + def _detect_fail_fast_reason( + self, + *, + execution_outcome: _BackendExecutionOutcome, + changed_files: list[str], + workspace: Path, + log_file: Path, + ) -> str | None: + combined_output = "\n".join( + part for part in (execution_outcome.error, execution_outcome.stdout, execution_outcome.stderr) if part + ) + for pattern in self._FAST_FAIL_PATTERNS: + if pattern.search(combined_output): + reason = f"fail-fast probe tripped on backend output: {pattern.pattern}" + self._append_log(log_file, f"[fail-fast] {reason}\n") + return reason + + python_changes = [path for path in changed_files if path.endswith(".py")] + if not python_changes: + return None + + for rel_path in python_changes: + target = workspace / rel_path + try: + source = target.read_text(encoding="utf-8") + ast.parse(source, filename=rel_path) + except SyntaxError as exc: + reason = f"fail-fast probe detected broken python artifacts: SyntaxError in {rel_path}:{exc.lineno}" + self._append_log(log_file, f"[fail-fast] {reason}\n") + return reason + except OSError as exc: + reason = f"fail-fast probe could not 
read changed file {rel_path}: {exc}" + self._append_log(log_file, f"[fail-fast] {reason}\n") + return reason + return None + def _detect_scope_violation( self, *, diff --git a/src/autoresearch/core/services/openhands_worker.py b/src/autoresearch/core/services/openhands_worker.py index ca21a9c5..005f17a0 100644 --- a/src/autoresearch/core/services/openhands_worker.py +++ b/src/autoresearch/core/services/openhands_worker.py @@ -1,6 +1,7 @@ from __future__ import annotations import shlex +import sys from autoresearch.agent_protocol.models import ExecutionPolicy, FallbackStep, JobSpec, ValidatorSpec from autoresearch.shared.models import GitPromotionMode @@ -15,9 +16,25 @@ class OpenHandsWorkerService: """Translate a patch-only OpenHands worker contract into existing AEP/backends.""" + DEFAULT_TIMEOUT_SEC = 420 + + def _execution_test_command_parts(self, raw_command: str) -> list[str]: + parts = shlex.split(raw_command) + if not parts: + return [] + if parts[0] == "pytest": + return [sys.executable, "-m", "pytest", *parts[1:]] + if parts[0] in {"python", "python3"}: + return [sys.executable, *parts[1:]] + return parts + + def _execution_test_command(self, raw_command: str) -> str: + return shlex.join(self._execution_test_command_parts(raw_command)) + def build_prompt(self, spec: OpenHandsWorkerJobSpec) -> str: allowed_paths = "\n".join(f"- {item}" for item in spec.allowed_paths) forbidden_paths = "\n".join(f"- {item}" for item in spec.forbidden_paths) + test_command = self._execution_test_command(spec.test_command) return ( "You are OpenHands operating as a constrained patch-only worker.\n\n" "Problem statement:\n" @@ -25,6 +42,8 @@ def build_prompt(self, spec: OpenHandsWorkerJobSpec) -> str: "Hard rules:\n" "- Only modify files that match allowed_paths.\n" "- Never modify forbidden_paths.\n" + "- The workspace is physically permission-scoped; out-of-scope writes will fail at the filesystem layer.\n" + "- If an allowed business surface directory does not exist yet, you 
may create it inside allowed_paths.\n" "- Do not run git add, git commit, git push, git merge, git rebase, git reset, or git checkout.\n" "- Do not create product-facing entrypoints or change approval/promotion policy.\n" "- Produce the smallest patch that can satisfy the validation command.\n" @@ -34,10 +53,11 @@ def build_prompt(self, spec: OpenHandsWorkerJobSpec) -> str: "forbidden_paths:\n" f"{forbidden_paths}\n\n" "test_command:\n" - f"- {spec.test_command}\n" + f"- {test_command}\n" ) def build_agent_job_spec(self, spec: OpenHandsWorkerJobSpec) -> JobSpec: + test_command = self._execution_test_command(spec.test_command) fallback: list[FallbackStep] = [] retry_attempts = max(spec.max_iterations - 1, 0) if retry_attempts > 0: @@ -53,6 +73,7 @@ def build_agent_job_spec(self, spec: OpenHandsWorkerJobSpec) -> JobSpec: mode="patch_only", task=self.build_prompt(spec), policy=ExecutionPolicy( + timeout_sec=self.DEFAULT_TIMEOUT_SEC, allowed_paths=list(spec.allowed_paths), forbidden_paths=list(spec.forbidden_paths), cleanup_on_success=True, @@ -62,7 +83,7 @@ def build_agent_job_spec(self, spec: OpenHandsWorkerJobSpec) -> JobSpec: ValidatorSpec( id="worker.test_command", kind="command", - command=spec.test_command, + command=test_command, ) ], fallback=fallback, @@ -77,6 +98,7 @@ def build_agent_job_spec(self, spec: OpenHandsWorkerJobSpec) -> JobSpec: ) def build_controlled_request(self, spec: OpenHandsWorkerJobSpec) -> ControlledExecutionRequest: + test_command = self._execution_test_command_parts(spec.test_command) fallback_backend = ControlledBackend.MOCK if spec.use_mock_fallback else None failure_strategy = FailureStrategy.FALLBACK if fallback_backend is not None else FailureStrategy.HUMAN_IN_LOOP return ControlledExecutionRequest( @@ -84,7 +106,7 @@ def build_controlled_request(self, spec: OpenHandsWorkerJobSpec) -> ControlledEx prompt=self.build_prompt(spec), allowed_paths=list(spec.allowed_paths), forbidden_paths=list(spec.forbidden_paths), - 
test_command=shlex.split(spec.test_command), + test_command=test_command, backend=ControlledBackend.OPENHANDS_CLI, fallback_backend=fallback_backend, worker_output_mode=spec.worker_output_mode, diff --git a/src/autoresearch/core/services/panel_access.py b/src/autoresearch/core/services/panel_access.py index 5283036a..53868e49 100644 --- a/src/autoresearch/core/services/panel_access.py +++ b/src/autoresearch/core/services/panel_access.py @@ -74,6 +74,7 @@ def __init__( *, secret: str | None, base_url: str = "http://127.0.0.1:8000/api/v1/panel/view", + mini_app_url: str | None = None, issuer: str = "autoresearch.telegram", audience: str = "autoresearch.panel", default_ttl_seconds: int = 300, @@ -84,6 +85,7 @@ def __init__( ) -> None: self._secret = (secret or "").strip() self._base_url = base_url.strip() or "http://127.0.0.1:8000/api/v1/panel/view" + self._mini_app_url = (mini_app_url or "").strip() or None self._issuer = issuer self._audience = audience self._default_ttl_seconds = max(30, default_ttl_seconds) @@ -100,10 +102,43 @@ def enabled(self) -> bool: def base_url(self) -> str: return self._base_url + @property + def mini_app_url(self) -> str | None: + return self._mini_app_url + @property def allowed_uids(self) -> tuple[str, ...]: return tuple(sorted(self._allowed_uids)) + def build_action_url( + self, + *, + query_params: dict[str, str], + telegram_uid: str | None = None, + prefer_mini_app: bool = True, + ) -> str: + base_candidate = self._base_url + if prefer_mini_app and self._mini_app_url: + base_candidate = self._mini_app_url + + parsed = urlparse(base_candidate) + query_items = dict(parse_qsl(parsed.query, keep_blank_values=True)) + for key, value in query_params.items(): + if value is None: + continue + query_items[key] = value + + normalized_uid = (telegram_uid or "").strip() + if normalized_uid and self.enabled: + magic_link = self.create_magic_link(normalized_uid) + magic_parsed = urlparse(magic_link.url) + magic_query = 
dict(parse_qsl(magic_parsed.query, keep_blank_values=True)) + token = (magic_query.get("token") or "").strip() + if token: + query_items["token"] = token + + return urlunparse(parsed._replace(query=urlencode(query_items))) + def create_magic_link(self, telegram_uid: str, ttl_seconds: int | None = None) -> PanelMagicLinkRead: if not self.enabled: raise RuntimeError("panel magic-link signing secret is not configured") diff --git a/src/autoresearch/core/services/upstream_watcher.py b/src/autoresearch/core/services/upstream_watcher.py new file mode 100644 index 00000000..71d56dd1 --- /dev/null +++ b/src/autoresearch/core/services/upstream_watcher.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +from datetime import datetime +from pathlib import Path +import shutil +import subprocess +import tempfile + +from autoresearch.shared.autoresearch_planner_contract import ( + UpstreamWatchCommitRead, + UpstreamWatchDecision, + UpstreamWatchRead, +) + + +_DEFAULT_UPSTREAM_URL = "https://github.com/openclaw/openclaw.git" +_DEFAULT_WORKSPACE_ROOT = Path("/Volumes/AI_LAB/ai_lab/workspace") +_NON_CORE_PATH_PREFIXES = ( + "extensions/", + "test/helpers/extensions/", + "docs/", +) +_NON_CORE_PATHS = { + "CHANGELOG.md", + ".gitignore", + "README.md", + "CONTRIBUTING.md", + "SECURITY.md", +} + + +class UpstreamWatcherService: + """Inspect upstream changes in an isolated temp clone and decide whether to skip them.""" + + def __init__( + self, + *, + upstream_url: str = _DEFAULT_UPSTREAM_URL, + workspace_root: Path | None = None, + max_commits: int = 5, + ) -> None: + self._upstream_url = upstream_url.strip() or _DEFAULT_UPSTREAM_URL + self._workspace_root = (workspace_root or _DEFAULT_WORKSPACE_ROOT).expanduser().resolve() + self._max_commits = max(1, min(max_commits, 20)) + + def inspect(self) -> UpstreamWatchRead: + self._workspace_root.mkdir(parents=True, exist_ok=True) + sync_dir = Path(tempfile.mkdtemp(prefix="openclaw-upstream.", dir=str(self._workspace_root))) + 
default_branch = "main" + try: + default_branch = self._detect_default_branch() + self._git( + "clone", + "--depth=1", + "--single-branch", + "--no-tags", + "--branch", + default_branch, + self._upstream_url, + str(sync_dir), + ) + self._git( + "fetch", + f"--depth={self._max_commits + 1}", + "origin", + default_branch, + cwd=sync_dir, + ) + recent_commits = self._load_recent_commits(sync_dir=sync_dir, default_branch=default_branch) + changed_paths = self._collect_changed_paths(recent_commits) + focus_areas = self._derive_focus_areas(changed_paths) + relevant_paths = [path for path in changed_paths if not self._is_non_core_path(path)] + latest_commit = recent_commits[0] if recent_commits else None + decision = UpstreamWatchDecision.SKIP if not relevant_paths else UpstreamWatchDecision.REVIEW + result = UpstreamWatchRead( + upstream_url=self._upstream_url, + default_branch=default_branch, + latest_commit_sha=latest_commit.sha if latest_commit else None, + latest_commit_title=latest_commit.title if latest_commit else None, + latest_commit_at=latest_commit.committed_at if latest_commit else None, + recent_commits=recent_commits, + changed_paths=changed_paths, + relevant_paths=relevant_paths, + focus_areas=focus_areas, + decision=decision, + summary=self._build_summary(decision=decision, focus_areas=focus_areas, relevant_paths=relevant_paths), + ) + except Exception as exc: + result = UpstreamWatchRead( + upstream_url=self._upstream_url, + default_branch=default_branch, + decision=UpstreamWatchDecision.FAILED, + summary="Upstream watch failed.", + error=str(exc), + ) + + cleanup_paths = self._cleanup_temp_dirs() + return result.model_copy(update={"cleaned_up": True, "cleanup_paths": cleanup_paths}) + + def _detect_default_branch(self) -> str: + output = self._git("ls-remote", "--symref", self._upstream_url, "HEAD") + for line in output.splitlines(): + if not line.startswith("ref: "): + continue + parts = line.split() + if len(parts) < 2 or not 
parts[1].startswith("refs/heads/"): + continue + return parts[1].removeprefix("refs/heads/") + return "main" + + def _load_recent_commits(self, *, sync_dir: Path, default_branch: str) -> list[UpstreamWatchCommitRead]: + raw = self._git( + "log", + f"origin/{default_branch}", + f"--max-count={self._max_commits}", + "--date=iso-strict", + "--pretty=format:%H%x1f%cI%x1f%s", + cwd=sync_dir, + ) + commits: list[UpstreamWatchCommitRead] = [] + for line in raw.splitlines(): + parts = line.split("\x1f", 2) + sha = parts[0] if len(parts) > 0 else "" + committed_at_raw = parts[1] if len(parts) > 1 else "" + title = parts[2] if len(parts) > 2 else "" + committed_at = None + if committed_at_raw: + committed_at = datetime.fromisoformat(committed_at_raw.replace("Z", "+00:00")) + commits.append( + UpstreamWatchCommitRead( + sha=sha, + title=title, + committed_at=committed_at, + touched_paths=self._load_touched_paths(sync_dir=sync_dir, sha=sha), + ) + ) + return commits + + def _load_touched_paths(self, *, sync_dir: Path, sha: str) -> list[str]: + raw = self._git( + "diff-tree", + "--no-commit-id", + "--name-only", + "-r", + "-m", + sha, + cwd=sync_dir, + ) + return self._dedupe(line.strip() for line in raw.splitlines() if line.strip()) + + def _collect_changed_paths(self, commits: list[UpstreamWatchCommitRead]) -> list[str]: + return self._dedupe(path for commit in commits for path in commit.touched_paths) + + def _derive_focus_areas(self, changed_paths: list[str]) -> list[str]: + return self._dedupe(self._classify_focus_area(path) for path in changed_paths if path) + + def _classify_focus_area(self, path: str) -> str: + normalized = path.strip("/") + parts = normalized.split("/") + if len(parts) >= 2 and parts[0] == "extensions": + return f"extension:{parts[1]}" + if len(parts) >= 4 and parts[:3] == ["test", "helpers", "extensions"]: + helper_name = Path(parts[3]).stem.split("-", 1)[0] + return f"extension:{helper_name}" + if normalized in _NON_CORE_PATHS: + return "repo-meta" + 
return parts[0] if parts else normalized + + def _build_summary( + self, + *, + decision: UpstreamWatchDecision, + focus_areas: list[str], + relevant_paths: list[str], + ) -> str: + if decision is UpstreamWatchDecision.SKIP: + focus_text = ", ".join(self._format_focus_area(item) for item in focus_areas[:3]) or "non-core extensions" + return f"Recent upstream changes remain in non-core areas ({focus_text}); auto-skipped." + if decision is UpstreamWatchDecision.REVIEW: + return f"Recent upstream changes touched review-required paths: {', '.join(relevant_paths[:5])}." + return "Upstream watch failed." + + def _format_focus_area(self, focus_area: str) -> str: + if focus_area.startswith("extension:"): + name = focus_area.split(":", 1)[1] + if name.lower() == "line": + return "LINE" + return name.replace("-", " ").title() + if focus_area == "repo-meta": + return "repo meta" + return focus_area.replace("-", " ") + + def _is_non_core_path(self, path: str) -> bool: + normalized = path.strip() + if normalized in _NON_CORE_PATHS: + return True + return normalized.startswith(_NON_CORE_PATH_PREFIXES) + + def _cleanup_temp_dirs(self) -> list[str]: + cleanup_paths: list[str] = [] + for path in sorted(self._workspace_root.glob("openclaw-upstream.*")): + if not path.is_dir() or path.parent != self._workspace_root: + continue + cleanup_paths.append(str(path)) + shutil.rmtree(path, ignore_errors=True) + return cleanup_paths + + def _git(self, *args: str, cwd: Path | None = None) -> str: + completed = subprocess.run( + ["git", *args], + cwd=str(cwd) if cwd is not None else None, + capture_output=True, + text=True, + check=False, + ) + if completed.returncode == 0: + return completed.stdout.strip() + detail = (completed.stderr or completed.stdout).strip() + raise RuntimeError(f"git {' '.join(args)} failed: {detail}") + + def _dedupe(self, values: object) -> list[str]: + seen: set[str] = set() + ordered: list[str] = [] + for item in values: + normalized = str(item).strip() + if not 
normalized or normalized in seen: + continue + seen.add(normalized) + ordered.append(normalized) + return ordered diff --git a/src/autoresearch/executions/runner.py b/src/autoresearch/executions/runner.py index 80b64ee5..5835d1ca 100644 --- a/src/autoresearch/executions/runner.py +++ b/src/autoresearch/executions/runner.py @@ -6,12 +6,15 @@ import os import re import shutil +import signal import subprocess +import sys import time from pathlib import Path, PurePosixPath from typing import Any from autoresearch.agent_protocol.models import ( + DriverMetrics, DriverResult, JobSpec, RunSummary, @@ -32,6 +35,40 @@ ".git/", ) +_AI_LAB_ENV_OVERRIDE_KEYS = ( + "ENV_FILE", + "OPENHANDS_ENV_FILE", + "COMPOSE_DIR", + "COMPOSE_FILE", + "WORKSPACE_DIR", + "LOG_DIR", + "CACHE_DIR", + "LAB_USER", + "AUTO_OPEN_DOCKER", + "AUTO_START_COLIMA", + "AI_LAB_IMAGE_TAG", + "AI_LAB_FORCE_DOCKER_RUN", + "AI_LAB_HOST_MOUNT_ROOT", + "OPENHANDS_HOME_DIR", + "DOCKER_HOST_SOCKET_PATH", + "DOCKER_HOST_IN_CONTAINER", + "DOCKER_HOST_MOUNT_DIR", + "AI_LAB_COLIMA_HELPER", +) + + +def _is_benign_runtime_artifact(path: str) -> bool: + normalized = path.replace("\\", "/").strip("/") + if not normalized: + return False + if normalized.startswith(".pytest_cache/") or "/.pytest_cache/" in f"/{normalized}": + return True + if "/__pycache__/" in f"/{normalized}": + return True + if normalized.startswith("apps/") and normalized.endswith("/README.md"): + return True + return False + class AgentExecutionRunner: def __init__( @@ -52,6 +89,21 @@ def __init__( writer_lease=WriterLeaseService(), ) + def _uses_openhands_ai_lab_runtime(self, manifest_entrypoint: str) -> bool: + if Path(manifest_entrypoint).name != "openhands_adapter.sh": + return False + runtime = str(os.environ.get("OPENHANDS_RUNTIME") or "ai-lab").strip().lower() + return runtime == "ai-lab" + + def _build_openhands_ai_lab_env(self) -> dict[str, str]: + env = dict(os.environ) + for key in _AI_LAB_ENV_OVERRIDE_KEYS: + env.pop(key, None) + env_file = 
str(self._repo_root / "ai_lab.env") + env["ENV_FILE"] = env_file + env["OPENHANDS_ENV_FILE"] = env_file + return env + def run_job(self, job: JobSpec) -> RunSummary: manifest = self._registry.load(job.agent_id) effective_policy = build_effective_policy(manifest.policy_defaults, job.policy) @@ -68,7 +120,7 @@ def run_job(self, job: JobSpec) -> RunSummary: patch_path = artifacts_dir / "promotion.patch" if run_dir.exists(): - shutil.rmtree(run_dir) + self._rmtree_force(run_dir) artifacts_dir.mkdir(parents=True, exist_ok=True) job_path.write_text( @@ -91,6 +143,8 @@ def run_job(self, job: JobSpec) -> RunSummary: pending_attempts = 1 attempt = 0 forced_final_status: str | None = None + cleanup_success = False + final_summary: RunSummary | None = None last_result = self._contract_error_result( run_id=job.run_id, @@ -101,165 +155,269 @@ def run_job(self, job: JobSpec) -> RunSummary: last_validation = ValidationReport(run_id=job.run_id, passed=False, checks=[]) last_patch_filtered_paths: list[str] = [] - while True: - if pending_attempts <= 0: - if fallback_index >= len(job.fallback): - break - step = job.fallback[fallback_index] - fallback_index += 1 - - if step.action == "retry": - pending_attempts = step.max_attempts - continue - if step.action == "fallback_agent": - if step.agent_id: - current_agent = step.agent_id - pending_attempts = step.max_attempts + try: + while True: + if pending_attempts <= 0: + if fallback_index >= len(job.fallback): + break + step = job.fallback[fallback_index] + fallback_index += 1 + + if step.action == "retry": + skip_retry_reason = self._retry_skip_reason(last_result) + if skip_retry_reason is not None: + self._append_event( + events_path, + { + "type": "fallback_skipped", + "attempt": attempt, + "agent_id": current_agent, + "action": "retry", + "reason": skip_retry_reason, + }, + ) + continue + pending_attempts = step.max_attempts + continue + if step.action == "fallback_agent": + if step.agent_id: + current_agent = step.agent_id + 
pending_attempts = step.max_attempts + continue + if step.action == "human_review": + forced_final_status = "human_review" + break + if step.action == "reject": + forced_final_status = "blocked" + break + + attempt += 1 + pending_attempts -= 1 + + active_manifest = self._registry.load(current_agent) + preflight_error = self._preflight_agent_environment( + agent_id=current_agent, + manifest_entrypoint=active_manifest.entrypoint, + ) + if preflight_error is not None: + driver_result = self._contract_error_result( + run_id=job.run_id, + agent_id=current_agent, + attempt=attempt, + message=preflight_error, + recommended_action="fallback", + ) + last_result = driver_result + last_validation = ValidationReport(run_id=job.run_id, passed=False, checks=[]) + self._append_event( + events_path, + { + "type": "attempt_blocked", + "attempt": attempt, + "agent_id": current_agent, + "reason": "environment_preflight_failed", + "detail": preflight_error, + }, + ) + self._append_event( + events_path, + { + "type": "attempt_completed", + "attempt": attempt, + "agent_id": current_agent, + "driver_status": driver_result.status, + "validation_passed": False, + }, + ) continue - if step.action == "human_review": - forced_final_status = "human_review" - break - if step.action == "reject": - forced_final_status = "blocked" - break - - attempt += 1 - pending_attempts -= 1 - self._snapshot_baseline_to_workspace(baseline_dir, workspace_dir) - self._append_event( - events_path, - { - "type": "attempt_started", - "attempt": attempt, - "agent_id": current_agent, - }, - ) + attempt_job = self._job_for_attempt( + job=job, + agent_id=current_agent, + attempt=attempt, + last_result=last_result, + last_validation=last_validation, + ) + job_path.write_text( + json.dumps(attempt_job.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) - active_manifest = self._registry.load(current_agent) - driver_result = self._invoke_adapter( - 
manifest_entrypoint=active_manifest.entrypoint, - run_dir=run_dir, - workspace_dir=workspace_dir, - artifacts_dir=artifacts_dir, - job_path=job_path, - result_path=result_path, - events_path=events_path, - baseline_dir=baseline_dir, - run_id=job.run_id, - agent_id=current_agent, - attempt=attempt, - timeout_sec=effective_policy.merged.timeout_sec, - ) - result_path.write_text( - json.dumps(driver_result.model_dump(mode="json"), ensure_ascii=False, indent=2), - encoding="utf-8", - ) + self._snapshot_baseline_to_workspace(baseline_dir, workspace_dir) + self._prepare_shadow_workspace(workspace_dir, effective_policy) + self._append_event( + events_path, + { + "type": "attempt_started", + "attempt": attempt, + "agent_id": current_agent, + }, + ) - changed_paths = self._collect_changed_paths(baseline_dir, workspace_dir) - patch_text, patch_filtered_paths, builtin_checks = self._build_filtered_patch( - baseline_dir=baseline_dir, - workspace_dir=workspace_dir, - changed_paths=changed_paths, - driver_result=driver_result, - policy=effective_policy, - ) - patch_path.write_text(patch_text, encoding="utf-8") + driver_result = self._invoke_adapter( + manifest_entrypoint=active_manifest.entrypoint, + run_dir=run_dir, + workspace_dir=workspace_dir, + artifacts_dir=artifacts_dir, + job_path=job_path, + result_path=result_path, + events_path=events_path, + baseline_dir=baseline_dir, + run_id=job.run_id, + agent_id=current_agent, + attempt=attempt, + timeout_sec=effective_policy.merged.timeout_sec, + policy=effective_policy, + ) + result_path.write_text( + json.dumps(driver_result.model_dump(mode="json"), ensure_ascii=False, indent=2), + encoding="utf-8", + ) - validation = self._run_validators( - run_id=job.run_id, - workspace_dir=workspace_dir, - patch_path=patch_path, - builtin_checks=builtin_checks, - validator_specs=job.validators, - timeout_sec=effective_policy.merged.timeout_sec, - ) + changed_paths = self._collect_changed_paths(baseline_dir, workspace_dir) + patch_text, 
patch_filtered_paths, builtin_checks = self._build_filtered_patch( + baseline_dir=baseline_dir, + workspace_dir=workspace_dir, + changed_paths=changed_paths, + driver_result=driver_result, + policy=effective_policy, + ) + patch_path.write_text(patch_text, encoding="utf-8") - if not driver_result.changed_paths: - driver_result = driver_result.model_copy( - update={"changed_paths": patch_filtered_paths} + validation = self._run_validators( + run_id=job.run_id, + workspace_dir=workspace_dir, + patch_path=patch_path, + builtin_checks=builtin_checks, + validator_specs=job.validators, + timeout_sec=effective_policy.merged.timeout_sec, ) - last_patch_filtered_paths = patch_filtered_paths - - if self._has_policy_violation(validation): - driver_result = driver_result.model_copy( - update={ - "status": "policy_blocked", - "recommended_action": "reject", - "error": "execution produced out-of-scope or forbidden changes", - } + + if not driver_result.changed_paths: + driver_result = driver_result.model_copy( + update={"changed_paths": patch_filtered_paths} + ) + last_patch_filtered_paths = patch_filtered_paths + + if self._has_policy_violation(validation): + driver_result = driver_result.model_copy( + update={ + "status": "policy_blocked", + "recommended_action": "reject", + "error": "execution produced out-of-scope or forbidden changes", + } + ) + + last_result = driver_result + last_validation = validation + + self._append_event( + events_path, + { + "type": "attempt_completed", + "attempt": attempt, + "agent_id": current_agent, + "driver_status": driver_result.status, + "validation_passed": validation.passed, + }, ) - last_result = driver_result - last_validation = validation + if attempt_succeeded(driver_result=driver_result, validation=validation): + promotion_preflight, promotion = self._finalize_promotion( + job=job, + agent_id=current_agent, + patch_path=patch_path, + changed_files=patch_filtered_paths, + validation=validation, + policy=effective_policy, + 
artifacts_dir=artifacts_dir, + ) + final_status = ( + "promoted" + if promotion.mode is GitPromotionMode.DRAFT_PR + else "ready_for_promotion" + ) + if not promotion.success: + final_status = "blocked" + final_summary = RunSummary( + run_id=job.run_id, + final_status=final_status, + driver_result=driver_result, + validation=validation, + promotion_patch_uri=str(patch_path), + promotion_preflight=promotion_preflight, + promotion=promotion, + ) + cleanup_success = True + break + + if driver_result.status == "policy_blocked": + break + if final_summary is None: + final_status = forced_final_status or derive_terminal_status(last_result, last_validation) + final_summary = RunSummary( + run_id=job.run_id, + final_status=final_status, + driver_result=last_result, + validation=last_validation, + promotion_patch_uri=str(patch_path) if patch_path.exists() else None, + promotion_preflight=None, + promotion=None, + ) + except Exception as exc: + error_message = f"runner crashed: {exc.__class__.__name__}: {exc}" + last_result = self._contract_error_result( + run_id=job.run_id, + agent_id=current_agent, + attempt=attempt or 1, + message=error_message, + ) self._append_event( events_path, { - "type": "attempt_completed", + "type": "runner_exception", "attempt": attempt, "agent_id": current_agent, - "driver_status": driver_result.status, - "validation_passed": validation.passed, + "detail": error_message, }, ) - - if attempt_succeeded(driver_result=driver_result, validation=validation): - promotion_preflight, promotion = self._finalize_promotion( - job=job, - agent_id=current_agent, - patch_path=patch_path, - changed_files=patch_filtered_paths, - validation=validation, - policy=effective_policy, - artifacts_dir=artifacts_dir, - ) - final_status = "promoted" if promotion.mode is GitPromotionMode.DRAFT_PR else "ready_for_promotion" - if not promotion.success: - final_status = "blocked" - summary = RunSummary( + final_summary = RunSummary( + run_id=job.run_id, + 
final_status="failed", + driver_result=last_result, + validation=last_validation, + promotion_patch_uri=str(patch_path) if patch_path.exists() else None, + promotion_preflight=None, + promotion=None, + ) + finally: + if final_summary is None: + final_summary = RunSummary( run_id=job.run_id, - final_status=final_status, - driver_result=driver_result, - validation=validation, - promotion_patch_uri=str(patch_path), - promotion_preflight=promotion_preflight, - promotion=promotion, + final_status=forced_final_status or derive_terminal_status(last_result, last_validation), + driver_result=last_result, + validation=last_validation, + promotion_patch_uri=str(patch_path) if patch_path.exists() else None, + promotion_preflight=None, + promotion=None, ) - summary_path.write_text( - json.dumps(summary.model_dump(mode="json"), ensure_ascii=False, indent=2), - encoding="utf-8", - ) - self._cleanup_workspace( - workspace_dir=workspace_dir, - success=True, - policy=effective_policy, - ) - return summary + self._write_summary(summary_path=summary_path, summary=final_summary) + self._cleanup_workspace( + workspace_dir=workspace_dir, + success=cleanup_success, + policy=effective_policy, + ) - if driver_result.status == "policy_blocked": - break + return final_summary - final_status = forced_final_status or derive_terminal_status(last_result, last_validation) - summary = RunSummary( - run_id=job.run_id, - final_status=final_status, - driver_result=last_result, - validation=last_validation, - promotion_patch_uri=str(patch_path) if patch_path.exists() else None, - promotion_preflight=None, - promotion=None, - ) + @staticmethod + def _write_summary(*, summary_path: Path, summary: RunSummary) -> None: + summary_path.parent.mkdir(parents=True, exist_ok=True) summary_path.write_text( json.dumps(summary.model_dump(mode="json"), ensure_ascii=False, indent=2), encoding="utf-8", ) - self._cleanup_workspace( - workspace_dir=workspace_dir, - success=False, - policy=effective_policy, - ) - return 
summary @staticmethod def _has_policy_violation(validation: ValidationReport) -> bool: @@ -285,9 +443,152 @@ def _snapshot_repo_to_baseline(self, baseline_dir: Path) -> None: def _snapshot_baseline_to_workspace(self, baseline_dir: Path, workspace_dir: Path) -> None: if workspace_dir.exists(): - shutil.rmtree(workspace_dir) + self._rmtree_force(workspace_dir) shutil.copytree(baseline_dir, workspace_dir, dirs_exist_ok=True) + def _prepare_shadow_workspace(self, workspace_dir: Path, policy: EffectivePolicy) -> None: + if not workspace_dir.exists(): + return + + self._apply_mode_tree(workspace_dir, file_mode=0o444, dir_mode=0o555) + + writable_paths: set[Path] = set() + for pattern in policy.merged.allowed_paths: + writable_paths.update(self._resolve_writable_targets(workspace_dir, pattern)) + + for target in sorted(writable_paths, key=lambda item: (len(item.parts), str(item))): + self._make_target_writable(workspace_dir, target) + + locked_paths: set[Path] = set() + for pattern in policy.merged.forbidden_paths: + locked_paths.update(self._resolve_matching_paths(workspace_dir, pattern)) + + for target in sorted(locked_paths, key=lambda item: len(item.parts), reverse=True): + self._make_target_read_only(target) + + def _resolve_writable_targets(self, workspace_dir: Path, pattern: str) -> set[Path]: + normalized = pattern.replace("\\", "/").strip("/") + if not normalized: + return set() + + targets = self._resolve_matching_paths(workspace_dir, normalized) + if targets: + return targets + + prefix = self._glob_prefix(normalized) + if prefix: + candidate = workspace_dir / prefix + if candidate.exists(): + return {candidate} + return {candidate} + + candidate = workspace_dir / normalized + if candidate.exists(): + return {candidate} + return {candidate} + + def _resolve_matching_paths(self, workspace_dir: Path, pattern: str) -> set[Path]: + normalized = pattern.replace("\\", "/").strip("/") + if not normalized: + return set() + + matched: set[Path] = set() + for path in 
[workspace_dir, *workspace_dir.rglob("*")]: + rel = path.relative_to(workspace_dir).as_posix() if path != workspace_dir else "." + if rel == ".": + continue + if self._matches_any(rel, [normalized]): + matched.add(path) + return matched + + @staticmethod + def _nearest_existing_ancestor(workspace_dir: Path, candidate: Path) -> Path: + probe = candidate + while probe != workspace_dir and not probe.exists(): + probe = probe.parent + if probe.exists(): + return probe + return workspace_dir + + def _make_target_writable(self, workspace_dir: Path, target: Path) -> None: + for ancestor in reversed(target.parents): + if ancestor == workspace_dir.parent or ancestor == target: + continue + if workspace_dir not in ancestor.parents and ancestor != workspace_dir: + continue + if ancestor.exists(): + self._chmod_path(ancestor, 0o777) + + if target.is_dir(): + self._apply_mode_tree(target, file_mode=0o666, dir_mode=0o777) + return + + if target.exists(): + self._chmod_path(target, 0o666) + if target.parent.exists(): + self._chmod_path(target.parent, 0o777) + return + + self._ensure_writable_path_chain(workspace_dir=workspace_dir, target=target) + if not target.suffix: + try: + target.mkdir(parents=True, exist_ok=True) + except OSError: + pass + if target.exists() and target.is_dir(): + self._apply_mode_tree(target, file_mode=0o666, dir_mode=0o777) + return + + if target.parent.exists(): + self._chmod_path(target.parent, 0o777) + + def _ensure_writable_path_chain(self, *, workspace_dir: Path, target: Path) -> None: + self._chmod_path(workspace_dir, 0o777) + try: + relative_target = target.relative_to(workspace_dir) + except ValueError: + return + + chain_parts = relative_target.parts if not target.suffix else relative_target.parts[:-1] + current = workspace_dir + for part in chain_parts: + current = current / part + if not current.exists(): + try: + current.mkdir(exist_ok=True) + except OSError: + return + self._chmod_path(current, 0o777) + + def _make_target_read_only(self, 
target: Path) -> None: + if target.is_dir(): + self._apply_mode_tree(target, file_mode=0o444, dir_mode=0o555) + return + if target.exists(): + self._chmod_path(target, 0o444) + if target.parent.exists(): + self._chmod_path(target.parent, 0o555) + + def _apply_mode_tree(self, root: Path, *, file_mode: int, dir_mode: int) -> None: + if not root.exists(): + return + if root.is_dir(): + self._chmod_path(root, dir_mode) + for path in root.rglob("*"): + if path.is_dir(): + self._chmod_path(path, dir_mode) + else: + self._chmod_path(path, file_mode) + return + self._chmod_path(root, file_mode) + + @staticmethod + def _chmod_path(path: Path, mode: int) -> None: + try: + path.chmod(mode) + except OSError: + return + def _invoke_adapter( self, *, @@ -303,6 +604,7 @@ def _invoke_adapter( agent_id: str, attempt: int, timeout_sec: int, + policy: EffectivePolicy, ) -> DriverResult: if result_path.exists(): result_path.unlink() @@ -316,7 +618,10 @@ def _invoke_adapter( message=f"adapter entrypoint not found: {entrypoint}", ) - env = dict(os.environ) + if self._uses_openhands_ai_lab_runtime(manifest_entrypoint): + env = self._build_openhands_ai_lab_env() + else: + env = dict(os.environ) env.update( { "AEP_RUN_DIR": str(run_dir), @@ -334,34 +639,191 @@ def _invoke_adapter( stderr_log = artifacts_dir / "stderr.log" started = time.perf_counter() - try: - completed = subprocess.run( + completed: subprocess.CompletedProcess[str] | None = None + probe_signature: tuple[tuple[str, int, int], ...] | None = None + last_probed_signature: tuple[tuple[str, int, int], ...] 
| None = None + stable_polls = 0 + stall_timeout_sec = self._stall_progress_timeout_sec(timeout_sec) + last_scoped_progress_signature = self._scoped_progress_signature( + baseline_dir=baseline_dir, + workspace_dir=workspace_dir, + allowed_paths=policy.merged.allowed_paths, + ) + last_state_progress_signature = self._runtime_heartbeat_signature(workspace_dir=workspace_dir) + last_progress_at = started + first_progress_ms: int | None = None + first_scoped_write_ms: int | None = None + first_state_heartbeat_ms: int | None = None + process_group_id: int | None = None + + with stdout_log.open("a", encoding="utf-8") as stdout_handle, stderr_log.open( + "a", encoding="utf-8" + ) as stderr_handle: + stdout_handle.write(f"\n=== attempt {attempt} ({agent_id}) ===\n") + stderr_handle.write(f"\n=== attempt {attempt} ({agent_id}) ===\n") + stdout_handle.flush() + stderr_handle.flush() + + process = subprocess.Popen( [str(entrypoint)], cwd=self._repo_root, env=env, - capture_output=True, + stdout=stdout_handle, + stderr=stderr_handle, text=True, - timeout=timeout_sec, + start_new_session=True, ) - duration_ms = int((time.perf_counter() - started) * 1000) - except subprocess.TimeoutExpired: - return DriverResult( + if hasattr(os, "getpgid"): + try: + process_group_id = os.getpgid(process.pid) + except OSError: + process_group_id = None + + while True: + returncode = process.poll() + now = time.perf_counter() + duration_ms = int((now - started) * 1000) + if returncode is not None: + completed = subprocess.CompletedProcess( + args=[str(entrypoint)], + returncode=returncode, + stdout="", + stderr="", + ) + break + + if duration_ms >= timeout_sec * 1000: + self._terminate_process(process, process_group_id=process_group_id) + return DriverResult( + run_id=run_id, + agent_id=agent_id, + attempt=attempt, + status="timed_out", + summary=f"adapter timed out after {timeout_sec}s", + metrics=DriverMetrics( + duration_ms=duration_ms, + first_progress_ms=first_progress_ms, + 
first_scoped_write_ms=first_scoped_write_ms, + first_state_heartbeat_ms=first_state_heartbeat_ms, + ), + recommended_action="fallback", + error=f"timeout after {timeout_sec}s", + ) + + current_scoped_progress_signature = self._scoped_progress_signature( + baseline_dir=baseline_dir, + workspace_dir=workspace_dir, + allowed_paths=policy.merged.allowed_paths, + ) + current_state_progress_signature = self._runtime_heartbeat_signature( + workspace_dir=workspace_dir, + ) + scoped_progress_changed = ( + current_scoped_progress_signature != last_scoped_progress_signature + ) + state_progress_changed = ( + current_state_progress_signature != last_state_progress_signature + ) + if scoped_progress_changed or state_progress_changed: + if ( + scoped_progress_changed + and first_scoped_write_ms is None + and current_scoped_progress_signature + ): + first_scoped_write_ms = duration_ms + if ( + state_progress_changed + and first_state_heartbeat_ms is None + and current_state_progress_signature + ): + first_state_heartbeat_ms = duration_ms + if first_progress_ms is None: + first_candidates = [ + value + for value in ( + first_scoped_write_ms, + first_state_heartbeat_ms, + ) + if value is not None + ] + if first_candidates: + first_progress_ms = min(first_candidates) + last_scoped_progress_signature = current_scoped_progress_signature + last_state_progress_signature = current_state_progress_signature + last_progress_at = now + elif (now - last_progress_at) >= stall_timeout_sec: + self._terminate_process(process, process_group_id=process_group_id) + stall_error = f"no workspace progress for {stall_timeout_sec}s" + return DriverResult( + run_id=run_id, + agent_id=agent_id, + attempt=attempt, + status="stalled_no_progress", + summary=f"adapter stalled after {stall_timeout_sec}s without workspace progress", + metrics=DriverMetrics( + duration_ms=duration_ms, + first_progress_ms=first_progress_ms, + first_scoped_write_ms=first_scoped_write_ms, + 
first_state_heartbeat_ms=first_state_heartbeat_ms, + ), + recommended_action="fallback", + error=stall_error, + ) + + current_signature, current_paths = self._changed_python_signature( + baseline_dir=baseline_dir, + workspace_dir=workspace_dir, + allowed_paths=policy.merged.allowed_paths, + ) + if current_signature and current_signature == probe_signature: + stable_polls += 1 + elif current_signature: + probe_signature = current_signature + stable_polls = 1 + else: + probe_signature = None + stable_polls = 0 + + if ( + current_signature + and stable_polls >= 2 + and current_signature != last_probed_signature + ): + probe_failure = self._run_fast_fail_probe( + workspace_dir=workspace_dir, + changed_python_paths=current_paths, + ) + last_probed_signature = current_signature + if probe_failure is not None: + self._terminate_process(process, process_group_id=process_group_id) + return DriverResult( + run_id=run_id, + agent_id=agent_id, + attempt=attempt, + status="failed", + summary="adapter aborted by fast-fail probe", + changed_paths=self._collect_changed_paths(baseline_dir, workspace_dir), + metrics=DriverMetrics( + duration_ms=duration_ms, + first_progress_ms=first_progress_ms, + first_scoped_write_ms=first_scoped_write_ms, + first_state_heartbeat_ms=first_state_heartbeat_ms, + ), + recommended_action="fallback", + error=probe_failure, + ) + + time.sleep(2) + + duration_ms = int((time.perf_counter() - started) * 1000) + if completed is None: + return self._contract_error_result( run_id=run_id, agent_id=agent_id, attempt=attempt, - status="timed_out", - summary=f"adapter timed out after {timeout_sec}s", - recommended_action="fallback", - error=f"timeout after {timeout_sec}s", + message="adapter process exited without completion record", ) - with stdout_log.open("a", encoding="utf-8") as handle: - handle.write(f"\n=== attempt {attempt} ({agent_id}) ===\n") - handle.write(completed.stdout or "") - with stderr_log.open("a", encoding="utf-8") as handle: - 
handle.write(f"\n=== attempt {attempt} ({agent_id}) ===\n") - handle.write(completed.stderr or "") - if not result_path.exists(): return self._contract_error_result( run_id=run_id, @@ -381,7 +843,26 @@ def _invoke_adapter( message=f"invalid driver_result.json: {exc}", ) - merged_metrics = result.metrics.model_copy(update={"duration_ms": duration_ms}) + merged_metrics = result.metrics.model_copy( + update={ + "duration_ms": duration_ms, + "first_progress_ms": ( + result.metrics.first_progress_ms + if result.metrics.first_progress_ms is not None + else first_progress_ms + ), + "first_scoped_write_ms": ( + result.metrics.first_scoped_write_ms + if result.metrics.first_scoped_write_ms is not None + else first_scoped_write_ms + ), + "first_state_heartbeat_ms": ( + result.metrics.first_state_heartbeat_ms + if result.metrics.first_state_heartbeat_ms is not None + else first_state_heartbeat_ms + ), + } + ) result = result.model_copy( update={"metrics": merged_metrics, "attempt": attempt, "agent_id": agent_id} ) @@ -430,16 +911,17 @@ def _build_filtered_patch( ) ) + relevant_changed = [path for path in changed_paths if not _is_benign_runtime_artifact(path)] forbidden_changed = [ - path for path in changed_paths if self._matches_any(path, policy.merged.forbidden_paths) + path for path in relevant_changed if self._matches_any(path, policy.merged.forbidden_paths) ] runtime_changed = [ - path for path in changed_paths if path.startswith(_RUNTIME_DENY_PREFIXES) + path for path in relevant_changed if path.startswith(_RUNTIME_DENY_PREFIXES) ] allowed_changed = [ path - for path in changed_paths + for path in relevant_changed if self._matches_any(path, policy.merged.allowed_paths) and not self._matches_any(path, policy.merged.forbidden_paths) and not path.startswith(_RUNTIME_DENY_PREFIXES) @@ -451,7 +933,7 @@ def _build_filtered_patch( passed=len( [ p - for p in changed_paths + for p in relevant_changed if p not in allowed_changed and p not in forbidden_changed and p not in 
runtime_changed @@ -478,8 +960,8 @@ def _build_filtered_patch( checks.append( ValidationCheck( id="builtin.max_changed_files", - passed=len(changed_paths) <= policy.merged.max_changed_files, - detail=f"changed={len(changed_paths)} limit={policy.merged.max_changed_files}", + passed=len(relevant_changed) <= policy.merged.max_changed_files, + detail=f"changed={len(relevant_changed)} limit={policy.merged.max_changed_files}", ) ) @@ -533,6 +1015,275 @@ def _build_filtered_patch( ) return patch_text, allowed_changed, checks + def _meaningful_progress_signature( + self, + *, + baseline_dir: Path, + workspace_dir: Path, + allowed_paths: list[str], + ) -> tuple[tuple[str, int, int], ...]: + return self._scoped_progress_signature( + baseline_dir=baseline_dir, + workspace_dir=workspace_dir, + allowed_paths=allowed_paths, + ) + self._state_heartbeat_signature(workspace_dir=workspace_dir) + + def _scoped_progress_signature( + self, + *, + baseline_dir: Path, + workspace_dir: Path, + allowed_paths: list[str], + ) -> tuple[tuple[str, int, int], ...]: + items: list[tuple[str, int, int]] = [] + changed_paths = self._collect_changed_paths(baseline_dir, workspace_dir) + for rel in sorted(changed_paths): + if not self._matches_any(rel, allowed_paths): + continue + path = workspace_dir / rel + if not path.exists(): + items.append((f"delete:{rel}", 0, 0)) + continue + stat = path.stat() + items.append((f"change:{rel}", stat.st_mtime_ns, stat.st_size)) + return tuple(items) + + def _state_heartbeat_signature( + self, + *, + workspace_dir: Path, + ) -> tuple[tuple[str, int, int], ...]: + items: list[tuple[str, int, int]] = [] + state_root = workspace_dir / ".openhands-state" + if state_root.exists(): + for path in sorted(candidate for candidate in state_root.rglob("*") if candidate.is_file()): + stat = path.stat() + items.append( + ( + f"state:{path.relative_to(workspace_dir).as_posix()}", + stat.st_mtime_ns, + stat.st_size, + ) + ) + return tuple(items) + + def 
_runtime_heartbeat_signature( + self, + *, + workspace_dir: Path, + ) -> tuple[tuple[str, int, int], ...]: + return self._state_heartbeat_signature(workspace_dir=workspace_dir) + + def _changed_python_signature( + self, + *, + baseline_dir: Path, + workspace_dir: Path, + allowed_paths: list[str], + ) -> tuple[tuple[tuple[str, int, int], ...] | None, list[str]]: + changed_paths = self._collect_changed_paths(baseline_dir, workspace_dir) + python_paths = [ + path + for path in changed_paths + if path.endswith(".py") + and path.startswith(("src/", "tests/")) + and self._matches_any(path, allowed_paths) + ] + if not python_paths: + return None, [] + + signature_items: list[tuple[str, int, int]] = [] + for rel in sorted(python_paths): + path = workspace_dir / rel + if not path.exists(): + continue + stat = path.stat() + signature_items.append((rel, stat.st_mtime_ns, stat.st_size)) + if not signature_items: + return None, [] + return tuple(signature_items), [item[0] for item in signature_items] + + def _run_fast_fail_probe( + self, + *, + workspace_dir: Path, + changed_python_paths: list[str], + ) -> str | None: + if not changed_python_paths: + return None + + compile_probe = subprocess.run( + [sys.executable, "-m", "py_compile", *changed_python_paths], + cwd=workspace_dir, + capture_output=True, + text=True, + check=False, + ) + compile_detail = (compile_probe.stderr or compile_probe.stdout or "").strip() + if compile_probe.returncode != 0 and "SyntaxError" in compile_detail: + return compile_detail[:2000] + + importable_modules: list[str] = [] + for rel in changed_python_paths: + if not rel.startswith("src/"): + continue + module_parts = list(Path(rel).with_suffix("").parts[1:]) + if module_parts and module_parts[-1] == "__init__": + module_parts = module_parts[:-1] + if module_parts: + importable_modules.append(".".join(module_parts)) + + if not importable_modules: + return None + + import_probe = subprocess.run( + [ + sys.executable, + "-c", + ( + "import importlib, 
sys; " + "mods = sys.argv[1:]; " + "[(importlib.import_module(name), None) for name in mods]" + ), + *importable_modules, + ], + cwd=workspace_dir, + env={ + **os.environ, + "PYTHONPATH": str(workspace_dir / "src"), + "PYTHONDONTWRITEBYTECODE": "1", + }, + capture_output=True, + text=True, + check=False, + ) + import_detail = (import_probe.stderr or import_probe.stdout or "").strip() + if import_probe.returncode != 0 and any( + token in import_detail for token in ("ModuleNotFoundError", "ImportError", "SyntaxError") + ): + return import_detail[:2000] + return None + + @staticmethod + def _stall_progress_timeout_sec(timeout_sec: int) -> int: + return min(timeout_sec, min(180, max(60, max(1, timeout_sec // 4)))) + + def _preflight_agent_environment(self, *, agent_id: str, manifest_entrypoint: str) -> str | None: + if agent_id != "openhands": + return None + if Path(manifest_entrypoint).name != "openhands_adapter.sh": + return None + if str(os.environ.get("OPENHANDS_DRY_RUN") or "0").strip() == "1": + return None + + if not self._uses_openhands_ai_lab_runtime(manifest_entrypoint): + return None + preflight_env = self._build_openhands_ai_lab_env() + + override_command = str(os.environ.get("OPENHANDS_PREFLIGHT_CMD") or "").strip() + if override_command: + completed = subprocess.run( + override_command, + cwd=self._repo_root, + env=preflight_env, + shell=True, + capture_output=True, + text=True, + check=False, + ) + else: + script = self._repo_root / "scripts" / "launch_ai_lab.sh" + if not script.exists(): + return f"EnvironmentCheckFailed: launch_ai_lab.sh not found at {script}" + completed = subprocess.run( + ["bash", str(script), "status"], + cwd=self._repo_root, + env=preflight_env, + capture_output=True, + text=True, + check=False, + ) + + if completed.returncode == 0: + return None + + detail = (completed.stderr or completed.stdout or "").strip() + if not detail: + detail = f"preflight exited with code {completed.returncode}" + collapsed = re.sub(r"\s+", " ", 
detail)[:400] + return f"EnvironmentCheckFailed: {collapsed}" + + def _job_for_attempt( + self, + *, + job: JobSpec, + agent_id: str, + attempt: int, + last_result: DriverResult, + last_validation: ValidationReport, + ) -> JobSpec: + if attempt <= 1 or agent_id != job.agent_id: + return job + + feedback = self._build_retry_feedback(last_result=last_result, last_validation=last_validation) + if feedback is None: + return job + + metadata = dict(job.metadata) + metadata["retry_feedback"] = feedback + metadata["retry_attempt"] = attempt + metadata["retry_source_status"] = last_result.status + return job.model_copy( + update={ + "task": ( + f"{job.task.rstrip()}\n\n" + "Retry feedback from the previous attempt. Fix these exact failures before making any new changes:\n" + f"{feedback}\n" + ), + "metadata": metadata, + } + ) + + @staticmethod + def _build_retry_feedback( + *, + last_result: DriverResult, + last_validation: ValidationReport, + ) -> str | None: + if last_validation.passed: + return None + + failed_checks = [check for check in last_validation.checks if not check.passed and check.detail.strip()] + error_text = str(last_result.error or "").strip() + if not failed_checks and not error_text: + return None + + parts = [ + f"Previous driver status: {last_result.status}", + f"Previous driver summary: {last_result.summary}", + ] + if last_result.changed_paths: + parts.append("Previous changed paths:") + parts.extend(f"- {path}" for path in last_result.changed_paths) + if failed_checks: + parts.append("Raw validator failures:") + for check in failed_checks: + parts.append(f"[{check.id}]") + parts.append(check.detail.strip()) + if error_text: + parts.append("Driver error:") + parts.append(error_text) + return "\n".join(parts) + + @staticmethod + def _retry_skip_reason(result: DriverResult) -> str | None: + if result.status == "stalled_no_progress": + return "stalled_no_progress" + error_text = str(result.error or result.summary or "").strip() + if result.status == 
"contract_error" and error_text.startswith("EnvironmentCheckFailed:"): + return "environment_preflight_failed" + return None + def _run_validators( self, *, @@ -615,6 +1366,7 @@ def _contract_error_result( agent_id: str, attempt: int, message: str, + recommended_action: str = "reject", ) -> DriverResult: return DriverResult( run_id=run_id, @@ -622,7 +1374,7 @@ def _contract_error_result( attempt=attempt, status="contract_error", summary=message, - recommended_action="reject", + recommended_action=recommended_action, error=message, ) @@ -691,6 +1443,15 @@ def _matches_any(path: str, patterns: list[str]) -> bool: return True return False + @staticmethod + def _glob_prefix(value: str) -> str: + prefix: list[str] = [] + for char in value: + if char in "*?[": + break + prefix.append(char) + return "".join(prefix).rstrip("/") + @staticmethod def _collect_files(root: Path) -> list[str]: files: list[str] = [] @@ -743,6 +1504,74 @@ def _append_event(events_path: Path, payload: dict[str, Any]) -> None: with events_path.open("a", encoding="utf-8") as handle: handle.write(json.dumps(payload, ensure_ascii=False) + "\n") + @staticmethod + def _terminate_process( + process: subprocess.Popen[str], + *, + process_group_id: int | None = None, + ) -> None: + if process.poll() is not None: + return + + def _send(sig: signal.Signals) -> None: + delivered = False + if process_group_id is not None and hasattr(os, "killpg"): + try: + os.killpg(process_group_id, sig) + delivered = True + except (OSError, ProcessLookupError): + delivered = False + + if delivered: + return + + if sig == signal.SIGTERM: + process.terminate() + else: + process.kill() + + try: + _send(signal.SIGTERM) + except (OSError, ProcessLookupError): + return + + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + try: + _send(signal.SIGKILL) + except (OSError, ProcessLookupError): + return + try: + process.wait(timeout=5) + except subprocess.TimeoutExpired: + try: + process.kill() + except OSError: + 
return + try: + process.wait(timeout=1) + except subprocess.TimeoutExpired: + return + + @staticmethod + def _rmtree_force(path: Path) -> None: + if not path.exists(): + return + + def onexc(func, failed_path, excinfo) -> None: + _ = excinfo + try: + os.chmod(failed_path, 0o777) + except OSError: + pass + try: + func(failed_path) + except OSError: + pass + + shutil.rmtree(path, onexc=onexc) + def _git_ref(self, args: list[str], *, default: str) -> str: completed = subprocess.run( ["git", *args], @@ -763,7 +1592,7 @@ def _sanitize_branch_name(value: str) -> str: @staticmethod def _cleanup_workspace(*, workspace_dir: Path, success: bool, policy: EffectivePolicy) -> None: if success and policy.merged.cleanup_on_success: - shutil.rmtree(workspace_dir, ignore_errors=True) + AgentExecutionRunner._rmtree_force(workspace_dir) return if not success and not policy.merged.retain_workspace_on_failure: - shutil.rmtree(workspace_dir, ignore_errors=True) + AgentExecutionRunner._rmtree_force(workspace_dir) diff --git a/src/autoresearch/shared/autoresearch_planner_contract.py b/src/autoresearch/shared/autoresearch_planner_contract.py new file mode 100644 index 00000000..024f7f77 --- /dev/null +++ b/src/autoresearch/shared/autoresearch_planner_contract.py @@ -0,0 +1,138 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any, Literal + +from pydantic import Field, field_validator + +from autoresearch.agent_protocol.models import JobSpec, RunSummary +from autoresearch.shared.models import JobStatus, StrictModel +from autoresearch.shared.openhands_controlled_contract import ControlledExecutionRequest +from autoresearch.shared.openhands_worker_contract import OpenHandsWorkerJobSpec +from autoresearch.shared.remote_run_contract import RemoteRunRecord + + +class AutoResearchPlannerRequest(StrictModel): + goal: str = Field( + default="Scan the repo for the next safe patch-only improvement.", + min_length=1, + ) + 
max_candidates: int = Field(default=5, ge=1, le=20) + pipeline_target: Literal["patch", "draft_pr"] = "draft_pr" + target_base_branch: str = Field(default="main", min_length=1) + max_iterations: int = Field(default=2, ge=1, le=5) + approval_granted: bool = False + include_upstream_watch: bool = False + telegram_uid: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("goal", "target_base_branch") + @classmethod + def _normalize_non_empty_text(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("value must not be empty") + return normalized + + @field_validator("pipeline_target") + @classmethod + def _normalize_pipeline_target(cls, value: str) -> str: + normalized = value.strip().lower() + if normalized not in {"patch", "draft_pr"}: + raise ValueError("pipeline_target must be patch or draft_pr") + return normalized + + @field_validator("telegram_uid") + @classmethod + def _normalize_telegram_uid(cls, value: str | None) -> str | None: + if value is None: + return None + normalized = value.strip() + return normalized or None + + +class AutoResearchPlanDispatchStatus(str, Enum): + PENDING = "pending" + DISPATCHING = "dispatching" + DISPATCHED = "dispatched" + FAILED = "failed" + + +class UpstreamWatchDecision(str, Enum): + SKIP = "skip" + REVIEW = "review" + FAILED = "failed" + + +class UpstreamWatchCommitRead(StrictModel): + sha: str + title: str + committed_at: datetime | None = None + touched_paths: list[str] = Field(default_factory=list) + + +class UpstreamWatchRead(StrictModel): + upstream_url: str + default_branch: str = "main" + latest_commit_sha: str | None = None + latest_commit_title: str | None = None + latest_commit_at: datetime | None = None + recent_commits: list[UpstreamWatchCommitRead] = Field(default_factory=list) + changed_paths: list[str] = Field(default_factory=list) + relevant_paths: list[str] = Field(default_factory=list) + focus_areas: list[str] = 
Field(default_factory=list) + decision: UpstreamWatchDecision = UpstreamWatchDecision.SKIP + summary: str = "" + cleaned_up: bool = False + cleanup_paths: list[str] = Field(default_factory=list) + error: str | None = None + + +class AutoResearchPlannerEvidenceRead(StrictModel): + kind: Literal["marker", "test_gap", "hotspot"] + path: str = Field(..., min_length=1) + line: int | None = None + detail: str = "" + weight: float = 0.0 + + +class AutoResearchPlannerCandidateRead(StrictModel): + candidate_id: str + title: str + summary: str + category: Literal["marker_backlog", "test_gap"] + priority_score: float = 0.0 + source_path: str + allowed_paths: list[str] = Field(default_factory=list) + suggested_test_paths: list[str] = Field(default_factory=list) + test_command: str + evidence: list[AutoResearchPlannerEvidenceRead] = Field(default_factory=list) + metadata: dict[str, Any] = Field(default_factory=dict) + + +class AutoResearchPlanRead(StrictModel): + plan_id: str + goal: str + status: JobStatus + summary: str = "" + created_at: datetime + updated_at: datetime + selected_candidate: AutoResearchPlannerCandidateRead | None = None + candidates: list[AutoResearchPlannerCandidateRead] = Field(default_factory=list) + worker_spec: OpenHandsWorkerJobSpec | None = None + controlled_request: ControlledExecutionRequest | None = None + agent_job: JobSpec | None = None + upstream_watch: UpstreamWatchRead | None = None + telegram_uid: str | None = None + panel_action_url: str | None = None + notification_sent: bool = False + dispatch_status: AutoResearchPlanDispatchStatus = AutoResearchPlanDispatchStatus.PENDING + dispatch_requested_at: datetime | None = None + dispatch_completed_at: datetime | None = None + dispatch_requested_by: str | None = None + dispatch_run: RemoteRunRecord | None = None + run_summary: RunSummary | None = None + dispatch_error: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + error: str | None = None diff --git 
a/src/autoresearch/shared/housekeeper_contract.py b/src/autoresearch/shared/housekeeper_contract.py new file mode 100644 index 00000000..c7030c6e --- /dev/null +++ b/src/autoresearch/shared/housekeeper_contract.py @@ -0,0 +1,163 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any, Literal + +from pydantic import Field + +from autoresearch.shared.models import StrictModel + + +class HousekeeperMode(str, Enum): + DAY_SAFE = "day_safe" + NIGHT_READONLY_EXPLORE = "night_readonly_explore" + NIGHT_EXPLORE = "night_explore" + + +class HousekeeperChangeReason(str, Enum): + SCHEDULE = "schedule" + MANUAL_PANEL = "manual_panel" + MANUAL_API = "manual_api" + CIRCUIT_BREAKER = "circuit_breaker" + RECOVERED_FROM_CIRCUIT_BREAKER = "recovered_from_circuit_breaker" + + +class CircuitBreakerStatus(str, Enum): + CLOSED = "closed" + OPEN = "open" + + +class AdmissionRiskLevel(str, Enum): + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + + +class DeferredReason(str, Enum): + DEFERRED_TO_NIGHT = "deferred_to_night" + APPROVAL_REQUIRED = "approval_required" + CIRCUIT_BREAKER_OPEN = "circuit_breaker_open" + BUDGET_EXHAUSTED = "budget_exhausted" + DEDUP_BLOCKED = "dedup_blocked" + + +class ExplorationBlockerReason(str, Enum): + CIRCUIT_BREAKER_OPEN = "circuit_breaker_open" + ENV_MISSING = "env_missing" + PERMISSION_DENIED = "permission_denied" + EMPTY_SCOPE = "empty_scope" + STALLED_NO_PROGRESS = "stalled_no_progress" + VALIDATION_FAILED = "validation_failed" + APPROVAL_PENDING = "approval_pending" + DIRTY_REPO = "dirty_repo" + BUDGET_EXHAUSTED = "budget_exhausted" + UNKNOWN = "unknown" + + +class CircuitBreakerStateRead(StrictModel): + status: CircuitBreakerStatus = CircuitBreakerStatus.CLOSED + triggered_at: datetime | None = None + reason: str | None = None + consecutive_failures: int = 0 + recent_failure_rate: float = 0.0 + acknowledged_at: datetime | None = None + metadata: dict[str, Any] = 
Field(default_factory=dict) + + +class ExecutionProfileRead(StrictModel): + profile_name: HousekeeperMode + pipeline_target: Literal["patch", "draft_pr"] + max_iterations: int = Field(default=1, ge=1, le=5) + auto_dispatch_allowed: bool = False + parallelism: int = Field(default=1, ge=1, le=32) + allow_draft_pr: bool = False + allow_repo_write: bool = True + allow_network: bool = False + allow_long_task_minutes: int = Field(default=15, ge=1, le=1440) + + +class TaskAdmissionAssessmentRead(StrictModel): + plan_shape: Literal["single_task", "task_dag", "planner_candidate", "media_job", "unknown"] = "unknown" + estimated_runtime_minutes: int = Field(default=15, ge=0, le=1440) + requires_repo_write: bool = True + requires_network: bool = False + fanout_count: int = Field(default=1, ge=0, le=100) + risk_level: AdmissionRiskLevel = AdmissionRiskLevel.MEDIUM + + +class HousekeeperStateRead(StrictModel): + state_id: str = "housekeeper" + scheduled_mode: HousekeeperMode = HousekeeperMode.DAY_SAFE + manual_override_mode: HousekeeperMode | None = None + effective_mode: HousekeeperMode = HousekeeperMode.DAY_SAFE + effective_until: datetime | None = None + reason: HousekeeperChangeReason = HousekeeperChangeReason.SCHEDULE + changed_by: str = "system" + last_changed_at: datetime + circuit_breaker_state: CircuitBreakerStateRead = Field(default_factory=CircuitBreakerStateRead) + last_summary_at: datetime | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +class HousekeeperModeUpdateRequest(StrictModel): + action: Literal["set_manual_override", "clear_manual_override", "ack_circuit_breaker", "apply_schedule"] + target_mode: HousekeeperMode | None = None + changed_by: str = Field(..., min_length=1) + reason: HousekeeperChangeReason + effective_until: datetime | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + +class NightBudgetStateRead(StrictModel): + budget_id: str = "night_budget" + window_start: datetime + window_end: datetime + 
dispatches_used: int = 0 + draft_prs_used: int = 0 + worker_minutes_used: int = 0 + max_dispatches_per_night: int = 4 + max_draft_pr_per_night: int = 2 + max_worker_minutes_per_night: int = 180 + updated_at: datetime + metadata: dict[str, Any] = Field(default_factory=dict) + + +class HousekeeperTickRead(StrictModel): + executed: bool = False + skipped_reason: str | None = None + target_kind: Literal["manager_dispatch", "planner_dispatch", "none"] = "none" + target_id: str | None = None + blocker_reason: ExplorationBlockerReason | None = None + summary: str = "" + state: HousekeeperStateRead + budget: NightBudgetStateRead | None = None + + +class HousekeeperMorningSummaryRead(StrictModel): + sent: bool = False + summary_text: str + completed_items: list[str] = Field(default_factory=list) + blocked_items: list[str] = Field(default_factory=list) + decision_items: list[str] = Field(default_factory=list) + queue_items: list[str] = Field(default_factory=list) + state: HousekeeperStateRead + + +class ExplorationDedupKeyRead(StrictModel): + repo_id: str + target_scope_hash: str + intent_id: str + normalized_goal_hash: str + + +class ExplorationRecordRead(StrictModel): + record_id: str + dedup_key: ExplorationDedupKeyRead + target_kind: Literal["manager_dispatch", "planner_dispatch", "media_job"] + target_id: str + blocker_reason: ExplorationBlockerReason | None = None + final_status: str | None = None + created_at: datetime + updated_at: datetime + metadata: dict[str, Any] = Field(default_factory=dict) diff --git a/src/autoresearch/shared/manager_agent_contract.py b/src/autoresearch/shared/manager_agent_contract.py new file mode 100644 index 00000000..33310711 --- /dev/null +++ b/src/autoresearch/shared/manager_agent_contract.py @@ -0,0 +1,92 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any, Literal + +from pydantic import Field, field_validator + +from autoresearch.agent_protocol.models import JobSpec, 
RunSummary +from autoresearch.shared.models import JobStatus, StrictModel +from autoresearch.shared.openhands_controlled_contract import ControlledExecutionRequest +from autoresearch.shared.openhands_worker_contract import OpenHandsWorkerJobSpec + + +class ManagerDispatchRequest(StrictModel): + prompt: str = Field(..., min_length=1) + pipeline_target: Literal["patch", "draft_pr"] = "draft_pr" + target_base_branch: str = Field(default="main", min_length=1) + max_iterations: int = Field(default=2, ge=1, le=5) + approval_granted: bool = False + auto_dispatch: bool = True + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("prompt", "target_base_branch") + @classmethod + def _normalize_text(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("value must not be empty") + return normalized + + +class ManagerIntentRead(StrictModel): + intent_id: str + label: str + summary: str + matched_keywords: list[str] = Field(default_factory=list) + allowed_paths: list[str] = Field(default_factory=list) + suggested_test_paths: list[str] = Field(default_factory=list) + metadata: dict[str, Any] = Field(default_factory=dict) + + +class ManagerPlanStrategy(str, Enum): + SINGLE_TASK = "single_task" + TASK_DAG = "task_dag" + + +class ManagerTaskStage(str, Enum): + BACKEND = "backend" + TESTS = "tests" + FRONTEND = "frontend" + GENERIC = "generic" + + +class ManagerPlanTaskRead(StrictModel): + task_id: str + title: str + summary: str + stage: ManagerTaskStage = ManagerTaskStage.GENERIC + depends_on: list[str] = Field(default_factory=list) + status: JobStatus = JobStatus.CREATED + worker_spec: OpenHandsWorkerJobSpec | None = None + controlled_request: ControlledExecutionRequest | None = None + agent_job: JobSpec | None = None + run_summary: RunSummary | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + error: str | None = None + + +class ManagerExecutionPlanRead(StrictModel): + plan_id: str + strategy: 
ManagerPlanStrategy = ManagerPlanStrategy.SINGLE_TASK + summary: str = "" + tasks: list[ManagerPlanTaskRead] = Field(default_factory=list) + + +class ManagerDispatchRead(StrictModel): + dispatch_id: str + prompt: str + normalized_goal: str + status: JobStatus + summary: str = "" + created_at: datetime + updated_at: datetime + selected_intent: ManagerIntentRead | None = None + execution_plan: ManagerExecutionPlanRead | None = None + worker_spec: OpenHandsWorkerJobSpec | None = None + controlled_request: ControlledExecutionRequest | None = None + agent_job: JobSpec | None = None + run_summary: RunSummary | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + error: str | None = None diff --git a/src/autoresearch/shared/media_job_contract.py b/src/autoresearch/shared/media_job_contract.py new file mode 100644 index 00000000..95ad444a --- /dev/null +++ b/src/autoresearch/shared/media_job_contract.py @@ -0,0 +1,101 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from typing import Any + +from pydantic import Field, field_validator + +from autoresearch.shared.models import StrictModel + +_ALLOWED_FILENAME_TOKENS = {"{title}", "{id}", "{uploader}", "{upload_date}"} + + +class MediaJobMode(str, Enum): + AUDIO = "audio" + VIDEO = "video" + SUBTITLE = "subtitle" + METADATA = "metadata" + + +class MediaJobPostprocess(str, Enum): + NONE = "none" + MP3 = "mp3" + MP4 = "mp4" + + +class MediaTargetBucket(str, Enum): + INBOX = "inbox" + AUDIO = "audio" + VIDEO = "video" + SUBTITLES = "subtitles" + META = "meta" + + +class MediaJobStatus(str, Enum): + CREATED = "created" + QUEUED = "queued" + RUNNING = "running" + COMPLETED = "completed" + FAILED = "failed" + + +class MediaJobRequest(StrictModel): + url: str = Field(..., min_length=1) + mode: MediaJobMode + target_bucket: MediaTargetBucket + filename_template: str = "{title}-{id}" + postprocess: MediaJobPostprocess = MediaJobPostprocess.NONE + metadata: dict[str, 
Any] = Field(default_factory=dict) + + @field_validator("url") + @classmethod + def _normalize_url(cls, value: str) -> str: + normalized = value.strip() + if not normalized.startswith(("http://", "https://")): + raise ValueError("media url must be http or https") + return normalized + + @field_validator("filename_template") + @classmethod + def _validate_template(cls, value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("filename_template is required") + parts = [part for part in normalized.split("-") if part] + if not parts: + raise ValueError("filename_template must include at least one token") + for part in parts: + if part not in _ALLOWED_FILENAME_TOKENS: + raise ValueError(f"unsupported filename token: {part}") + return "-".join(parts) + + +class MediaJobRead(StrictModel): + job_id: str + url: str + mode: MediaJobMode + target_bucket: MediaTargetBucket + filename_template: str + postprocess: MediaJobPostprocess + status: MediaJobStatus = MediaJobStatus.CREATED + output_files: list[str] = Field(default_factory=list) + title: str | None = None + duration_seconds: int | None = None + uploader: str | None = None + subtitle_path: str | None = None + metadata_path: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + created_at: datetime + updated_at: datetime + error: str | None = None + + +class MediaJobEventRead(StrictModel): + event_id: str + job_id: str + stage: str + status: str + detail: str = "" + created_at: datetime + metadata: dict[str, Any] = Field(default_factory=dict) diff --git a/src/autoresearch/shared/models.py b/src/autoresearch/shared/models.py index 3984e0d0..0f805c5f 100644 --- a/src/autoresearch/shared/models.py +++ b/src/autoresearch/shared/models.py @@ -893,8 +893,8 @@ class PanelMagicLinkRead(StrictModel): class PanelAuditLogRead(StrictModel): audit_id: str telegram_uid: str - action: Literal["cancel", "retry", "approve", "reject"] - target_type: Literal["agent_run", 
"approval_request"] = "agent_run" + action: Literal["cancel", "retry", "approve", "reject", "dispatch"] + target_type: Literal["agent_run", "approval_request", "autoresearch_plan"] = "agent_run" target_id: str status: Literal["accepted", "rejected", "failed"] = "accepted" reason: str | None = None @@ -965,6 +965,63 @@ class AdminManagedSkillPromotionExecuteRequest(StrictModel): metadata: dict[str, Any] = Field(default_factory=dict) +class AdminAgentAuditRole(str, Enum): + MANAGER = "manager" + PLANNER = "planner" + WORKER = "worker" + + +class AdminAgentAuditTrailEntryRead(StrictModel): + entry_id: str + source: Literal["manager_task", "autoresearch_plan", "claude_agent", "runtime_artifact"] + agent_role: AdminAgentAuditRole = AdminAgentAuditRole.WORKER + run_id: str + agent_id: str | None = None + title: str + status: str + final_status: str | None = None + recorded_at: datetime + duration_ms: int | None = None + first_progress_ms: int | None = None + first_scoped_write_ms: int | None = None + first_state_heartbeat_ms: int | None = None + files_changed: int = 0 + changed_paths: list[str] = Field(default_factory=list) + scope_paths: list[str] = Field(default_factory=list) + patch_uri: str | None = None + isolated_workspace: str | None = None + summary: str = "" + metadata: dict[str, Any] = Field(default_factory=dict) + + +class AdminAgentAuditTrailStatsRead(StrictModel): + total: int = 0 + succeeded: int = 0 + failed: int = 0 + running: int = 0 + queued: int = 0 + review_required: int = 0 + + +class AdminAgentAuditTrailSnapshotRead(StrictModel): + items: list[AdminAgentAuditTrailEntryRead] = Field(default_factory=list) + stats: AdminAgentAuditTrailStatsRead = Field(default_factory=AdminAgentAuditTrailStatsRead) + issued_at: datetime + + +class AdminAgentAuditTrailDetailRead(StrictModel): + entry: AdminAgentAuditTrailEntryRead + input_prompt: str | None = None + job_spec: dict[str, Any] = Field(default_factory=dict) + worker_spec: dict[str, Any] = 
Field(default_factory=dict) + controlled_request: dict[str, Any] = Field(default_factory=dict) + patch_text: str = "" + patch_truncated: bool = False + error_reason: str | None = None + traceback: str | None = None + raw_record: dict[str, Any] = Field(default_factory=dict) + + class PanelStateRead(StrictModel): telegram_uid: str sessions: list[OpenClawSessionRead] = Field(default_factory=list) @@ -972,6 +1029,7 @@ class PanelStateRead(StrictModel): audit_logs: list[PanelAuditLogRead] = Field(default_factory=list) capability_providers: list[CapabilityProviderSummaryRead] = Field(default_factory=list) pending_approvals: list[ApprovalRequestRead] = Field(default_factory=list) + pending_autoresearch_plans: list[dict[str, Any]] = Field(default_factory=list) issued_at: datetime diff --git a/src/autoresearch/shared/remote_run_contract.py b/src/autoresearch/shared/remote_run_contract.py new file mode 100644 index 00000000..f15c5705 --- /dev/null +++ b/src/autoresearch/shared/remote_run_contract.py @@ -0,0 +1,171 @@ +from __future__ import annotations + +from datetime import datetime +from enum import Enum +from pathlib import PurePosixPath +from typing import Any, Literal + +from pydantic import Field, field_validator + +from autoresearch.agent_protocol.models import JobSpec, RunSummary +from autoresearch.shared.models import StrictModel, utc_now + + +def _normalize_non_empty_text(value: str) -> str: + normalized = value.strip() + if not normalized: + raise ValueError("value must not be empty") + return normalized + + +def _normalize_relative_artifact_path(value: str) -> str: + normalized = value.strip().replace("\\", "/") + if not normalized: + raise ValueError("artifact path must not be empty") + candidate = PurePosixPath(normalized) + if candidate.is_absolute(): + raise ValueError("artifact paths must be repo-relative or runtime-relative") + parts = candidate.parts + if any(part == ".." 
for part in parts): + raise ValueError("artifact paths must stay inside the repo/runtime root") + return normalized + + +def _normalize_artifact_paths(value: dict[str, str] | None) -> dict[str, str]: + if not value: + return {} + normalized: dict[str, str] = {} + for raw_key, raw_path in value.items(): + key = _normalize_non_empty_text(str(raw_key)) + normalized[key] = _normalize_relative_artifact_path(str(raw_path)) + return normalized + + +class DispatchLane(str, Enum): + LOCAL = "local" + REMOTE = "remote" + + +class RemoteRunStatus(str, Enum): + QUEUED = "queued" + RUNNING = "running" + SUCCEEDED = "succeeded" + FAILED = "failed" + STALLED = "stalled" + TIMED_OUT = "timed_out" + + +class FailureClass(str, Enum): + PLANNER_STALLED = "planner_stalled" + EXECUTOR_STALLED = "executor_stalled" + TOOL_TIMEOUT = "tool_timeout" + MODEL_FALLBACK = "model_fallback" + ASSERTION_FAILED_AFTER_FALLBACK = "assertion_failed_after_fallback" + ENV_MISSING = "env_missing" + WORKSPACE_DIRTY = "workspace_dirty" + TRANSIENT_NETWORK = "transient_network" + UNKNOWN = "unknown" + + +class RecoveryAction(str, Enum): + RETRY = "retry" + ABORT = "abort" + REQUIRE_HUMAN_REVIEW = "require_human_review" + DOWNGRADE_TO_DRAFT = "downgrade_to_draft" + QUARANTINE = "quarantine" + + +class RemoteTaskSpec(StrictModel): + protocol_version: Literal["remote-run/v1"] = "remote-run/v1" + run_id: str = Field(..., min_length=1) + requested_lane: DispatchLane = DispatchLane.LOCAL + lane: DispatchLane = DispatchLane.LOCAL + runtime_mode: str = Field(default="day", min_length=1) + planner_plan_id: str | None = None + planner_candidate_id: str | None = None + job: JobSpec + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("run_id", "runtime_mode") + @classmethod + def _normalize_required_text(cls, value: str) -> str: + return _normalize_non_empty_text(value) + + @field_validator("planner_plan_id", "planner_candidate_id") + @classmethod + def _normalize_optional_text(cls, value: str 
| None) -> str | None: + if value is None: + return None + normalized = value.strip() + return normalized or None + + +class RemoteRunRecord(StrictModel): + protocol_version: Literal["remote-run/v1"] = "remote-run/v1" + run_id: str = Field(..., min_length=1) + requested_lane: DispatchLane = DispatchLane.LOCAL + lane: DispatchLane = DispatchLane.LOCAL + status: RemoteRunStatus = RemoteRunStatus.QUEUED + failure_class: FailureClass | None = None + recovery_action: RecoveryAction | None = None + artifact_paths: dict[str, str] = Field(default_factory=dict) + summary: str = "" + started_at: datetime | None = None + updated_at: datetime = Field(default_factory=utc_now) + finished_at: datetime | None = None + fallback_reason: str | None = None + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("run_id") + @classmethod + def _normalize_run_id(cls, value: str) -> str: + return _normalize_non_empty_text(value) + + @field_validator("summary") + @classmethod + def _normalize_summary(cls, value: str) -> str: + return value.strip() + + @field_validator("fallback_reason") + @classmethod + def _normalize_fallback_reason(cls, value: str | None) -> str | None: + if value is None: + return None + normalized = value.strip() + return normalized or None + + @field_validator("artifact_paths", mode="before") + @classmethod + def _validate_artifact_paths(cls, value: dict[str, str] | None) -> dict[str, str]: + return _normalize_artifact_paths(value) + + +class RemoteHeartbeat(StrictModel): + protocol_version: Literal["remote-run/v1"] = "remote-run/v1" + run_id: str = Field(..., min_length=1) + lane: DispatchLane = DispatchLane.LOCAL + status: RemoteRunStatus = RemoteRunStatus.RUNNING + sequence: int = Field(default=1, ge=1) + summary: str = "" + recorded_at: datetime = Field(default_factory=utc_now) + artifact_paths: dict[str, str] = Field(default_factory=dict) + metadata: dict[str, Any] = Field(default_factory=dict) + + @field_validator("run_id") + @classmethod + 
def _normalize_heartbeat_run_id(cls, value: str) -> str: + return _normalize_non_empty_text(value) + + @field_validator("summary") + @classmethod + def _normalize_heartbeat_summary(cls, value: str) -> str: + return value.strip() + + @field_validator("artifact_paths", mode="before") + @classmethod + def _validate_heartbeat_artifact_paths(cls, value: dict[str, str] | None) -> dict[str, str]: + return _normalize_artifact_paths(value) + + +class RemoteRunSummary(RemoteRunRecord): + run_summary: RunSummary | None = None diff --git a/src/gateway/telegram_webhook.py b/src/gateway/telegram_webhook.py index c1703ae1..a77e8c8c 100644 --- a/src/gateway/telegram_webhook.py +++ b/src/gateway/telegram_webhook.py @@ -1,5 +1,9 @@ """ -Telegram Webhook Handler - 指令拦截与工作流触发 +Legacy Telegram webhook compatibility handler. + +This module exists only for backward compatibility with the old +workflow-driven `/telegram/webhook` path. New Telegram integration work +must target `autoresearch.api.routers.gateway_telegram`. """ import asyncio @@ -20,7 +24,12 @@ @router.post("/telegram/webhook") async def telegram_webhook(request: Request): - """Telegram Webhook 处理器""" + """Legacy Telegram webhook handler. + + Deprecated: keep this path stable for existing callers, but do not + extend it with new product behavior. The mainline Telegram entrypoint + lives under `/api/v1/gateway/telegram/webhook`. 
+ """ try: data = await request.json() @@ -28,8 +37,10 @@ async def telegram_webhook(request: Request): message = data.get("message", {}) chat_id = message.get("chat", {}).get("id") text = message.get("text", "") - user_id = message.get("from", {}).get("id") + logger.warning( + "[Legacy Telegram Webhook] compatibility path hit; prefer /api/v1/gateway/telegram/webhook" + ) logger.info(f"[Webhook] 收到消息: {text[:50]}...") # 指令拦截:GitHub 深度审查 diff --git a/tests/remote_dispatch/test_fake_remote_adapter.py b/tests/remote_dispatch/test_fake_remote_adapter.py new file mode 100644 index 00000000..88f9e9e0 --- /dev/null +++ b/tests/remote_dispatch/test_fake_remote_adapter.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +from pathlib import Path + +import pytest + +from autoresearch.agent_protocol.models import DriverResult, JobSpec, RunSummary, ValidationReport +from autoresearch.core.dispatch.fake_remote_adapter import FakeRemoteAdapter +from autoresearch.shared.remote_run_contract import DispatchLane, FailureClass, RemoteRunStatus, RemoteTaskSpec + + +def _success_summary(job: JobSpec) -> RunSummary: + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="local runner succeeded", + changed_paths=["src/demo.py"], + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri="artifacts/promotion.patch", + ) + + +def _task_spec( + *, + run_id: str, + lane: DispatchLane, + requested_lane: DispatchLane | None = None, + scenario: str | None = None, +) -> RemoteTaskSpec: + metadata: dict[str, object] = {} + if scenario is not None: + metadata["remote_scenario"] = scenario + return RemoteTaskSpec( + run_id=run_id, + requested_lane=requested_lane or lane, + lane=lane, + runtime_mode="night", + planner_plan_id="plan_test", + planner_candidate_id="candidate_test", + 
job=JobSpec(run_id=run_id, agent_id="openhands", task="demo"), + metadata=metadata, + ) + + +def test_fake_remote_adapter_success_flow(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = _task_spec(run_id="run-success", lane=DispatchLane.REMOTE, scenario="success") + + queued = adapter.dispatch(spec) + running = adapter.poll(spec.run_id) + heartbeat = adapter.heartbeat(spec.run_id) + terminal = adapter.poll(spec.run_id) + summary = adapter.fetch_summary(spec.run_id) + + assert queued.status is RemoteRunStatus.QUEUED + assert running.status is RemoteRunStatus.RUNNING + assert heartbeat is not None + assert terminal.status is RemoteRunStatus.SUCCEEDED + assert summary.status is RemoteRunStatus.SUCCEEDED + assert summary.run_summary is None + assert all(not artifact_path.startswith("/") for artifact_path in summary.artifact_paths.values()) + assert ( + repo_root + / ".masfactory_runtime" + / "runs" + / spec.run_id + / "remote_control" + / "summary.json" + ).exists() + + +def test_fake_remote_adapter_stalled_flow_uses_missing_heartbeat_signal(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = _task_spec(run_id="run-stalled", lane=DispatchLane.REMOTE, scenario="stalled") + + adapter.dispatch(spec) + running = adapter.poll(spec.run_id) + stalled = adapter.poll(spec.run_id) + summary = adapter.fetch_summary(spec.run_id) + + assert running.status is RemoteRunStatus.RUNNING + assert adapter.heartbeat(spec.run_id) is None + assert stalled.status is RemoteRunStatus.STALLED + assert summary.failure_class is FailureClass.EXECUTOR_STALLED + + +def test_fake_remote_adapter_timeout_flow(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = 
_task_spec(run_id="run-timeout", lane=DispatchLane.REMOTE, scenario="timed_out") + + adapter.dispatch(spec) + adapter.poll(spec.run_id) + timed_out = adapter.poll(spec.run_id) + summary = adapter.fetch_summary(spec.run_id) + + assert timed_out.status is RemoteRunStatus.TIMED_OUT + assert summary.failure_class is FailureClass.TOOL_TIMEOUT + + +@pytest.mark.parametrize( + ("scenario", "failure_class"), + [ + ("env_missing", FailureClass.ENV_MISSING), + ("transient_network", FailureClass.TRANSIENT_NETWORK), + ], +) +def test_fake_remote_adapter_failure_mapping( + tmp_path: Path, + scenario: str, + failure_class: FailureClass, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = _task_spec(run_id=f"run-{scenario}", lane=DispatchLane.REMOTE, scenario=scenario) + + adapter.dispatch(spec) + failed = adapter.poll(spec.run_id) + summary = adapter.fetch_summary(spec.run_id) + + assert failed.status is RemoteRunStatus.FAILED + assert summary.failure_class is failure_class + + +def test_fake_remote_adapter_records_remote_to_local_fallback(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = _task_spec( + run_id="run-fallback", + lane=DispatchLane.LOCAL, + requested_lane=DispatchLane.REMOTE, + ) + + queued = adapter.dispatch(spec) + adapter.poll(spec.run_id) + terminal = adapter.poll(spec.run_id) + summary = adapter.fetch_summary(spec.run_id) + + assert queued.requested_lane is DispatchLane.REMOTE + assert queued.lane is DispatchLane.LOCAL + assert queued.fallback_reason is not None + assert terminal.status is RemoteRunStatus.SUCCEEDED + assert summary.run_summary is not None + assert summary.run_summary.final_status == "ready_for_promotion" + + +def test_fake_remote_adapter_result_fetch_failure_raises(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + 
repo_root.mkdir() + adapter = FakeRemoteAdapter(repo_root=repo_root, local_runner=_success_summary) + spec = _task_spec( + run_id="run-fetch-failure", + lane=DispatchLane.REMOTE, + scenario="result_fetch_failure", + ) + + adapter.dispatch(spec) + adapter.poll(spec.run_id) + terminal = adapter.poll(spec.run_id) + + assert terminal.status is RemoteRunStatus.SUCCEEDED + with pytest.raises(FileNotFoundError): + adapter.fetch_summary(spec.run_id) + + +def test_fake_remote_adapter_rejects_runtime_root_outside_repo_root(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + runtime_root = tmp_path / "external-runs" + + with pytest.raises(ValueError, match="runtime_root must live under repo_root"): + FakeRemoteAdapter( + repo_root=repo_root, + local_runner=_success_summary, + runtime_root=runtime_root, + ) diff --git a/tests/test_admin_backend.py b/tests/test_admin_backend.py index 866cc49a..0b4420fb 100644 --- a/tests/test_admin_backend.py +++ b/tests/test_admin_backend.py @@ -1,5 +1,6 @@ from __future__ import annotations +import json import sys import time from pathlib import Path @@ -7,12 +8,16 @@ import pytest from fastapi.testclient import TestClient +from autoresearch.agent_protocol.models import DriverMetrics, DriverResult, JobSpec, RunSummary, ValidationReport from autoresearch.api.dependencies import ( get_admin_auth_service, get_admin_config_service, + get_agent_audit_trail_service, get_approval_store_service, + get_autoresearch_planner_service, get_capability_provider_registry, get_claude_agent_service, + get_manager_agent_service, get_openclaw_compat_service, ) from autoresearch.api.main import app @@ -21,9 +26,14 @@ from autoresearch.core.services.admin_auth import AdminAuthService from autoresearch.core.services.admin_config import AdminConfigService from autoresearch.core.services.admin_secrets import AdminSecretCipher +from autoresearch.core.services.agent_audit_trail import AgentAuditTrailService from 
autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService from autoresearch.core.services.claude_agents import ClaudeAgentService +from autoresearch.agents.manager_agent import ManagerAgentService from autoresearch.core.services.openclaw_compat import OpenClawCompatService +from autoresearch.shared.autoresearch_planner_contract import AutoResearchPlannerRequest +from autoresearch.shared.manager_agent_contract import ManagerDispatchRequest from autoresearch.shared.models import ( AdminAgentConfigRead, AdminChannelConfigRead, @@ -31,10 +41,11 @@ AdminSecretRecordRead, ApprovalRequestCreateRequest, ApprovalRequestRead, + ClaudeAgentCreateRequest, ClaudeAgentRunRead, OpenClawSessionRead, ) -from autoresearch.shared.store import SQLiteModelRepository +from autoresearch.shared.store import InMemoryRepository, SQLiteModelRepository class _StubCapabilityProvider: @@ -112,6 +123,50 @@ def query_events(self, query): return {} +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _successful_run_summary(job: JobSpec) -> RunSummary: + patch_path = Path("/tmp") / f"{job.run_id}.patch" + patch_path.write_text( + "\n".join( + [ + "diff --git a/src/demo.py b/src/demo.py", + "--- a/src/demo.py", + "+++ b/src/demo.py", + "@@ -1 +1 @@", + "+VALUE = 'READY_FOR_PROMOTION'", + "", + ] + ), + encoding="utf-8", + ) + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="admin audit flow completed successfully", + changed_paths=list(job.policy.allowed_paths), + metrics=DriverMetrics( + duration_ms=1800, + steps=3, + commands=2, + first_progress_ms=300, + first_scoped_write_ms=900, + first_state_heartbeat_ms=300, + ), + recommended_action="promote", + ), + 
validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri=str(patch_path), + ) + + @pytest.fixture def admin_client(tmp_path: Path) -> TestClient: db_path = tmp_path / "admin.sqlite3" @@ -133,6 +188,22 @@ def admin_client(tmp_path: Path) -> TestClient: max_agents=10, max_depth=3, ) + planner_service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=tmp_path, + dispatch_runner=_successful_run_summary, + ) + manager_service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=tmp_path, + dispatch_runner=_successful_run_summary, + ) + audit_trail_service = AgentAuditTrailService( + repo_root=tmp_path, + planner_service=planner_service, + manager_service=manager_service, + agent_service=claude_service, + ) admin_service = AdminConfigService( agent_repository=SQLiteModelRepository( db_path=db_path, @@ -174,6 +245,9 @@ def admin_client(tmp_path: Path) -> TestClient: app.dependency_overrides[get_openclaw_compat_service] = lambda: openclaw_service app.dependency_overrides[get_claude_agent_service] = lambda: claude_service + app.dependency_overrides[get_autoresearch_planner_service] = lambda: planner_service + app.dependency_overrides[get_manager_agent_service] = lambda: manager_service + app.dependency_overrides[get_agent_audit_trail_service] = lambda: audit_trail_service app.dependency_overrides[get_admin_config_service] = lambda: admin_service app.dependency_overrides[get_admin_auth_service] = lambda: auth_service app.dependency_overrides[get_approval_store_service] = lambda: approval_store @@ -190,6 +264,10 @@ def admin_client(tmp_path: Path) -> TestClient: client.headers.update({"authorization": f"Bearer {token}"}) setattr(client, "_admin_service", admin_service) setattr(client, "_approval_store", approval_store) + setattr(client, "_planner_service", planner_service) + setattr(client, "_manager_service", manager_service) + setattr(client, "_claude_service", claude_service) + setattr(client, 
"_repo_root", tmp_path) yield client app.dependency_overrides.clear() @@ -214,6 +292,8 @@ def test_admin_requires_bearer_token(admin_client: TestClient) -> None: assert denied.status_code == 401 denied_capabilities = admin_client.get("/api/v1/admin/capabilities") assert denied_capabilities.status_code == 401 + denied_audit = admin_client.get("/api/v1/admin/audit-trail") + assert denied_audit.status_code == 401 if existing is not None: admin_client.headers["authorization"] = existing @@ -223,8 +303,11 @@ def test_admin_view_contains_capability_inventory_section(admin_client: TestClie assert response.status_code == 200 assert "Capability Inventory" in response.text + assert "Agent Audit Trail" in response.text assert "Approval Queue" in response.text assert "Managed Skill Queue" in response.text + assert "/api/v1/admin/audit-trail" in response.text + assert "loadAuditDetail" in response.text assert "/api/v1/admin/capabilities" in response.text assert "/api/v1/admin/approvals" in response.text assert "/api/v1/admin/skills/status" in response.text @@ -250,6 +333,216 @@ def test_admin_capability_snapshot_lists_provider_inventory(admin_client: TestCl assert skill_provider["skills"][0]["skill_key"] == "daily_brief" +def test_admin_audit_trail_lists_recent_worker_activity(admin_client: TestClient) -> None: + repo_root: Path = getattr(admin_client, "_repo_root") + planner_service: AutoResearchPlannerService = getattr(admin_client, "_planner_service") + manager_service: ManagerAgentService = getattr(admin_client, "_manager_service") + claude_service: ClaudeAgentService = getattr(admin_client, "_claude_service") + + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "audit_target.py", + "def collect_audit_events() -> bool:\n # FIXME: add audit timeline regression coverage\n return True\n", + ) + _write(repo_root / "tests" / "test_audit_target.py", "def test_collect_audit_events():\n assert True\n") + _write(repo_root / "panel" / "app.tsx", "export const App 
= () => null;\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "admin.py", "router = object()\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "panel.py", "router = object()\n") + _write(repo_root / "tests" / "test_panel_security.py", "def test_panel_ok():\n assert True\n") + _write(repo_root / "tests" / "test_admin_managed_skills.py", "def test_admin_ok():\n assert True\n") + + plan = planner_service.create( + AutoResearchPlannerRequest( + goal="Find the next audit trail hardening task.", + metadata={"runtime_mode": "night", "remote_available": False}, + ) + ) + planner_service.request_dispatch(plan.plan_id, requested_by="admin-ui") + planner_service.execute_dispatch(plan.plan_id) + + dispatch = manager_service.create_dispatch( + ManagerDispatchRequest( + prompt="在 Admin Panel 里加一个带图表的智能体行为审计大屏。", + auto_dispatch=False, + ) + ) + manager_service.execute_dispatch(dispatch.dispatch_id) + + claude_service.create( + ClaudeAgentCreateRequest( + task_name="audit reviewer", + prompt="Inspect the latest worker execution traces.", + command_override=[sys.executable, "-c", "print('audit-review')"], + append_prompt=False, + metadata={ + "allowed_paths": ["src/autoresearch/api/routers/admin.py"], + "changed_paths": ["src/autoresearch/api/routers/admin.py"], + }, + ) + ) + + runtime_summary_path = ( + repo_root / ".masfactory_runtime" / "smokes" / "audit-smoke" / "artifacts" / "chain_summary.json" + ) + runtime_summary_path.parent.mkdir(parents=True, exist_ok=True) + runtime_patch_path = runtime_summary_path.parent / "promotion.patch" + runtime_patch_path.write_text( + "\n".join( + [ + "diff --git a/src/autoresearch/api/routers/admin.py b/src/autoresearch/api/routers/admin.py", + "--- a/src/autoresearch/api/routers/admin.py", + "+++ b/src/autoresearch/api/routers/admin.py", + "@@ -1 +1 @@", + "+AUDIT_TRAIL = True", + "", + ] + ), + encoding="utf-8", + ) + runtime_summary_path.write_text( + json.dumps( + { + "run_id": 
"runtime-audit-001", + "status": "ready_for_promotion", + "task": "Rebuild audit trail timeline", + "isolated_workspace": "/tmp/audit-runtime", + "driver_result": { + "agent_id": "openhands", + "summary": "runtime artifact captured successfully", + "changed_paths": ["src/autoresearch/api/routers/admin.py"], + "metrics": { + "duration_ms": 2400, + "first_progress_ms": 700, + "first_scoped_write_ms": 1100, + "first_state_heartbeat_ms": 700, + }, + }, + "promotion": { + "changed_files": ["src/autoresearch/api/routers/admin.py"], + "diff_stats": {"files_changed": 1}, + "patch_uri": str(runtime_patch_path), + }, + } + ), + encoding="utf-8", + ) + + failed_runtime_path = ( + repo_root / "logs" / "audit" / "openhands" / "jobs" / "audit-failed-001" / "chain_summary.json" + ) + failed_runtime_path.parent.mkdir(parents=True, exist_ok=True) + failed_patch_path = failed_runtime_path.parent / "promotion.patch" + failed_patch_path.write_text( + "\n".join( + [ + "diff --git a/src/autoresearch/core/services/agent_audit_trail.py b/src/autoresearch/core/services/agent_audit_trail.py", + "--- a/src/autoresearch/core/services/agent_audit_trail.py", + "+++ b/src/autoresearch/core/services/agent_audit_trail.py", + "@@ -1 +1 @@", + "+BROKEN = True", + "", + ] + ), + encoding="utf-8", + ) + failed_runtime_path.write_text( + json.dumps( + { + "run_id": "runtime-audit-failed-001", + "status": "failed", + "task": "Patch audit trail filters", + "error": "validation command failed", + "traceback": "Traceback (most recent call last):\\nValueError: missing token", + "artifacts": {"promotion_patch": str(failed_patch_path)}, + "driver_result": { + "agent_id": "openhands", + "summary": "worker failed during validation", + "changed_paths": ["src/autoresearch/core/services/agent_audit_trail.py"], + "metrics": { + "duration_ms": 3200, + "first_progress_ms": 600, + "first_scoped_write_ms": 1600, + "first_state_heartbeat_ms": 600, + }, + "error": "pytest exited with code 1", + }, + } + ), + 
encoding="utf-8", + ) + + response = admin_client.get("/api/v1/admin/audit-trail?limit=20") + + assert response.status_code == 200 + payload = response.json() + assert payload["stats"]["total"] >= 5 + assert payload["stats"]["queued"] >= 1 + assert payload["stats"]["succeeded"] >= 2 + assert payload["stats"]["failed"] >= 1 + sources = {item["source"] for item in payload["items"]} + assert {"manager_task", "autoresearch_plan", "claude_agent", "runtime_artifact"} <= sources + + planner_item = next(item for item in payload["items"] if item["source"] == "autoresearch_plan") + assert planner_item["final_status"] == "ready_for_promotion" + assert planner_item["status"] == "dispatched" + assert "tests/test_audit_target.py" in planner_item["scope_paths"] + assert planner_item["first_progress_ms"] == 300 + assert planner_item["first_scoped_write_ms"] == 900 + assert planner_item["first_state_heartbeat_ms"] == 300 + assert planner_item["metadata"]["dispatch_requested_lane"] == "remote" + assert planner_item["metadata"]["dispatch_lane"] == "local" + assert planner_item["metadata"]["dispatch_remote_status"] == "succeeded" + assert planner_item["metadata"]["dispatch_failure_class"] is None + assert planner_item["metadata"]["dispatch_recovery_action"] is None + assert planner_item["metadata"]["dispatch_fallback_reason"] is not None + + manager_backend = next( + item + for item in payload["items"] + if item["source"] == "manager_task" and item["metadata"]["stage"] == "backend" + ) + assert manager_backend["final_status"] == "ready_for_promotion" + assert "src/autoresearch/api/routers/admin.py" in manager_backend["scope_paths"] + + runtime_item = next( + item + for item in payload["items"] + if item["source"] == "runtime_artifact" and item["run_id"] == "runtime-audit-001" + ) + assert runtime_item["isolated_workspace"] == "/tmp/audit-runtime" + assert runtime_item["patch_uri"] == str(runtime_patch_path) + assert runtime_item["agent_role"] == "worker" + assert 
runtime_item["first_progress_ms"] == 700 + assert runtime_item["first_scoped_write_ms"] == 1100 + assert runtime_item["first_state_heartbeat_ms"] == 700 + + detail_response = admin_client.get(f"/api/v1/admin/audit-trail/{planner_item['entry_id']}") + assert detail_response.status_code == 200 + detail_payload = detail_response.json() + assert detail_payload["input_prompt"] == "Find the next audit trail hardening task." + assert detail_payload["job_spec"]["task"] != "" + assert "diff --git" in detail_payload["patch_text"] + assert detail_payload["patch_truncated"] is False + assert detail_payload["entry"]["first_progress_ms"] == 300 + assert detail_payload["raw_record"]["autoresearch_plan"]["dispatch_run"]["requested_lane"] == "remote" + assert detail_payload["raw_record"]["autoresearch_plan"]["dispatch_run"]["lane"] == "local" + assert detail_payload["raw_record"]["autoresearch_plan"]["dispatch_run"]["status"] == "succeeded" + assert detail_payload["raw_record"]["autoresearch_plan"]["run_summary"]["final_status"] == "ready_for_promotion" + + failed_response = admin_client.get("/api/v1/admin/audit-trail?limit=20&status_filter=failed&agent_role=worker") + assert failed_response.status_code == 200 + failed_items = failed_response.json()["items"] + assert len(failed_items) == 1 + assert failed_items[0]["run_id"] == "runtime-audit-failed-001" + + failed_detail = admin_client.get(f"/api/v1/admin/audit-trail/{failed_items[0]['entry_id']}") + assert failed_detail.status_code == 200 + failed_detail_payload = failed_detail.json() + assert failed_detail_payload["error_reason"] == "validation command failed" + assert "Traceback" in (failed_detail_payload["traceback"] or "") + assert "diff --git" in failed_detail_payload["patch_text"] + assert failed_detail_payload["entry"]["first_progress_ms"] == 600 + + def test_admin_approvals_list_and_resolve(admin_client: TestClient) -> None: approval_store = getattr(admin_client, "_approval_store") owned = approval_store.create_request( 
diff --git a/tests/test_admin_managed_skills.py b/tests/test_admin_managed_skills.py index d36c6ea9..f92fce16 100644 --- a/tests/test_admin_managed_skills.py +++ b/tests/test_admin_managed_skills.py @@ -34,8 +34,9 @@ class StubTelegramNotifier: - def __init__(self) -> None: + def __init__(self, *, send_results: list[bool] | None = None) -> None: self.messages: list[dict[str, object]] = [] + self._send_results = list(send_results or []) @property def enabled(self) -> bool: @@ -57,6 +58,8 @@ def send_message( "reply_markup": reply_markup, } ) + if self._send_results: + return self._send_results.pop(0) return True @@ -188,6 +191,7 @@ def admin_skill_client(tmp_path: Path) -> TestClient: telegram_bot_token="123456:TEST_BOT_TOKEN", telegram_init_data_max_age_seconds=900, base_url="https://panel.example/api/v1/panel/view", + mini_app_url="https://panel.example/api/v1/panel/view", allowed_uids={"10001"}, ) notifier = StubTelegramNotifier() @@ -283,6 +287,7 @@ def test_admin_managed_skill_promote_creates_approval_and_mini_app_link( assert "actionNonce=" in payload["mini_app_url"] assert "actionHash=" in payload["mini_app_url"] assert "actionIssuedAt=" in payload["mini_app_url"] + assert "token=" in payload["mini_app_url"] assert payload["notification_sent"] is True assert notifier.messages[0]["chat_id"] == "10001" reply_markup = notifier.messages[0]["reply_markup"] @@ -291,6 +296,79 @@ def test_admin_managed_skill_promote_creates_approval_and_mini_app_link( assert button["web_app"]["url"] == payload["mini_app_url"] +def test_admin_managed_skill_promote_falls_back_to_url_button_when_web_app_send_fails( + tmp_path: Path, +) -> None: + db_path = tmp_path / "admin-managed-skills-fallback.sqlite3" + auth_service = AdminAuthService( + secret="test-admin-jwt-secret", + bootstrap_key="bootstrap-test-key", + ) + approval_store = ApprovalStoreService( + repository=SQLiteModelRepository( + db_path=db_path, + table_name="approval_requests_admin_skill_fallback", + 
model_cls=ApprovalRequestRead, + ) + ) + private_key, public_key = _trusted_keys() + registry = ManagedSkillRegistryService( + repo_root=tmp_path, + repository=InMemoryRepository(), + quarantine_root=tmp_path / "artifacts" / "managed_skills" / "quarantine", + active_root=tmp_path / "artifacts" / "managed_skills" / "active", + trusted_signers={"test-signer": public_key}, + allowed_capabilities={"prompt", "filesystem_read"}, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + mini_app_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier(send_results=[False, True]) + + app.dependency_overrides[get_admin_auth_service] = lambda: auth_service + app.dependency_overrides[get_approval_store_service] = lambda: approval_store + app.dependency_overrides[get_managed_skill_registry_service] = lambda: registry + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + + with TestClient(app) as client: + token_response = client.post( + "/api/v1/admin/auth/token", + json={"subject": "test-owner", "roles": ["owner"], "ttl_seconds": 3600}, + headers={"x-admin-bootstrap-key": "bootstrap-test-key"}, + ) + assert token_response.status_code == 200 + client.headers.update({"authorization": f"Bearer {token_response.json()['token']}"}) + + bundle_dir = _build_bundle(tmp_path / "bundle-promote-fallback", private_key=private_key) + validated = registry.run_cold_validation( + registry.install_to_quarantine( + ManagedSkillInstallRequest(bundle_dir=str(bundle_dir), requested_by="owner") + ).install_id + ) + + response = client.post( + f"/api/v1/admin/skills/{validated.install_id}/promote", + json={"note": "retry with url button"}, + ) + + app.dependency_overrides.clear() + + assert 
response.status_code == 200 + payload = response.json() + assert payload["notification_sent"] is True + assert len(notifier.messages) == 2 + assert notifier.messages[0]["reply_markup"]["inline_keyboard"][0][0]["web_app"]["url"] == payload["mini_app_url"] + assert notifier.messages[1]["reply_markup"] == { + "inline_keyboard": [[{"text": "打开 Panel 审批", "url": payload["mini_app_url"]}]] + } + + def test_admin_managed_skill_promote_execute_requires_approved_telegram_flow( admin_skill_client: TestClient, tmp_path: Path, diff --git a/tests/test_agent_fallbacks.py b/tests/test_agent_fallbacks.py index dc75f7c8..897f818a 100644 --- a/tests/test_agent_fallbacks.py +++ b/tests/test_agent_fallbacks.py @@ -101,3 +101,90 @@ def test_retry_then_fallback_agent(tmp_path: Path) -> None: assert summary.final_status == "ready_for_promotion" assert summary.driver_result.agent_id == "secondary" + + +def test_stalled_retry_is_skipped_before_fallback_agent( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True) + (repo_root / "src" / "base.py").write_text("x = 1\n", encoding="utf-8") + + stall_adapter = repo_root / "drivers" / "stall_adapter.sh" + stall_adapter.parent.mkdir(parents=True, exist_ok=True) + stall_adapter.write_text( + """#!/usr/bin/env bash +set -euo pipefail +sleep 30 +""", + encoding="utf-8", + ) + stall_adapter.chmod(0o755) + + success_adapter = repo_root / "drivers" / "fallback_success.sh" + success_adapter.write_text( + """#!/usr/bin/env bash +set -euo pipefail +mkdir -p "$AEP_WORKSPACE/src" +echo 'print(9)' > "$AEP_WORKSPACE/src/fallback.py" +cat > "$AEP_RESULT_PATH" <<'JSON' +{ + "protocol_version": "aep/v0", + "run_id": "run-stall-fallback", + "agent_id": "secondary", + "attempt": 1, + "status": "succeeded", + "summary": "secondary succeeded", + "changed_paths": [], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 1, "commands": 1, "prompt_tokens": null, 
"completion_tokens": null}, + "recommended_action": "promote", + "error": null +} +JSON +exit 0 +""", + encoding="utf-8", + ) + success_adapter.chmod(0o755) + + _write_manifest(repo_root, "primary", "drivers/stall_adapter.sh") + _write_manifest(repo_root, "secondary", "drivers/fallback_success.sh") + + runtime_root = tmp_path / "runtime" + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=runtime_root, + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + summary = runner.run_job( + JobSpec( + run_id="run-stall-fallback", + agent_id="primary", + task="demo", + fallback=[ + FallbackStep(action="retry", max_attempts=1), + FallbackStep(action="fallback_agent", agent_id="secondary", max_attempts=1), + ], + ) + ) + + events_path = runtime_root / "run-stall-fallback" / "events.ndjson" + attempts = [ + json.loads(line) + for line in events_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + started_agents = [item["agent_id"] for item in attempts if item.get("type") == "attempt_started"] + + assert summary.final_status == "ready_for_promotion" + assert summary.driver_result.agent_id == "secondary" + assert started_agents == ["primary", "secondary"] + assert any( + item.get("type") == "fallback_skipped" and item.get("reason") == "stalled_no_progress" + for item in attempts + ) diff --git a/tests/test_agent_policy_merge.py b/tests/test_agent_policy_merge.py index 7aa5ef10..8cefb47b 100644 --- a/tests/test_agent_policy_merge.py +++ b/tests/test_agent_policy_merge.py @@ -2,6 +2,8 @@ from autoresearch.agent_protocol.models import ExecutionPolicy from autoresearch.agent_protocol.policy import build_effective_policy +import json +from pathlib import Path def test_policy_merge_deny_wins() -> None: @@ -34,7 +36,7 @@ def test_policy_merge_deny_wins() -> None: assert effective.allowed_paths == ["src/**"] assert "src/secrets/**" in effective.forbidden_paths 
assert effective.max_changed_files == 20 - assert effective.max_patch_lines == 500 + assert effective.max_patch_lines == 900 assert effective.allow_binary_changes is False @@ -45,3 +47,38 @@ def test_policy_merge_preserves_more_specific_file_scope() -> None: effective = build_effective_policy(manifest_policy, job_policy).merged assert effective.allowed_paths == ["src/generated_worker.py"] + + +def test_policy_merge_allows_script_targets_when_manifest_and_job_both_allow_them() -> None: + manifest_policy = ExecutionPolicy(allowed_paths=["src/**", "tests/**", "scripts/**"]) + job_policy = ExecutionPolicy( + allowed_paths=["scripts/check_prompt_hygiene.py", "tests/test_check_prompt_hygiene.py"] + ) + + effective = build_effective_policy(manifest_policy, job_policy).merged + + assert effective.allowed_paths == [ + "tests/test_check_prompt_hygiene.py", + "scripts/check_prompt_hygiene.py", + ] + + +def test_policy_merge_allows_isolated_apps_targets_when_job_requests_business_surface() -> None: + manifest_policy = ExecutionPolicy(allowed_paths=["src/**", "tests/**", "apps/**"]) + job_policy = ExecutionPolicy( + allowed_paths=["apps/malu/**", "tests/apps/test_malu_landing_page.py"] + ) + + effective = build_effective_policy(manifest_policy, job_policy).merged + + assert effective.allowed_paths == [ + "tests/apps/test_malu_landing_page.py", + "apps/malu/**", + ] + + +def test_openhands_manifest_default_policy_includes_apps_surface() -> None: + manifest_path = Path(__file__).resolve().parents[1] / "configs" / "agents" / "openhands.yaml" + payload = json.loads(manifest_path.read_text(encoding="utf-8")) + + assert "apps/**" in payload["policy_defaults"]["allowed_paths"] diff --git a/tests/test_agent_runner_outcomes.py b/tests/test_agent_runner_outcomes.py index b4ffaf38..e9e0fa30 100644 --- a/tests/test_agent_runner_outcomes.py +++ b/tests/test_agent_runner_outcomes.py @@ -2,8 +2,16 @@ import json from pathlib import Path +import sys -from autoresearch.agent_protocol.models 
import JobSpec +from autoresearch.agent_protocol.models import ( + DriverResult, + ExecutionPolicy, + FallbackStep, + JobSpec, + ValidatorSpec, +) +from autoresearch.agent_protocol.policy import build_effective_policy from autoresearch.executions.runner import AgentExecutionRunner @@ -63,6 +71,52 @@ def test_failed_driver_is_terminal_failure_not_human_review(tmp_path: Path) -> N assert checks["builtin.driver_success"].passed is False +def test_runner_persists_summary_when_attempt_crashes_unexpectedly( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True) + (repo_root / "src" / "base.py").write_text("x = 1\n", encoding="utf-8") + + adapter = repo_root / "drivers" / "explosive_adapter.sh" + adapter.parent.mkdir(parents=True, exist_ok=True) + adapter.write_text("#!/usr/bin/env bash\nexit 0\n", encoding="utf-8") + adapter.chmod(0o755) + _write_manifest(repo_root, "explosive", "drivers/explosive_adapter.sh") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + def _boom(**_: object) -> DriverResult: + raise RuntimeError("synthetic invoke failure") + + monkeypatch.setattr(runner, "_invoke_adapter", _boom) + + summary = runner.run_job(JobSpec(run_id="run-explosive", agent_id="explosive", task="demo")) + + summary_path = tmp_path / "runtime" / "run-explosive" / "summary.json" + events_path = tmp_path / "runtime" / "run-explosive" / "events.ndjson" + + assert summary.final_status == "failed" + assert summary.driver_result.status == "contract_error" + assert "RuntimeError" in (summary.driver_result.error or "") + assert summary_path.exists() + payload = json.loads(summary_path.read_text(encoding="utf-8")) + assert payload["final_status"] == "failed" + assert payload["driver_result"]["status"] == "contract_error" + events = [ + json.loads(line) + for line in 
events_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + assert any(item.get("type") == "runner_exception" for item in events) + + def test_zero_change_success_is_blocked(tmp_path: Path) -> None: repo_root = tmp_path / "repo" repo_root.mkdir() @@ -105,3 +159,332 @@ def test_zero_change_success_is_blocked(tmp_path: Path) -> None: assert summary.final_status == "blocked" checks = {item.id: item for item in summary.validation.checks} assert checks["builtin.nonempty_change_for_promote"].passed is False + + +def test_openhands_environment_preflight_blocks_dirty_runtime_before_attempt( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True) + (repo_root / "src" / "base.py").write_text("x = 1\n", encoding="utf-8") + + adapter = repo_root / "drivers" / "openhands_adapter.sh" + adapter.parent.mkdir(parents=True, exist_ok=True) + adapter.write_text( + """#!/usr/bin/env bash +set -euo pipefail +echo "adapter should not run" > "$AEP_WORKSPACE/should_not_exist.txt" +cat > "$AEP_RESULT_PATH" <<'JSON' +{ + "protocol_version": "aep/v0", + "run_id": "run-preflight", + "agent_id": "openhands", + "attempt": 1, + "status": "succeeded", + "summary": "should not happen", + "changed_paths": ["should_not_exist.txt"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 0, "commands": 0, "prompt_tokens": null, "completion_tokens": null}, + "recommended_action": "promote", + "error": null +} +JSON +""", + encoding="utf-8", + ) + adapter.chmod(0o755) + _write_manifest(repo_root, "openhands", "drivers/openhands_adapter.sh") + + preflight = repo_root / "scripts" / "fake_preflight.sh" + preflight.parent.mkdir(parents=True, exist_ok=True) + preflight.write_text( + """#!/usr/bin/env bash +set -euo pipefail +echo "docker socket is stale for current user" >&2 +exit 1 +""", + encoding="utf-8", + ) + preflight.chmod(0o755) + + runner = AgentExecutionRunner( + repo_root=repo_root, + 
runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setenv("OPENHANDS_RUNTIME", "ai-lab") + monkeypatch.setenv("OPENHANDS_PREFLIGHT_CMD", f"bash {preflight}") + + summary = runner.run_job( + JobSpec( + run_id="run-preflight", + agent_id="openhands", + task="demo", + ) + ) + + events_path = tmp_path / "runtime" / "run-preflight" / "events.ndjson" + events = [ + json.loads(line) + for line in events_path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + assert summary.final_status == "failed" + assert summary.driver_result.status == "contract_error" + assert summary.driver_result.error is not None + assert summary.driver_result.error.startswith("EnvironmentCheckFailed:") + assert not any(item.get("type") == "attempt_started" for item in events) + assert any( + item.get("type") == "attempt_blocked" + and item.get("reason") == "environment_preflight_failed" + for item in events + ) + + +def test_openhands_ai_lab_env_strips_ambient_path_overrides(tmp_path: Path, monkeypatch) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "ai_lab.env").write_text( + "WORKSPACE_DIR=/Volumes/AI_LAB/ai_lab/workspace\n" + "LOG_DIR=/Volumes/AI_LAB/ai_lab/logs\n", + encoding="utf-8", + ) + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setenv("OPENHANDS_RUNTIME", "ai-lab") + monkeypatch.setenv("ENV_FILE", "/tmp/foreign.env") + monkeypatch.setenv("OPENHANDS_ENV_FILE", "/tmp/foreign-openhands.env") + monkeypatch.setenv("WORKSPACE_DIR", "/Users/ai_lab/workspace") + monkeypatch.setenv("LOG_DIR", "/Users/ai_lab/logs") + monkeypatch.setenv("OPENHANDS_HOME_DIR", "/Users/ai_lab/logs/openhands-home") + + env = runner._build_openhands_ai_lab_env() + + assert env["ENV_FILE"] == str(repo_root / "ai_lab.env") + assert env["OPENHANDS_ENV_FILE"] == str(repo_root / "ai_lab.env") + assert 
"WORKSPACE_DIR" not in env + assert "LOG_DIR" not in env + assert "OPENHANDS_HOME_DIR" not in env + + +def test_openhands_environment_preflight_uses_repo_managed_ai_lab_env( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "ai_lab.env").write_text( + "WORKSPACE_DIR=/Volumes/AI_LAB/ai_lab/workspace\n" + "LOG_DIR=/Volumes/AI_LAB/ai_lab/logs\n", + encoding="utf-8", + ) + snapshot_path = repo_root / "preflight-env.txt" + preflight = repo_root / "scripts" / "fake_preflight.sh" + preflight.parent.mkdir(parents=True, exist_ok=True) + preflight.write_text( + f"""#!/usr/bin/env bash +set -euo pipefail +cat > "{snapshot_path}" < None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True) + (repo_root / "src" / "retry_target.py").write_text("VALUE = 'seed'\\n", encoding="utf-8") + + adapter = repo_root / "drivers" / "retry_feedback_adapter.py" + adapter.parent.mkdir(parents=True, exist_ok=True) + adapter.write_text( + """#!/usr/bin/env python3 +import json +import os +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +result_path = Path(os.environ["AEP_RESULT_PATH"]) +job = json.loads(Path(os.environ["AEP_JOB_SPEC"]).read_text(encoding="utf-8")) +attempt = int(os.environ["AEP_ATTEMPT"]) +target = workspace / "src" / "retry_target.py" + +if attempt == 1: + target.write_text("raise TypeError('Invalid args for response field!')\\n", encoding="utf-8") +else: + task = job["task"] + if "Invalid args for response field!" 
not in task: + raise SystemExit("retry feedback missing raw validator detail") + target.write_text("VALUE = 'fixed'\\n", encoding="utf-8") + +payload = { + "protocol_version": "aep/v0", + "run_id": "run-retry-feedback", + "agent_id": "openhands", + "attempt": attempt, + "status": "succeeded", + "summary": f"attempt {attempt} completed", + "changed_paths": ["src/retry_target.py"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 0, "commands": 0, "prompt_tokens": None, "completion_tokens": None}, + "recommended_action": "promote", + "error": None, +} +result_path.write_text(json.dumps(payload), encoding="utf-8") +""", + encoding="utf-8", + ) + adapter.chmod(0o755) + _write_manifest(repo_root, "openhands", "drivers/retry_feedback_adapter.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + summary = runner.run_job( + JobSpec( + run_id="run-retry-feedback", + agent_id="openhands", + task="Fix src/retry_target.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} src/retry_target.py", + ) + ], + fallback=[FallbackStep(action="retry", max_attempts=1)], + policy=ExecutionPolicy( + allowed_paths=["src/retry_target.py"], + cleanup_on_success=False, + ), + metadata={"pipeline_target": "patch"}, + ) + ) + + job_payload = json.loads( + (tmp_path / "runtime" / "run-retry-feedback" / "job.json").read_text(encoding="utf-8") + ) + + assert summary.final_status == "ready_for_promotion" + assert summary.driver_result.attempt == 2 + assert "Invalid args for response field!" 
in job_payload["task"] diff --git a/tests/test_autoresearch_planner.py b/tests/test_autoresearch_planner.py new file mode 100644 index 00000000..c861369a --- /dev/null +++ b/tests/test_autoresearch_planner.py @@ -0,0 +1,564 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess +from urllib.parse import parse_qs, urlparse + +import pytest +from fastapi.testclient import TestClient + +from autoresearch.agent_protocol.models import DriverResult, JobSpec, RunSummary, ValidationReport +from autoresearch.api.dependencies import ( + get_autoresearch_planner_service, + get_panel_access_service, + get_telegram_notifier_service, +) +from autoresearch.api.main import app +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService +from autoresearch.core.services.panel_access import PanelAccessService +from autoresearch.shared.autoresearch_planner_contract import ( + AutoResearchPlanDispatchStatus, + AutoResearchPlannerRequest, + UpstreamWatchDecision, + UpstreamWatchRead, +) +from autoresearch.shared.models import JobStatus +from autoresearch.shared.store import InMemoryRepository + + +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +class StubTelegramNotifier: + def __init__(self, *, send_results: list[bool] | None = None) -> None: + self.messages: list[dict[str, object]] = [] + self._send_results = list(send_results or []) + + @property + def enabled(self) -> bool: + return True + + def send_message( + self, + *, + chat_id: str, + text: str, + disable_web_page_preview: bool = True, + reply_markup: dict[str, object] | None = None, + ) -> bool: + self.messages.append( + { + "chat_id": chat_id, + "text": text, + "disable_web_page_preview": disable_web_page_preview, + "reply_markup": reply_markup, + } + ) + if self._send_results: + return self._send_results.pop(0) + return True + + +class StubUpstreamWatcher: + 
def __init__(self, result: UpstreamWatchRead) -> None: + self._result = result + self.calls = 0 + + def inspect(self) -> UpstreamWatchRead: + self.calls += 1 + return self._result + + +def _successful_run_summary(job: JobSpec) -> RunSummary: + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="worker completed successfully", + changed_paths=list(job.policy.allowed_paths), + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri="/tmp/autoresearch.patch", + ) + + +def test_planner_selects_high_signal_marker_and_emits_worker_specs(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "demo_service.py", + "\n".join( + [ + "def handle() -> str:", + " # FIXME: normalize promotion preflight before returning", + " return 'ok'", + "", + ] + ), + ) + _write( + repo_root / "src" / "misc.py", + "\n".join( + [ + "def noop() -> None:", + " # TODO: clean this up later", + " return None", + "", + ] + ), + ) + + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + plan = service.create(AutoResearchPlannerRequest()) + + assert plan.status is JobStatus.COMPLETED + assert plan.selected_candidate is not None + assert plan.selected_candidate.source_path == "src/autoresearch/core/services/demo_service.py" + assert plan.worker_spec is not None + assert plan.controlled_request is not None + assert plan.agent_job is not None + assert plan.worker_spec.allowed_paths == [ + "src/autoresearch/core/services/demo_service.py", + "tests/test_demo_service.py", + ] + assert plan.worker_spec.test_command == "pytest -q tests/test_demo_service.py" + assert plan.controlled_request.backend.value == "openhands_cli" + assert plan.controlled_request.pipeline_target.value == "draft_pr" + 
assert plan.agent_job.mode == "patch_only" + assert plan.agent_job.metadata["planner_candidate_id"] == plan.selected_candidate.candidate_id + assert "FIXME" in plan.selected_candidate.title + + +def test_planner_falls_back_to_test_gap_when_repo_has_no_markers(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + large_body = "\n".join([f"def fn_{index}() -> int:\n return {index}\n" for index in range(60)]) + _write(repo_root / "src" / "autoresearch" / "core" / "services" / "large_module.py", large_body) + + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + plan = service.create(AutoResearchPlannerRequest(max_candidates=3, pipeline_target="patch")) + + assert plan.status is JobStatus.COMPLETED + assert plan.selected_candidate is not None + assert plan.selected_candidate.category == "test_gap" + assert plan.selected_candidate.source_path == "src/autoresearch/core/services/large_module.py" + assert plan.worker_spec is not None + assert plan.worker_spec.pipeline_target == "patch" + assert plan.worker_spec.allowed_paths[-1] == "tests/test_large_module.py" + + +def test_planner_records_optional_upstream_watch_result(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir(parents=True) + watcher = StubUpstreamWatcher( + UpstreamWatchRead( + upstream_url="https://github.com/openclaw/openclaw.git", + decision=UpstreamWatchDecision.SKIP, + summary="Recent upstream changes remain in non-core areas (LINE, Zalo); auto-skipped.", + focus_areas=["extension:line", "extension:zalo"], + cleaned_up=True, + ) + ) + + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + upstream_watcher=watcher, + ) + + plan = service.create(AutoResearchPlannerRequest(include_upstream_watch=True)) + + assert watcher.calls == 1 + assert plan.upstream_watch is not None + assert plan.upstream_watch.decision is UpstreamWatchDecision.SKIP + assert "Upstream watcher auto-skipped merge 
noise" in plan.summary + + +@pytest.fixture +def autoresearch_plan_client(tmp_path: Path) -> TestClient: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "planner_target.py", + "\n".join( + [ + "def check() -> bool:", + " # FIXME: add strict regression coverage", + " return True", + "", + ] + ), + ) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + mini_app_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier() + + app.dependency_overrides[get_autoresearch_planner_service] = lambda: service + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + with TestClient(app) as client: + setattr(client, "_planner", service) + setattr(client, "_notifier", notifier) + yield client + app.dependency_overrides.clear() + + +def test_autoresearch_plan_api_round_trip(autoresearch_plan_client: TestClient) -> None: + notifier = getattr(autoresearch_plan_client, "_notifier") + response = autoresearch_plan_client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Find the next safe promotion candidate.", + "max_candidates": 2, + "pipeline_target": "patch", + }, + ) + + assert response.status_code == 202 + payload = response.json() + assert payload["status"] == "completed" + assert payload["selected_candidate"]["source_path"] == "src/autoresearch/core/services/planner_target.py" + assert payload["worker_spec"]["pipeline_target"] == "patch" + assert payload["telegram_uid"] == "10001" + assert payload["notification_sent"] is True + parsed = urlparse(payload["panel_action_url"]) + assert parsed.netloc == "panel.example" + 
query = parse_qs(parsed.query) + assert query["planId"] == [payload["plan_id"]] + assert "token" in query + assert notifier.messages[0]["chat_id"] == "10001" + assert "AutoResearch 发现新优化点" in str(notifier.messages[0]["text"]) + assert notifier.messages[0]["reply_markup"] == { + "inline_keyboard": [[{"text": "打开 Mini App 审批", "web_app": {"url": payload["panel_action_url"]}}]] + } + + list_response = autoresearch_plan_client.get("/api/v1/autoresearch/plans") + assert list_response.status_code == 200 + items = list_response.json() + assert len(items) == 1 + plan_id = items[0]["plan_id"] + + get_response = autoresearch_plan_client.get(f"/api/v1/autoresearch/plans/{plan_id}") + assert get_response.status_code == 200 + assert get_response.json()["plan_id"] == plan_id + + +def test_autoresearch_plan_api_falls_back_to_text_only_notify_when_panel_url_is_not_https( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "planner_target.py", + "\n".join( + [ + "def check() -> bool:", + " # FIXME: add strict regression coverage", + " return True", + "", + ] + ), + ) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="http://127.0.0.1:8000/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier() + + app.dependency_overrides[get_autoresearch_planner_service] = lambda: service + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + with TestClient(app) as client: + response = client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Find the next safe promotion candidate.", + "max_candidates": 2, + "pipeline_target": "patch", + }, + ) + 
app.dependency_overrides.clear() + + assert response.status_code == 202 + payload = response.json() + assert payload["notification_sent"] is True + assert notifier.messages[0]["reply_markup"] is None + + +def test_autoresearch_plan_api_falls_back_to_url_button_when_web_app_send_is_rejected( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "planner_target.py", + "\n".join( + [ + "def check() -> bool:", + " # FIXME: add strict regression coverage", + " return True", + "", + ] + ), + ) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + mini_app_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier(send_results=[False, True]) + + app.dependency_overrides[get_autoresearch_planner_service] = lambda: service + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + with TestClient(app) as client: + response = client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Find the next safe promotion candidate.", + "max_candidates": 2, + }, + ) + app.dependency_overrides.clear() + + assert response.status_code == 202 + payload = response.json() + assert payload["notification_sent"] is True + assert len(notifier.messages) == 2 + assert notifier.messages[0]["reply_markup"] == { + "inline_keyboard": [[{"text": "打开 Mini App 审批", "web_app": {"url": payload["panel_action_url"]}}]] + } + assert notifier.messages[1]["reply_markup"] == { + "inline_keyboard": [[{"text": "打开 Panel 审批", "url": payload["panel_action_url"]}]] + } + assert "Mini App 审批执行" in str(notifier.messages[0]["text"]) + assert 
"Panel 审批执行" in str(notifier.messages[1]["text"]) + + +def test_autoresearch_plan_api_uses_secure_url_button_when_only_https_panel_url_is_available( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "planner_target.py", + "\n".join( + [ + "def check() -> bool:", + " # FIXME: add strict regression coverage", + " return True", + "", + ] + ), + ) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier() + + app.dependency_overrides[get_autoresearch_planner_service] = lambda: service + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + with TestClient(app) as client: + response = client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Find the next safe promotion candidate.", + "max_candidates": 2, + "pipeline_target": "patch", + }, + ) + app.dependency_overrides.clear() + + assert response.status_code == 202 + payload = response.json() + parsed = urlparse(payload["panel_action_url"]) + query = parse_qs(parsed.query) + assert query["planId"] == [payload["plan_id"]] + assert "token" in query + assert notifier.messages[0]["reply_markup"] == { + "inline_keyboard": [[{"text": "打开 Panel 审批", "url": payload["panel_action_url"]}]] + } + + +def test_planner_dispatch_lifecycle_records_run_summary(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "dispatch_target.py", + "\n".join( + [ + "def check() -> bool:", + " # FIXME: dispatch this through the worker", + " return True", + "", + ] + ), + ) + service 
= AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_successful_run_summary, + ) + + plan = service.create(AutoResearchPlannerRequest(telegram_uid="10001")) + queued = service.request_dispatch(plan.plan_id, requested_by="10001") + dispatched = service.execute_dispatch(plan.plan_id) + + assert queued.dispatch_status is AutoResearchPlanDispatchStatus.DISPATCHING + assert queued.dispatch_run is not None + assert queued.dispatch_run.status.value == "queued" + assert queued.dispatch_run.lane.value == "local" + assert dispatched.dispatch_status is AutoResearchPlanDispatchStatus.DISPATCHED + assert dispatched.dispatch_requested_by == "10001" + assert dispatched.dispatch_completed_at is not None + assert dispatched.dispatch_run is not None + assert dispatched.dispatch_run.status.value == "succeeded" + assert dispatched.run_summary is not None + assert dispatched.run_summary.final_status == "ready_for_promotion" + assert dispatched.run_summary.promotion_patch_uri == "/tmp/autoresearch.patch" + + +def test_request_dispatch_records_remote_fallback_preview(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "fallback_target.py", + "def check() -> bool:\n # FIXME: run through fallback preview\n return True\n", + ) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_successful_run_summary, + ) + + plan = service.create( + AutoResearchPlannerRequest( + telegram_uid="10001", + metadata={"runtime_mode": "night", "remote_available": False}, + ) + ) + queued = service.request_dispatch(plan.plan_id, requested_by="10001") + + assert queued.dispatch_run is not None + assert queued.dispatch_run.requested_lane.value == "remote" + assert queued.dispatch_run.lane.value == "local" + assert queued.dispatch_run.fallback_reason is not None + + +def 
test_autoresearch_plan_api_sends_low_noise_upstream_skip_report(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir(parents=True) + service = AutoResearchPlannerService( + repository=InMemoryRepository(), + repo_root=repo_root, + upstream_watcher=StubUpstreamWatcher( + UpstreamWatchRead( + upstream_url="https://github.com/openclaw/openclaw.git", + decision=UpstreamWatchDecision.SKIP, + summary="Recent upstream changes remain in non-core areas (LINE, Zalo); auto-skipped.", + focus_areas=["extension:line", "extension:zalo"], + cleaned_up=True, + ) + ), + ) + panel_access = PanelAccessService( + secret="panel-secret", + telegram_bot_token="123456:TEST_BOT_TOKEN", + telegram_init_data_max_age_seconds=900, + base_url="https://panel.example/api/v1/panel/view", + allowed_uids={"10001"}, + ) + notifier = StubTelegramNotifier() + + app.dependency_overrides[get_autoresearch_planner_service] = lambda: service + app.dependency_overrides[get_panel_access_service] = lambda: panel_access + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + with TestClient(app) as client: + response = client.post( + "/api/v1/autoresearch/plans", + json={ + "goal": "Scan planner backlog and upstream noise.", + "include_upstream_watch": True, + }, + ) + app.dependency_overrides.clear() + + assert response.status_code == 202 + payload = response.json() + assert payload["selected_candidate"] is None + assert payload["upstream_watch"]["decision"] == "skip" + assert payload["notification_sent"] is True + assert payload["panel_action_url"] is None + assert len(notifier.messages) == 1 + assert "已完成上游巡检" in str(notifier.messages[0]["text"]) + assert "LINE/Zalo" in str(notifier.messages[0]["text"]) + + +def _git(repo: Path, *args: str, cwd: Path | None = None) -> str: + env = os.environ.copy() + env.update( + { + "GIT_AUTHOR_NAME": "Codex Tests", + "GIT_AUTHOR_EMAIL": "codex-tests@example.com", + "GIT_COMMITTER_NAME": "Codex Tests", + "GIT_COMMITTER_EMAIL": 
"codex-tests@example.com", + } + ) + completed = subprocess.run( + ["git", *args], + cwd=str(cwd or repo), + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr or completed.stdout + return completed.stdout.strip() + + +def _commit(repo: Path, rel_path: str, content: str, message: str) -> None: + target = repo / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + _git(repo, "add", rel_path) + _git(repo, "commit", "-m", message) diff --git a/tests/test_doctor_linux.py b/tests/test_doctor_linux.py new file mode 100644 index 00000000..27d8974b --- /dev/null +++ b/tests/test_doctor_linux.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +import importlib.util +import sys +from pathlib import Path + + +def _load_doctor_module(): + script_path = Path(__file__).resolve().parents[1] / "scripts" / "doctor.py" + spec = importlib.util.spec_from_file_location("doctor_script", script_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def test_linux_runtime_mode_warns_when_unset(tmp_path, monkeypatch): + doctor = _load_doctor_module() + monkeypatch.delenv("OPENHANDS_RUNTIME", raising=False) + + result = doctor._check_linux_runtime_mode(tmp_path, "linux-remote") + + assert result is not None + assert result.status == "WARN" + assert "OPENHANDS_RUNTIME is not set" in result.detail + assert "OPENHANDS_RUNTIME=host" in (result.hint or "") + + +def test_linux_docker_host_warns_for_mac_colima(monkeypatch): + doctor = _load_doctor_module() + monkeypatch.setenv("DOCKER_HOST", "unix:///Users/demo/.colima/default/docker.sock") + + result = doctor._check_linux_docker_host("linux-remote") + + assert result is not None + assert result.status == "WARN" + assert "colima" in result.detail + + +def 
test_linux_runtime_paths_pass_when_runtime_dirs_are_writable(tmp_path): + doctor = _load_doctor_module() + + result = doctor._check_linux_runtime_paths(tmp_path, "linux-remote") + + assert result is not None + assert result.status == "PASS" diff --git a/tests/test_failure_classifier.py b/tests/test_failure_classifier.py new file mode 100644 index 00000000..391b0e9c --- /dev/null +++ b/tests/test_failure_classifier.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +from autoresearch.agent_protocol.models import DriverResult, RunSummary, ValidationCheck, ValidationReport +from autoresearch.core.dispatch.failure_classifier import classify_remote_status, classify_run_summary +from autoresearch.shared.remote_run_contract import FailureClass, RecoveryAction, RemoteRunStatus + + +def _summary( + *, + driver_status: str, + final_status: str, + agent_id: str = "openhands", + validation_passed: bool = True, + error: str | None = None, +) -> RunSummary: + checks = [] + if not validation_passed: + checks.append(ValidationCheck(id="worker.test_command", passed=False, detail="assertion failed")) + return RunSummary( + run_id="run-classifier", + final_status=final_status, + driver_result=DriverResult( + run_id="run-classifier", + agent_id=agent_id, + status=driver_status, + summary="classifier probe", + changed_paths=["src/demo.py"], + recommended_action="fallback" if agent_id == "mock" else "human_review", + error=error, + ), + validation=ValidationReport( + run_id="run-classifier", + passed=validation_passed, + checks=checks, + ), + ) + + +def test_classifier_maps_stalled_executor_to_retry() -> None: + disposition = classify_run_summary( + _summary(driver_status="stalled_no_progress", final_status="failed", error="no workspace progress") + ) + assert disposition.failure_class is FailureClass.EXECUTOR_STALLED + assert disposition.recovery_action is RecoveryAction.RETRY + + +def test_classifier_maps_timeout_to_retry() -> None: + disposition = 
classify_run_summary(_summary(driver_status="timed_out", final_status="failed")) + assert disposition.failure_class is FailureClass.TOOL_TIMEOUT + assert disposition.recovery_action is RecoveryAction.RETRY + + +def test_classifier_maps_env_missing_to_abort() -> None: + disposition = classify_run_summary( + _summary( + driver_status="contract_error", + final_status="failed", + error="EnvironmentCheckFailed: launch_ai_lab.sh not found at scripts/launch_ai_lab.sh", + ) + ) + assert disposition.failure_class is FailureClass.ENV_MISSING + assert disposition.recovery_action is RecoveryAction.ABORT + + +def test_classifier_maps_workspace_dirty_to_abort() -> None: + disposition = classify_run_summary( + _summary( + driver_status="contract_error", + final_status="failed", + error="repository worktree is not clean; promotion requires a clean base", + ) + ) + assert disposition.failure_class is FailureClass.WORKSPACE_DIRTY + assert disposition.recovery_action is RecoveryAction.ABORT + + +def test_classifier_maps_mock_success_to_downgrade() -> None: + disposition = classify_run_summary( + _summary( + driver_status="succeeded", + final_status="ready_for_promotion", + agent_id="mock", + ) + ) + assert disposition.failure_class is FailureClass.MODEL_FALLBACK + assert disposition.recovery_action is RecoveryAction.DOWNGRADE_TO_DRAFT + + +def test_classifier_maps_mock_validation_failure_to_human_review() -> None: + disposition = classify_run_summary( + _summary( + driver_status="failed", + final_status="human_review", + agent_id="mock", + validation_passed=False, + ) + ) + assert disposition.failure_class is FailureClass.ASSERTION_FAILED_AFTER_FALLBACK + assert disposition.recovery_action is RecoveryAction.REQUIRE_HUMAN_REVIEW + + +def test_classifier_maps_planner_stall_to_human_review() -> None: + disposition = classify_remote_status(RemoteRunStatus.STALLED, stage="planner") + assert disposition.failure_class is FailureClass.PLANNER_STALLED + assert disposition.recovery_action is 
RecoveryAction.REQUIRE_HUMAN_REVIEW + + +def test_classifier_maps_transient_network_to_retry() -> None: + disposition = classify_remote_status( + RemoteRunStatus.FAILED, + error_text="ssh: connection reset by peer", + ) + assert disposition.failure_class is FailureClass.TRANSIENT_NETWORK + assert disposition.recovery_action is RecoveryAction.RETRY diff --git a/tests/test_gateway_telegram.py b/tests/test_gateway_telegram.py index 54c934df..e66310d4 100644 --- a/tests/test_gateway_telegram.py +++ b/tests/test_gateway_telegram.py @@ -13,6 +13,9 @@ get_approval_store_service, get_capability_provider_registry, get_claude_agent_service, + get_github_issue_service, + get_housekeeper_service, + get_manager_agent_service, get_openclaw_memory_service, get_openclaw_compat_service, get_panel_access_service, @@ -20,14 +23,20 @@ ) from autoresearch.api.main import app from autoresearch.api.routers import gateway_telegram +from autoresearch.agent_protocol.models import DriverResult, RunSummary, ValidationReport +from autoresearch.agents.manager_agent import ManagerAgentService from autoresearch.core.adapters import CapabilityProviderDescriptorRead, CapabilityProviderRegistry from autoresearch.core.adapters.contracts import CapabilityDomain, SkillCatalogRead from autoresearch.core.services.admin_config import AdminConfigService from autoresearch.core.services.approval_store import ApprovalStoreService from autoresearch.core.services.claude_agents import ClaudeAgentService +from autoresearch.core.services.github_issue_service import GitHubIssueCommentRead, GitHubIssueRead, GitHubIssueReference +from autoresearch.core.services.housekeeper import HousekeeperService from autoresearch.core.services.openclaw_compat import OpenClawCompatService from autoresearch.core.services.openclaw_memory import OpenClawMemoryService from autoresearch.core.services.panel_access import PanelAccessService +from autoresearch.shared.housekeeper_contract import HousekeeperChangeReason, HousekeeperMode, 
HousekeeperModeUpdateRequest +from autoresearch.shared.manager_agent_contract import ManagerDispatchRead from autoresearch.shared.models import ( AdminAgentConfigRead, AdminChannelConfigRead, @@ -37,8 +46,11 @@ ApprovalRequestCreateRequest, OpenClawMemoryRecordRead, OpenClawSessionRead, + PromotionDiffStats, + PromotionResult, + utc_now, ) -from autoresearch.shared.store import SQLiteModelRepository +from autoresearch.shared.store import InMemoryRepository, SQLiteModelRepository class _StubSkillProvider: @@ -81,6 +93,23 @@ def get_skill(self, skill_name: str): return OpenClawSkillDetailRead(**self._skill, content="# Daily Brief\nUse this skill.\n") +def _night_housekeeper_service() -> HousekeeperService: + service = HousekeeperService( + state_repository=InMemoryRepository(), + budget_repository=InMemoryRepository(), + exploration_repository=InMemoryRepository(), + ) + service.update_mode( + HousekeeperModeUpdateRequest( + action="set_manual_override", + target_mode=HousekeeperMode.NIGHT_READONLY_EXPLORE, + changed_by="test", + reason=HousekeeperChangeReason.MANUAL_API, + ) + ) + return service + + class _StubTelegramNotifier: def __init__(self) -> None: self.status_events: list[dict[str, str]] = [] @@ -127,6 +156,77 @@ def notify_manual_action(self, *, chat_id: str, entry: object, run_status: str) return True +class _StubGitHubIssueService: + def __init__(self) -> None: + self.comments: list[dict[str, str]] = [] + + def fetch_issue(self, raw_reference: str) -> GitHubIssueRead: + return GitHubIssueRead( + reference=GitHubIssueReference(owner="owner", repo="repo", number=123), + title="Audit trail crashes when comment body is empty", + body="Steps:\n1. Trigger the task.\n2. 
Observe the failure.\n\nExpected: dispatch succeeds.", + url="https://github.com/owner/repo/issues/123", + state="OPEN", + author="founder", + labels=("bug", "telegram"), + comments=( + GitHubIssueCommentRead(author="reviewer", body="Please keep the fix scoped and tested."), + ), + ) + + def build_manager_prompt(self, issue: GitHubIssueRead, *, operator_note: str | None = None) -> str: + note = operator_note or "" + return f"Fix GitHub issue {issue.reference.display}. {note}".strip() + + def post_comment(self, raw_reference: str, body: str) -> str: + self.comments.append({"issue_reference": raw_reference, "body": body}) + return f"commented on {raw_reference}" + + +def _build_manager_service(db_path: Path) -> ManagerAgentService: + repository = SQLiteModelRepository( + db_path=db_path, + table_name="manager_agent_dispatches_gateway_it", + model_cls=ManagerDispatchRead, + ) + + def _dispatch_runner(job_spec) -> RunSummary: + return RunSummary( + run_id=job_spec.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job_spec.run_id, + agent_id=job_spec.agent_id, + status="succeeded", + summary="stub manager runner completed", + changed_paths=["src/autoresearch/api/routers/admin.py"], + recommended_action="promote", + ), + validation=ValidationReport(run_id=job_spec.run_id, passed=True), + promotion_patch_uri="artifacts/promotion.patch", + promotion=None, + ).model_copy( + update={ + "promotion": PromotionResult( + run_id=job_spec.run_id, + success=True, + mode="draft_pr", + pr_url=f"https://github.com/owner/repo/pull/{job_spec.run_id[-3:]}", + changed_files=["src/autoresearch/api/routers/admin.py"], + diff_stats=PromotionDiffStats(files_changed=1, insertions=12, deletions=2), + created_at=utc_now(), + updated_at=utc_now(), + ) + } + ) + + return ManagerAgentService( + repository=repository, + repo_root=Path(__file__).resolve().parents[1], + dispatch_runner=_dispatch_runner, + ) + + @pytest.fixture def telegram_client(tmp_path: Path) -> 
TestClient: db_path = tmp_path / "telegram-gateway.sqlite3" @@ -716,6 +816,8 @@ def test_telegram_help_command_returns_available_commands( help_text = notifier.messages[0]["text"] assert "[Telegram Commands]" in help_text assert "/status" in help_text + assert "/task <需求>" in help_text + assert "/task --approve <需求>" in help_text assert "/approve approve" in help_text assert "/memory <内容>" in help_text assert "/mode shared" in help_text @@ -724,6 +826,240 @@ def test_telegram_help_command_returns_available_commands( app.dependency_overrides.pop(get_telegram_notifier_service, None) +def test_telegram_start_command_returns_available_commands( + telegram_client: TestClient, +) -> None: + notifier = _StubTelegramNotifier() + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + + try: + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3152, + "message": { + "message_id": 151, + "text": "/start", + "chat": {"id": 9535, "type": "private"}, + "from": {"id": 9535, "username": "start-user"}, + }, + }, + ) + assert response.status_code == 200 + payload = response.json() + assert payload["accepted"] is True + assert payload["agent_run_id"] is None + assert payload["metadata"]["source"] == "telegram_help" + + assert len(notifier.messages) == 1 + help_text = notifier.messages[0]["text"] + assert "[Telegram Commands]" in help_text + assert "/start 查看欢迎信息和命令列表" in help_text + assert "/status" in help_text + assert "/help" in help_text + finally: + app.dependency_overrides.pop(get_telegram_notifier_service, None) + + +def test_telegram_task_issue_dispatches_manager_and_queues_issue_reply_approval( + telegram_client: TestClient, + tmp_path: Path, +) -> None: + notifier = _StubTelegramNotifier() + github_issue_service = _StubGitHubIssueService() + manager_service = _build_manager_service(tmp_path / "manager.sqlite3") + approval_service = getattr(telegram_client, "_approval_store") + 
app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + app.dependency_overrides[get_github_issue_service] = lambda: github_issue_service + app.dependency_overrides[get_housekeeper_service] = _night_housekeeper_service + app.dependency_overrides[get_manager_agent_service] = lambda: manager_service + + try: + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3161, + "message": { + "message_id": 154, + "text": "/task issue owner/repo#123 优先检查 Telegram 审批回路", + "chat": {"id": 9537, "type": "private"}, + "from": {"id": 9537, "username": "task-user"}, + }, + }, + ) + assert response.status_code == 200 + payload = response.json() + assert payload["accepted"] is True + assert payload["agent_run_id"] is None + assert payload["metadata"]["source"] == "telegram_manager_task" + assert payload["metadata"]["task_source"] == "issue" + assert payload["metadata"]["dispatch_id"] + assert payload["metadata"]["issue_reference"] == "owner/repo#123" + + dispatch = manager_service.get_dispatch(payload["metadata"]["dispatch_id"]) + assert dispatch is not None + assert dispatch.status.value == "completed" + assert dispatch.run_summary is not None + assert dispatch.run_summary.promotion is not None + assert dispatch.run_summary.promotion.pr_url + + approvals = approval_service.list_requests(telegram_uid="9537", limit=10) + assert len(approvals) == 1 + approval = approvals[0] + assert approval.metadata["action_type"] == "github_issue_comment" + assert approval.metadata["issue_reference"] == "owner/repo#123" + assert "Automated progress update" in approval.metadata["comment_body"] + + assert len(notifier.messages) >= 3 + assert any("已接收,开始拆解并执行" in item["text"] for item in notifier.messages) + assert any("draft_pr:" in item["text"] for item in notifier.messages) + assert any("[GitHub Reply Pending]" in item["text"] for item in notifier.messages) + finally: + app.dependency_overrides.pop(get_telegram_notifier_service, None) + 
app.dependency_overrides.pop(get_github_issue_service, None) + app.dependency_overrides.pop(get_housekeeper_service, None) + app.dependency_overrides.pop(get_manager_agent_service, None) + + +def test_telegram_task_approve_flag_grants_owner_dispatch_context( + telegram_client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + notifier = _StubTelegramNotifier() + manager_service = _build_manager_service(tmp_path / "manager-approve-flag.sqlite3") + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + app.dependency_overrides[get_manager_agent_service] = lambda: manager_service + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_OWNER_UIDS", "9541") + + try: + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3164, + "message": { + "message_id": 157, + "text": "/task --approve 为美妆品牌玛露开发 6g 遮瑕膏落地页", + "chat": {"id": 9541, "type": "private"}, + "from": {"id": 9541, "username": "owner-user"}, + }, + }, + ) + assert response.status_code == 200 + payload = response.json() + dispatch = manager_service.get_dispatch(payload["metadata"]["dispatch_id"]) + + assert dispatch is not None + backend_task = dispatch.execution_plan.tasks[0] + assert backend_task.worker_spec is not None + assert backend_task.agent_job is not None + assert backend_task.worker_spec.metadata["approval_granted"] is True + assert backend_task.agent_job.metadata["approval_granted"] is True + finally: + app.dependency_overrides.pop(get_telegram_notifier_service, None) + app.dependency_overrides.pop(get_manager_agent_service, None) + + +def test_telegram_task_approve_flag_is_ignored_for_non_admin_user( + telegram_client: TestClient, + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + notifier = _StubTelegramNotifier() + manager_service = _build_manager_service(tmp_path / "manager-non-admin-approve.sqlite3") + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + 
app.dependency_overrides[get_manager_agent_service] = lambda: manager_service + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_ALLOWED_UIDS", "9542") + + try: + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3165, + "message": { + "message_id": 158, + "text": "/task --approve 为美妆品牌玛露开发 6g 遮瑕膏落地页", + "chat": {"id": 9542, "type": "private"}, + "from": {"id": 9542, "username": "member-user"}, + }, + }, + ) + assert response.status_code == 200 + payload = response.json() + dispatch = manager_service.get_dispatch(payload["metadata"]["dispatch_id"]) + + assert dispatch is not None + backend_task = dispatch.execution_plan.tasks[0] + assert backend_task.worker_spec is not None + assert backend_task.agent_job is not None + assert backend_task.worker_spec.metadata["approval_granted"] is False + assert backend_task.agent_job.metadata["approval_granted"] is False + assert any("仅对 owner/partner 生效" in item["text"] for item in notifier.messages) + finally: + app.dependency_overrides.pop(get_telegram_notifier_service, None) + app.dependency_overrides.pop(get_manager_agent_service, None) + + +def test_telegram_approve_command_posts_github_issue_reply_for_issue_tasks( + telegram_client: TestClient, + tmp_path: Path, +) -> None: + notifier = _StubTelegramNotifier() + github_issue_service = _StubGitHubIssueService() + manager_service = _build_manager_service(tmp_path / "manager-approve.sqlite3") + approval_service = getattr(telegram_client, "_approval_store") + app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier + app.dependency_overrides[get_github_issue_service] = lambda: github_issue_service + app.dependency_overrides[get_housekeeper_service] = _night_housekeeper_service + app.dependency_overrides[get_manager_agent_service] = lambda: manager_service + + try: + task_response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3162, + "message": { + "message_id": 155, + "text": "/task 
issue #123 带上修复摘要", + "chat": {"id": 9538, "type": "private"}, + "from": {"id": 9538, "username": "task-user"}, + }, + }, + ) + assert task_response.status_code == 200 + approvals = approval_service.list_requests(telegram_uid="9538", limit=10) + assert len(approvals) == 1 + approval = approvals[0] + + approve_response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 3163, + "message": { + "message_id": 156, + "text": f"/approve {approval.approval_id} approve 发出去", + "chat": {"id": 9538, "type": "private"}, + "from": {"id": 9538, "username": "task-user"}, + }, + }, + ) + assert approve_response.status_code == 200 + assert approve_response.json()["accepted"] is True + + resolved = approval_service.get_request(approval.approval_id) + assert resolved is not None + assert resolved.status.value == "approved" + assert resolved.metadata["comment_posted"] is True + assert github_issue_service.comments[0]["issue_reference"] == "owner/repo#123" + assert "Automated progress update" in github_issue_service.comments[0]["body"] + assert any("[GitHub Reply Posted]" in item["text"] for item in notifier.messages) + finally: + app.dependency_overrides.pop(get_telegram_notifier_service, None) + app.dependency_overrides.pop(get_github_issue_service, None) + app.dependency_overrides.pop(get_housekeeper_service, None) + app.dependency_overrides.pop(get_manager_agent_service, None) + + def test_telegram_approve_command_lists_and_reads_pending_approvals( telegram_client: TestClient, ) -> None: diff --git a/tests/test_gateway_telegram_guards.py b/tests/test_gateway_telegram_guards.py new file mode 100644 index 00000000..4a1028d3 --- /dev/null +++ b/tests/test_gateway_telegram_guards.py @@ -0,0 +1,182 @@ +from __future__ import annotations + +import sys + +import pytest +from fastapi.testclient import TestClient + +from autoresearch.api.routers import gateway_telegram +from tests.test_gateway_telegram import clear_gateway_guards, telegram_client # noqa: 
F401 + + +def test_mainline_webhook_happy_path_with_secret_header( + telegram_client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_SECRET_TOKEN", "mainline-secret") + monkeypatch.setenv( + "AUTORESEARCH_TELEGRAM_CLAUDE_COMMAND_OVERRIDE", + f"{sys.executable} -c \"print('guard-happy-ok')\"", + ) + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_APPEND_PROMPT", "false") + + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + headers={"x-telegram-bot-api-secret-token": "mainline-secret"}, + json={ + "update_id": 4101, + "message": { + "message_id": 501, + "text": "happy path", + "chat": {"id": 88001, "type": "private"}, + }, + }, + ) + + assert response.status_code == 200 + payload = response.json() + assert payload["accepted"] is True + assert payload["agent_run_id"] is not None + + +def test_mainline_webhook_rejects_missing_secret_before_replay_guard( + telegram_client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_SECRET_TOKEN", "ordered-secret") + + events: list[str] = [] + original_validate = gateway_telegram._validate_secret_token + original_guard = gateway_telegram._guard_webhook_replay_and_rate + + def wrapped_validate(raw_request) -> None: + events.append("secret") + return original_validate(raw_request) + + def wrapped_guard(update) -> None: + events.append("guard") + return original_guard(update) + + monkeypatch.setattr(gateway_telegram, "_validate_secret_token", wrapped_validate) + monkeypatch.setattr(gateway_telegram, "_guard_webhook_replay_and_rate", wrapped_guard) + + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + json={ + "update_id": 4102, + "message": { + "message_id": 502, + "text": "missing secret", + "chat": {"id": 88002, "type": "private"}, + }, + }, + ) + + assert response.status_code == 401 + assert events == ["secret"] + + +def 
test_mainline_webhook_runs_secret_check_before_replay_guard_on_success( + telegram_client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_SECRET_TOKEN", "ordered-secret") + monkeypatch.setenv( + "AUTORESEARCH_TELEGRAM_CLAUDE_COMMAND_OVERRIDE", + f"{sys.executable} -c \"print('ordered-ok')\"", + ) + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_APPEND_PROMPT", "false") + + events: list[str] = [] + original_validate = gateway_telegram._validate_secret_token + original_guard = gateway_telegram._guard_webhook_replay_and_rate + + def wrapped_validate(raw_request) -> None: + events.append("secret") + return original_validate(raw_request) + + def wrapped_guard(update) -> None: + events.append("guard") + return original_guard(update) + + monkeypatch.setattr(gateway_telegram, "_validate_secret_token", wrapped_validate) + monkeypatch.setattr(gateway_telegram, "_guard_webhook_replay_and_rate", wrapped_guard) + + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + headers={"x-telegram-bot-api-secret-token": "ordered-secret"}, + json={ + "update_id": 4103, + "message": { + "message_id": 503, + "text": "ordered success", + "chat": {"id": 88003, "type": "private"}, + }, + }, + ) + + assert response.status_code == 200 + assert response.json()["accepted"] is True + assert events[:2] == ["secret", "guard"] + + +def test_mainline_webhook_rejects_replayed_update_id( + telegram_client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_SECRET_TOKEN", "replay-secret") + monkeypatch.setenv( + "AUTORESEARCH_TELEGRAM_CLAUDE_COMMAND_OVERRIDE", + f"{sys.executable} -c \"print('replay-guard-ok')\"", + ) + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_APPEND_PROMPT", "false") + + headers = {"x-telegram-bot-api-secret-token": "replay-secret"} + payload = { + "update_id": 4104, + "message": { + "message_id": 504, + "text": "replay check", + "chat": {"id": 88004, "type": 
"private"}, + }, + } + + first = telegram_client.post("/api/v1/gateway/telegram/webhook", headers=headers, json=payload) + second = telegram_client.post("/api/v1/gateway/telegram/webhook", headers=headers, json=payload) + + assert first.status_code == 200 + assert second.status_code == 409 + + +def test_mainline_webhook_rejects_per_chat_rate_limit_overflow( + telegram_client: TestClient, + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_SECRET_TOKEN", "rate-secret") + monkeypatch.setenv( + "AUTORESEARCH_TELEGRAM_CLAUDE_COMMAND_OVERRIDE", + f"{sys.executable} -c \"print('rate-guard-ok')\"", + ) + monkeypatch.setenv("AUTORESEARCH_TELEGRAM_APPEND_PROMPT", "false") + + headers = {"x-telegram-bot-api-secret-token": "rate-secret"} + chat_id = 88005 + + for i in range(1, 32): + response = telegram_client.post( + "/api/v1/gateway/telegram/webhook", + headers=headers, + json={ + "update_id": 4200 + i, + "message": { + "message_id": 600 + i, + "text": f"rate-{i}", + "chat": {"id": chat_id, "type": "private"}, + }, + }, + ) + + if i <= 30: + assert response.status_code == 200 + else: + assert response.status_code == 429 diff --git a/tests/test_git_promotion_gate.py b/tests/test_git_promotion_gate.py index d615050d..67baae64 100644 --- a/tests/test_git_promotion_gate.py +++ b/tests/test_git_promotion_gate.py @@ -126,6 +126,27 @@ def _write_patch(root: Path, *, filename: str = "src/demo.py") -> Path: return patch_path +def _write_large_patch(root: Path, *, filename: str, added_lines: int) -> Path: + patch_path = root / "promotion.patch" + additions = [f'+LINE_{index} = "{index}"' for index in range(added_lines)] + patch_path.write_text( + "\n".join( + [ + f"diff --git a/{filename} b/{filename}", + "new file mode 100644", + "index 0000000..1111111", + "--- /dev/null", + f"+++ b/{filename}", + f"@@ -0,0 +1,{added_lines} @@", + *additions, + "", + ] + ), + encoding="utf-8", + ) + return patch_path + + def _intent( patch_path: Path, *, @@ 
-233,6 +254,48 @@ def test_finalize_defaults_to_patch_when_patch_gates_pass(tmp_path: Path) -> Non assert (artifacts_dir / "promotion_result.json").exists() +def test_finalize_ignores_benign_runtime_artifacts_and_accepts_large_business_patch( + tmp_path: Path, +) -> None: + patch_path = _write_large_patch( + tmp_path, + filename="apps/malu/lead_capture.py", + added_lines=600, + ) + artifacts_dir = tmp_path / "artifacts" + + service = GitPromotionGateService(repo_root=tmp_path) + preflight, result = service.finalize( + intent=PromotionIntent( + run_id="run-business-patch", + actor_role=PromotionActorRole.AGGREGATOR, + actor_id="aggregator-1", + writer_id="worker-1", + writer_lease_key="writer:business-patch", + patch_uri=str(patch_path), + changed_files=[ + "apps/malu/lead_capture.py", + "apps/malu/README.md", + ".pytest_cache/README.md", + "tests/apps/__pycache__/test_malu_landing_page.cpython-314.pyc", + ], + base_ref="HEAD", + preferred_mode=GitPromotionMode.PATCH, + target_base_branch="main", + approval_granted=False, + metadata={}, + ), + artifacts_dir=artifacts_dir, + ) + + checks = {item.id: item for item in preflight.checks} + + assert result.success is True + assert result.mode is GitPromotionMode.PATCH + assert checks["gate.no_runtime_artifacts"].passed is True + assert checks["gate.max_patch_lines"].passed is True + + def test_finalize_upgrades_patch_to_draft_pr_when_all_preconditions_pass(tmp_path: Path) -> None: patch_path = _write_patch(tmp_path) artifacts_dir = tmp_path / "artifacts" diff --git a/tests/test_github_issue_service.py b/tests/test_github_issue_service.py new file mode 100644 index 00000000..09059067 --- /dev/null +++ b/tests/test_github_issue_service.py @@ -0,0 +1,56 @@ +from __future__ import annotations + +from pathlib import Path + +from autoresearch.core.services.github_issue_service import ( + GitHubIssueCommentRead, + GitHubIssueRead, + GitHubIssueReference, + GitHubIssueService, +) + + +def 
test_resolve_issue_reference_supports_url_and_shorthand(tmp_path: Path) -> None: + service = GitHubIssueService(repo_root=tmp_path) + + from_url = service.resolve_issue_reference("https://github.com/openai/example/issues/42") + assert from_url.owner == "openai" + assert from_url.repo == "example" + assert from_url.number == 42 + + from_ref = service.resolve_issue_reference("openai/example#43") + assert from_ref.owner == "openai" + assert from_ref.repo == "example" + assert from_ref.number == 43 + + +def test_resolve_issue_reference_supports_current_repo_issue_numbers(tmp_path: Path) -> None: + service = GitHubIssueService(repo_root=tmp_path) + service._resolve_current_repo = lambda: ("owner", "repo") # type: ignore[method-assign] + + reference = service.resolve_issue_reference("#44") + assert reference.owner == "owner" + assert reference.repo == "repo" + assert reference.number == 44 + + +def test_build_manager_prompt_includes_issue_context_and_note(tmp_path: Path) -> None: + service = GitHubIssueService(repo_root=tmp_path) + issue = GitHubIssueRead( + reference=GitHubIssueReference(owner="owner", repo="repo", number=45), + title="Telegram task dispatch should create approvals", + body="Expected behavior: issue tasks should ask before replying externally.", + url="https://github.com/owner/repo/issues/45", + state="OPEN", + author="founder", + labels=("bug", "telegram"), + comments=( + GitHubIssueCommentRead(author="reviewer", body="Please keep the fix narrow."), + ), + ) + + prompt = service.build_manager_prompt(issue, operator_note="先保证审批链别断。") + assert "owner/repo#45" in prompt + assert "先保证审批链别断。" in prompt + assert "Please keep the fix narrow." 
in prompt + assert "prepare a draft PR when possible" in prompt diff --git a/tests/test_housekeeper.py b/tests/test_housekeeper.py new file mode 100644 index 00000000..e6158eaf --- /dev/null +++ b/tests/test_housekeeper.py @@ -0,0 +1,198 @@ +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +from autoresearch.agent_protocol.models import DriverResult, JobSpec, RunSummary, ValidationReport +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.core.services.telegram_notify import TelegramNotifierService +from autoresearch.shared.housekeeper_contract import ( + CircuitBreakerStatus, + ExplorationBlockerReason, + HousekeeperChangeReason, + HousekeeperMode, + HousekeeperModeUpdateRequest, +) +from autoresearch.shared.manager_agent_contract import ManagerDispatchRequest +from autoresearch.shared.models import ApprovalRequestCreateRequest +from autoresearch.shared.store import InMemoryRepository + + +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _seed_admin_dashboard_repo(repo_root: Path) -> None: + _write(repo_root / "panel" / "app.tsx", "export const App = () => null;\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "panel.py", "router = object()\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "admin.py", "router = object()\n") + _write(repo_root / "src" / "autoresearch" / "core" / "services" / "metrics_dashboard.py", "def ok():\n return True\n") + _write(repo_root / "tests" / "test_panel_security.py", "def test_ok():\n assert True\n") + _write(repo_root / "tests" / "test_admin_managed_skills.py", "def test_admin_ok():\n assert True\n") + + +def _successful_run_summary(job: JobSpec) -> 
RunSummary: + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="ok", + changed_paths=list(job.policy.allowed_paths), + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri="/tmp/demo.patch", + ) + + +def _service() -> HousekeeperService: + return HousekeeperService( + state_repository=InMemoryRepository(), + budget_repository=InMemoryRepository(), + exploration_repository=InMemoryRepository(), + ) + + +def test_housekeeper_manual_override_replaces_prior_override() -> None: + service = _service() + base_now = datetime(2026, 3, 31, 12, 0, tzinfo=timezone.utc) + state = service.get_state(now=base_now) + assert state.scheduled_mode is HousekeeperMode.DAY_SAFE + + first = service.update_mode( + HousekeeperModeUpdateRequest( + action="set_manual_override", + target_mode=HousekeeperMode.NIGHT_READONLY_EXPLORE, + changed_by="test", + reason=HousekeeperChangeReason.MANUAL_API, + ), + now=base_now, + ) + assert first.manual_override_mode is HousekeeperMode.NIGHT_READONLY_EXPLORE + assert first.effective_mode is HousekeeperMode.NIGHT_READONLY_EXPLORE + + replaced = service.update_mode( + HousekeeperModeUpdateRequest( + action="set_manual_override", + target_mode=HousekeeperMode.DAY_SAFE, + changed_by="test", + reason=HousekeeperChangeReason.MANUAL_API, + ), + now=base_now, + ) + assert replaced.manual_override_mode is HousekeeperMode.DAY_SAFE + assert replaced.effective_mode is HousekeeperMode.DAY_SAFE + + +def test_housekeeper_prepare_manager_request_defers_heavy_dispatch_in_day_mode(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + manager_service = ManagerAgentService(repository=InMemoryRepository(), repo_root=repo_root) + housekeeper = _service() + + prepared, assessment, state = housekeeper.prepare_manager_request( + 
ManagerDispatchRequest( + prompt="在 Admin Panel 里加一个带图表的实时服务器资源监控大屏。", + auto_dispatch=True, + ), + manager_service=manager_service, + trigger_source="test", + now=datetime(2026, 3, 31, 12, 0, tzinfo=timezone.utc), + ) + + assert state.effective_mode is HousekeeperMode.DAY_SAFE + assert assessment.plan_shape == "task_dag" + assert assessment.fanout_count == 3 + assert prepared.auto_dispatch is False + assert prepared.pipeline_target == "patch" + assert prepared.metadata["deferred_reason"] == "deferred_to_night" + assert prepared.metadata["execution_profile"]["profile_name"] == "day_safe" + + +def test_housekeeper_morning_summary_uses_four_sections(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + manager_service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_successful_run_summary, + ) + dispatch = manager_service.create_dispatch( + ManagerDispatchRequest( + prompt="在 Admin Panel 里加一个带图表的实时服务器资源监控大屏。", + auto_dispatch=False, + ) + ) + manager_service.execute_dispatch(dispatch.dispatch_id) + + approval_service = ApprovalStoreService(repository=InMemoryRepository()) + approval_service.create_request( + ApprovalRequestCreateRequest( + title="Review manager result", + telegram_uid="10001", + ) + ) + housekeeper = _service() + summary = housekeeper.create_morning_summary( + manager_service=manager_service, + planner_service=type("PlannerStub", (), {"list": lambda self: [], "list_pending": lambda self, limit=20: []})(), + approval_service=approval_service, + notifier=TelegramNotifierService(bot_token=None), + media_jobs=[], + now=datetime(2026, 4, 1, 0, 30, tzinfo=timezone.utc), + ) + + assert "昨夜完成了什么" in summary.summary_text + assert "失败/阻塞了什么" in summary.summary_text + assert "今天需要你决定什么" in summary.summary_text + assert "系统当前模式与待执行队列" in summary.summary_text + assert summary.decision_items + + +def test_housekeeper_night_tick_reports_circuit_breaker_blocker_reason() 
-> None: + housekeeper = _service() + housekeeper.update_mode( + HousekeeperModeUpdateRequest( + action="set_manual_override", + target_mode=HousekeeperMode.NIGHT_READONLY_EXPLORE, + changed_by="test", + reason=HousekeeperChangeReason.MANUAL_API, + ), + now=datetime(2026, 3, 31, 16, 0, tzinfo=timezone.utc), + ) + housekeeper.update_mode( + HousekeeperModeUpdateRequest( + action="apply_schedule", + target_mode=HousekeeperMode.DAY_SAFE, + changed_by="system", + reason=HousekeeperChangeReason.CIRCUIT_BREAKER, + ), + now=datetime(2026, 3, 31, 16, 0, tzinfo=timezone.utc), + ) + state = housekeeper.get_state(now=datetime(2026, 3, 31, 16, 0, tzinfo=timezone.utc)) + tripped = state.model_copy( + update={ + "circuit_breaker_state": state.circuit_breaker_state.model_copy( + update={"status": CircuitBreakerStatus.OPEN} + ), + } + ) + housekeeper._state_repository.save(tripped.state_id, tripped) + + tick = housekeeper.execute_night_explore_tick( + manager_service=type("ManagerStub", (), {})(), + planner_service=type("PlannerStub", (), {})(), + notifier=TelegramNotifierService(bot_token=None), + media_jobs=[], + now=datetime(2026, 3, 31, 16, 0, tzinfo=timezone.utc), + ) + + assert tick.executed is False + assert tick.skipped_reason == "circuit_breaker_open" + assert tick.blocker_reason is ExplorationBlockerReason.CIRCUIT_BREAKER_OPEN diff --git a/tests/test_launch_ai_lab.py b/tests/test_launch_ai_lab.py new file mode 100644 index 00000000..a78a5509 --- /dev/null +++ b/tests/test_launch_ai_lab.py @@ -0,0 +1,162 @@ +from __future__ import annotations + +import os +import stat +import subprocess +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[1] +SCRIPT = REPO_ROOT / "scripts" / "launch_ai_lab.sh" + + +def _write_executable(path: Path, content: str) -> None: + path.write_text(content, encoding="utf-8") + path.chmod(path.stat().st_mode | stat.S_IXUSR) + + +def _base_env(tmp_path: Path) -> dict[str, str]: + workspace = tmp_path / "workspace" + cache_dir = 
tmp_path / "cache" + log_dir = tmp_path / "logs" + workspace.mkdir() + cache_dir.mkdir() + log_dir.mkdir() + + fake_bin = tmp_path / "bin" + fake_bin.mkdir() + fake_docker = fake_bin / "docker" + _write_executable( + fake_docker, + """#!/usr/bin/env bash +set -euo pipefail +cmd="${1:-}" +sub="${2:-}" +case "${cmd} ${sub}" in + "context show") + printf '%s\n' "${FAKE_DOCKER_CONTEXT:-desktop-linux}" + ;; + "version --format") + if [[ "${FAKE_DOCKER_INFO_OK:-0}" == "1" ]] || [[ "${DOCKER_HOST:-}" == "${FAKE_REPO_DOCKER_HOST:-}" ]]; then + printf '%s\n' "27.0.0" + exit 0 + fi + exit 1 + ;; + "info ") + if [[ "${FAKE_DOCKER_INFO_OK:-0}" == "1" ]] || [[ "${DOCKER_HOST:-}" == "${FAKE_REPO_DOCKER_HOST:-}" ]]; then + exit 0 + fi + exit 1 + ;; + "image inspect") + exit 0 + ;; + *) + exit 0 + ;; +esac +""", + ) + + inaccessible_socket = tmp_path / "other-user.sock" + inaccessible_socket.write_text("", encoding="utf-8") + inaccessible_socket.chmod(0) + + env = os.environ.copy() + env.update( + { + "PATH": f"{fake_bin}:{env['PATH']}", + "ENV_FILE": str(tmp_path / "missing.env"), + "WORKSPACE_DIR": str(workspace), + "CACHE_DIR": str(cache_dir), + "LOG_DIR": str(log_dir), + "OPENHANDS_HOME_DIR": str(log_dir / "openhands-home"), + "AUTO_OPEN_DOCKER": "0", + "AI_LAB_FORCE_DOCKER_RUN": "1", + "AI_LAB_GUARDRAIL_DOCKER_CONTEXT": "colima", + "DOCKER_CONTEXT": "colima", + "DOCKER_HOST": f"unix://{inaccessible_socket}", + } + ) + return env + + +def test_launch_ai_lab_rejects_inaccessible_configured_socket(tmp_path: Path) -> None: + env = _base_env(tmp_path) + env["AUTO_START_COLIMA"] = "0" + + completed = subprocess.run( + ["bash", str(SCRIPT), "status"], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 1 + assert "Docker socket is configured but not accessible" in completed.stderr + + +def test_launch_ai_lab_can_fallback_to_repo_managed_colima(tmp_path: Path) -> None: + env = _base_env(tmp_path) + 
env["AUTO_START_COLIMA"] = "1" + colima_home = tmp_path / "colima-home" + repo_socket = colima_home / "default" / "docker.sock" + helper = tmp_path / "fake_colima_helper.sh" + _write_executable( + helper, + """#!/usr/bin/env bash +set -euo pipefail +mkdir -p "${COLIMA_HOME_PATH}/${COLIMA_PROFILE:-default}" +: > "${COLIMA_HOME_PATH}/${COLIMA_PROFILE:-default}/docker.sock" +""", + ) + env["COLIMA_HOME_PATH"] = str(colima_home) + env["AI_LAB_COLIMA_HELPER"] = str(helper) + env["FAKE_REPO_DOCKER_HOST"] = f"unix://{repo_socket}" + + completed = subprocess.run( + ["bash", str(SCRIPT), "status"], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert "repo-managed Colima is ready" in completed.stdout + + +def test_launch_ai_lab_can_fallback_to_current_user_colima(tmp_path: Path) -> None: + env = _base_env(tmp_path) + env["AUTO_START_COLIMA"] = "1" + + home_dir = tmp_path / "home" + home_dir.mkdir() + current_user_socket = home_dir / ".colima" / "default" / "docker.sock" + fake_colima = tmp_path / "bin" / "colima" + _write_executable( + fake_colima, + """#!/usr/bin/env bash +set -euo pipefail +mkdir -p "${HOME}/.colima/${COLIMA_PROFILE:-default}" +: > "${HOME}/.colima/${COLIMA_PROFILE:-default}/docker.sock" +""", + ) + env["HOME"] = str(home_dir) + env["FAKE_REPO_DOCKER_HOST"] = f"unix://{current_user_socket}" + + completed = subprocess.run( + ["bash", str(SCRIPT), "status"], + cwd=REPO_ROOT, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert "current-user Colima is ready" in completed.stdout diff --git a/tests/test_manager_agent.py b/tests/test_manager_agent.py new file mode 100644 index 00000000..ff0f7513 --- /dev/null +++ b/tests/test_manager_agent.py @@ -0,0 +1,264 @@ +from __future__ import annotations + +from pathlib import Path +import sys + +from fastapi.testclient import TestClient + +from autoresearch.agent_protocol.models import 
DriverResult, JobSpec, RunSummary, ValidationReport +from autoresearch.agents.manager_agent import ManagerAgentService +from autoresearch.api.dependencies import get_housekeeper_service, get_manager_agent_service +from autoresearch.api.main import app +from autoresearch.core.services.housekeeper import HousekeeperService +from autoresearch.shared.housekeeper_contract import HousekeeperChangeReason, HousekeeperMode, HousekeeperModeUpdateRequest +from autoresearch.shared.manager_agent_contract import ( + ManagerDispatchRequest, + ManagerPlanStrategy, +) +from autoresearch.shared.models import JobStatus +from autoresearch.shared.store import InMemoryRepository + + +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _successful_run_summary(job: JobSpec) -> RunSummary: + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="manager dispatch completed successfully", + changed_paths=list(job.policy.allowed_paths), + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri="/tmp/manager-dispatch.patch", + ) + + +def _night_housekeeper_service() -> HousekeeperService: + service = HousekeeperService( + state_repository=InMemoryRepository(), + budget_repository=InMemoryRepository(), + exploration_repository=InMemoryRepository(), + ) + service.update_mode( + HousekeeperModeUpdateRequest( + action="set_manual_override", + target_mode=HousekeeperMode.NIGHT_READONLY_EXPLORE, + changed_by="test", + reason=HousekeeperChangeReason.MANUAL_API, + ) + ) + return service + + +def _seed_basic_panel_repo(repo_root: Path) -> None: + _write(repo_root / "panel" / "app.tsx", "export const App = () => null;\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "panel.py", "router = 
object()\n") + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "openclaw.py", "router = object()\n") + _write(repo_root / "tests" / "test_panel_security.py", "def test_ok():\n assert True\n") + + +def _seed_admin_dashboard_repo(repo_root: Path) -> None: + _seed_basic_panel_repo(repo_root) + _write(repo_root / "src" / "autoresearch" / "api" / "routers" / "admin.py", "router = object()\n") + _write( + repo_root / "src" / "autoresearch" / "core" / "services" / "metrics_dashboard.py", + "def collect_metrics() -> dict[str, int]:\n return {'cpu': 1}\n", + ) + _write(repo_root / "tests" / "test_admin_managed_skills.py", "def test_admin_ok():\n assert True\n") + _write(repo_root / "tests" / "test_admin_backend.py", "def test_admin_backend_ok():\n assert True\n") + + +def test_manager_agent_translates_fuzzy_game_prompt_into_worker_contract(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_basic_panel_repo(repo_root) + + service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + dispatch = service.create_dispatch( + ManagerDispatchRequest( + prompt="我想做个小游戏,先在现有 panel 里做一个最小可玩的版本。", + auto_dispatch=False, + ) + ) + + assert dispatch.status is JobStatus.CREATED + assert dispatch.selected_intent is not None + assert dispatch.selected_intent.intent_id == "game_prototype" + assert dispatch.execution_plan is not None + assert dispatch.execution_plan.strategy is ManagerPlanStrategy.SINGLE_TASK + assert len(dispatch.execution_plan.tasks) == 1 + assert "panel/**" in dispatch.worker_spec.allowed_paths + assert "tests/test_panel_security.py" in dispatch.worker_spec.allowed_paths + assert dispatch.worker_spec.test_command == "pytest -q tests/test_panel_security.py" + assert dispatch.agent_job is not None + assert dispatch.agent_job.metadata["manager_intent_label"] == "game_prototype" + assert "小游戏" in dispatch.worker_spec.metadata["manager_prompt"] + + +def 
test_manager_agent_decomposes_complex_prompt_into_task_dag(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + + service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + dispatch = service.create_dispatch( + ManagerDispatchRequest( + prompt="在 Admin Panel 里加一个带图表的实时服务器资源监控大屏。", + auto_dispatch=False, + ) + ) + + assert dispatch.execution_plan is not None + assert dispatch.execution_plan.strategy is ManagerPlanStrategy.TASK_DAG + assert len(dispatch.execution_plan.tasks) == 3 + + backend_task, tests_task, frontend_task = dispatch.execution_plan.tasks + assert backend_task.stage.value == "backend" + assert tests_task.stage.value == "tests" + assert frontend_task.stage.value == "frontend" + assert tests_task.depends_on == [backend_task.task_id] + assert frontend_task.depends_on == [backend_task.task_id, tests_task.task_id] + assert any(path.startswith("src/autoresearch/api/routers/admin.py") for path in backend_task.worker_spec.allowed_paths) + assert tests_task.worker_spec.allowed_paths == [ + "tests/test_panel_security.py", + "tests/test_admin_managed_skills.py", + ] + assert "panel/**" in frontend_task.worker_spec.allowed_paths + assert frontend_task.worker_spec.metadata["manager_task_stage"] == "frontend" + + +def test_manager_agent_routes_issue_style_landing_page_prompt_to_business_dag(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + + service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + dispatch = service.create_dispatch( + ManagerDispatchRequest( + prompt=( + "Resolve the following GitHub issue in the current repository through the existing patch-only " + "manager pipeline.\n\n" + "Title: Chaos Run: 玛露遮瑕膏落地页商业化压力测试\n" + "Issue body:\n" + "1. 为玛露 6g 罐装遮瑕膏设计一个最小可用的高端浅色风落地页。\n" + "2. 提供预约/留资后端接口。\n" + "3. 
补齐至少一组边界测试。\n" + "Deliver the smallest useful fix, stay within scoped files, update tests when needed." + ), + auto_dispatch=False, + ) + ) + + assert dispatch.selected_intent is not None + assert dispatch.selected_intent.intent_id == "product_landing_page" + assert dispatch.execution_plan is not None + assert dispatch.execution_plan.strategy is ManagerPlanStrategy.TASK_DAG + backend_task, tests_task, frontend_task = dispatch.execution_plan.tasks + assert dispatch.selected_intent.metadata["surface_slug"] == "malu" + assert dispatch.selected_intent.metadata["surface_root"] == "apps/malu" + assert backend_task.worker_spec.allowed_paths == [ + "apps/malu/**", + "tests/apps/__init__.py", + "tests/apps/test_malu_landing_page.py", + ] + assert backend_task.worker_spec.test_command == "pytest -q tests/apps/test_malu_landing_page.py" + assert tests_task.worker_spec.allowed_paths == [ + "tests/apps/__init__.py", + "tests/apps/test_malu_landing_page.py", + ] + assert tests_task.worker_spec.test_command == "pytest -q tests/apps/test_malu_landing_page.py" + assert frontend_task.worker_spec.allowed_paths == ["apps/malu/**"] + assert frontend_task.worker_spec.metadata["manager_intent_label"] == "product_landing_page" + + +def test_manager_agent_routes_direct_malu_landing_page_prompt_to_product_intent(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + + service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + ) + + dispatch = service.create_dispatch( + ManagerDispatchRequest( + prompt="给我做一个玛露 6g 遮瑕膏落地页,带浅色品牌 UI、预约留资接口和基础测试。", + auto_dispatch=False, + ) + ) + + assert dispatch.selected_intent is not None + assert dispatch.selected_intent.intent_id == "product_landing_page" + assert dispatch.execution_plan is not None + assert dispatch.execution_plan.strategy is ManagerPlanStrategy.TASK_DAG + assert dispatch.selected_intent.allowed_paths == [ + "apps/malu/**", + "tests/apps/__init__.py", + 
"tests/apps/test_malu_landing_page.py", + ] + assert dispatch.execution_plan.tasks[0].worker_spec.metadata["manager_intent_label"] == "product_landing_page" + assert dispatch.selected_intent.metadata["surface_root"] == "apps/malu" + assert dispatch.execution_plan.tasks[0].worker_spec.allowed_paths == [ + "apps/malu/**", + "tests/apps/__init__.py", + "tests/apps/test_malu_landing_page.py", + ] + assert dispatch.execution_plan.tasks[1].worker_spec.allowed_paths == [ + "tests/apps/__init__.py", + "tests/apps/test_malu_landing_page.py", + ] + assert dispatch.execution_plan.tasks[2].worker_spec.allowed_paths == ["apps/malu/**"] + + +def test_manager_agent_api_dispatch_executes_background_plan(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + _seed_admin_dashboard_repo(repo_root) + + service = ManagerAgentService( + repository=InMemoryRepository(), + repo_root=repo_root, + dispatch_runner=_successful_run_summary, + ) + + app.dependency_overrides[get_manager_agent_service] = lambda: service + app.dependency_overrides[get_housekeeper_service] = _night_housekeeper_service + with TestClient(app) as client: + response = client.post( + "/api/v1/agents/manager/dispatch", + json={"prompt": "在 Admin Panel 里加一个带图表的实时服务器资源监控大屏。"}, + ) + assert response.status_code == 202 + payload = response.json() + assert payload["status"] == "queued" + dispatch_id = payload["dispatch_id"] + + get_response = client.get(f"/api/v1/agents/manager/dispatches/{dispatch_id}") + assert get_response.status_code == 200 + current = get_response.json() + + app.dependency_overrides.clear() + + assert current["status"] == "completed" + assert current["run_summary"]["final_status"] == "ready_for_promotion" + assert current["execution_plan"]["strategy"] == "task_dag" + assert len(current["execution_plan"]["tasks"]) == 3 + assert all(task["status"] == "completed" for task in current["execution_plan"]["tasks"]) + assert current["execution_plan"]["tasks"][0]["run_summary"]["final_status"] == 
"ready_for_promotion" + assert current["execution_plan"]["tasks"][2]["metadata"]["manager_stage"] == "frontend" diff --git a/tests/test_media_jobs.py b/tests/test_media_jobs.py new file mode 100644 index 00000000..c354e767 --- /dev/null +++ b/tests/test_media_jobs.py @@ -0,0 +1,122 @@ +from __future__ import annotations + +from pathlib import Path +import subprocess + +import pytest + +from autoresearch.core.services.media_jobs import MediaJobService +from autoresearch.shared.media_job_contract import MediaJobMode, MediaJobRequest, MediaTargetBucket +from autoresearch.shared.store import InMemoryRepository + + +class _FakeRunner: + def __init__(self) -> None: + self.commands: list[list[str]] = [] + + def __call__(self, command: list[str]) -> subprocess.CompletedProcess[str]: + self.commands.append(command) + if "--dump-single-json" in command: + return subprocess.CompletedProcess( + command, + 0, + stdout='{"title":"Demo","id":"abc123","uploader":"alice","duration":12}', + stderr="", + ) + + output_template = command[command.index("-o") + 1] + output_path = ( + output_template.replace("%(title)s", "Demo") + .replace("%(id)s", "abc123") + .replace("%(uploader)s", "alice") + .replace("%(upload_date)s", "20260331") + .replace("%(ext)s", "mp3" if "-x" in command else "mp4") + ) + path = Path(output_path) + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("media", encoding="utf-8") + return subprocess.CompletedProcess(command, 0, stdout="", stderr="") + + +def _service(tmp_path: Path, runner: _FakeRunner | None = None) -> MediaJobService: + return MediaJobService( + repository=InMemoryRepository(), + event_repository=InMemoryRepository(), + media_root=tmp_path / "media", + allowed_domains={"youtube.com", "youtu.be", "bilibili.com"}, + command_runner=runner or _FakeRunner(), + ) + + +def test_media_job_service_parses_explicit_and_bare_urls(tmp_path: Path) -> None: + service = _service(tmp_path) + + explicit = service.parse_telegram_task("audio 
https://youtu.be/demo") + assert explicit is not None + assert explicit.mode is MediaJobMode.AUDIO + assert explicit.target_bucket is MediaTargetBucket.AUDIO + + bare = service.parse_telegram_task("https://www.youtube.com/watch?v=demo") + assert bare is not None + assert bare.mode is MediaJobMode.VIDEO + + assert service.parse_telegram_task("https://example.com/article") is None + assert service.parse_telegram_task("audio https://example.com/file.mp3") is None + + +def test_media_job_service_executes_with_whitelisted_template_and_writes_metadata(tmp_path: Path) -> None: + runner = _FakeRunner() + service = _service(tmp_path, runner=runner) + job = service.create( + MediaJobRequest( + url="https://youtu.be/demo", + mode=MediaJobMode.AUDIO, + target_bucket=MediaTargetBucket.AUDIO, + filename_template="{title}-{id}", + ) + ) + + completed = service.execute(job.job_id) + + assert completed.status.value == "completed" + assert completed.title == "Demo" + assert completed.uploader == "alice" + assert completed.metadata_path is not None + assert Path(completed.metadata_path).exists() + assert completed.output_files + assert Path(completed.output_files[0]).exists() + assert Path(completed.output_files[0]).parent.name == job.job_id + assert len(runner.commands) == 2 + + +def test_media_job_service_only_returns_files_for_current_job(tmp_path: Path) -> None: + runner = _FakeRunner() + service = _service(tmp_path, runner=runner) + bucket_dir = tmp_path / "media" / "audio" + bucket_dir.mkdir(parents=True, exist_ok=True) + (bucket_dir / "stale.mp3").write_text("stale", encoding="utf-8") + + job = service.create( + MediaJobRequest( + url="https://youtu.be/demo", + mode=MediaJobMode.AUDIO, + target_bucket=MediaTargetBucket.AUDIO, + filename_template="{title}-{id}", + ) + ) + + completed = service.execute(job.job_id) + + assert completed.status.value == "completed" + assert all("stale.mp3" not in path for path in completed.output_files) + assert all(f"/audio/{job.job_id}/" in path 
for path in completed.output_files) + + +def test_media_job_request_rejects_unapproved_template_tokens() -> None: + with pytest.raises(ValueError): + MediaJobRequest( + url="https://youtu.be/demo", + mode=MediaJobMode.VIDEO, + target_bucket=MediaTargetBucket.VIDEO, + filename_template="{title}-{badtoken}", + ) diff --git a/tests/test_mock_adapter.py b/tests/test_mock_adapter.py new file mode 100644 index 00000000..b1a40ccc --- /dev/null +++ b/tests/test_mock_adapter.py @@ -0,0 +1,108 @@ +from __future__ import annotations + +import json +import os +from pathlib import Path +import subprocess +import sys + + +REPO_ROOT = Path(__file__).resolve().parents[1] +MOCK_ADAPTER = REPO_ROOT / "drivers" / "mock_adapter.sh" + + +def _run_mock_adapter(tmp_path: Path, *, allowed_paths: list[str], validator_command: str) -> tuple[dict[str, object], Path]: + workspace = tmp_path / "workspace" + workspace.mkdir(parents=True, exist_ok=True) + job_path = tmp_path / "job.json" + result_path = tmp_path / "driver_result.json" + job_path.write_text( + json.dumps( + { + "run_id": "run-mock", + "agent_id": "mock", + "task": "Create a bounded patch candidate.", + "policy": {"allowed_paths": allowed_paths}, + "validators": [ + { + "id": "worker.test_command", + "kind": "command", + "command": validator_command, + } + ], + }, + ensure_ascii=False, + indent=2, + ), + encoding="utf-8", + ) + + completed = subprocess.run( + [str(MOCK_ADAPTER)], + env={ + **os.environ, + "AEP_WORKSPACE": str(workspace), + "AEP_JOB_SPEC": str(job_path), + "AEP_RESULT_PATH": str(result_path), + "PY_BIN": sys.executable, + }, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0, completed.stderr + return json.loads(result_path.read_text(encoding="utf-8")), workspace + + +def test_mock_adapter_prefers_validator_targeted_test_file(tmp_path: Path) -> None: + payload, workspace = _run_mock_adapter( + tmp_path, + allowed_paths=["scripts/check_prompt_hygiene.py", 
"tests/test_check_prompt_hygiene.py"], + validator_command=f"{sys.executable} -m pytest -q tests/test_check_prompt_hygiene.py", + ) + + target = workspace / "tests" / "test_check_prompt_hygiene.py" + assert payload["changed_paths"] == ["tests/test_check_prompt_hygiene.py"] + assert "def test_mock_autoresearch_candidate" in target.read_text(encoding="utf-8") + + +def test_mock_adapter_falls_back_to_first_allowed_source_file(tmp_path: Path) -> None: + payload, workspace = _run_mock_adapter( + tmp_path, + allowed_paths=["scripts/check_prompt_hygiene.py"], + validator_command=f"{sys.executable} -m py_compile scripts/check_prompt_hygiene.py", + ) + + target = workspace / "scripts" / "check_prompt_hygiene.py" + assert payload["changed_paths"] == ["scripts/check_prompt_hygiene.py"] + assert "def run()" in target.read_text(encoding="utf-8") + + +def test_mock_adapter_uses_validator_target_inside_globbed_apps_scope(tmp_path: Path) -> None: + payload, workspace = _run_mock_adapter( + tmp_path, + allowed_paths=["apps/malu/**"], + validator_command=f"{sys.executable} -m py_compile apps/malu/lead_capture.py", + ) + + target = workspace / "apps" / "malu" / "lead_capture.py" + assert payload["changed_paths"] == ["apps/malu/lead_capture.py"] + assert "class PhoneValidator" in target.read_text(encoding="utf-8") + + +def test_mock_adapter_builds_source_and_test_for_pytest_validated_business_surface(tmp_path: Path) -> None: + payload, workspace = _run_mock_adapter( + tmp_path, + allowed_paths=["apps/malu/**", "tests/apps/test_malu_landing_page.py"], + validator_command="pytest -q tests/apps/test_malu_landing_page.py", + ) + + source_target = workspace / "apps" / "malu" / "lead_capture.py" + test_target = workspace / "tests" / "apps" / "test_malu_landing_page.py" + assert payload["changed_paths"] == [ + "apps/malu/lead_capture.py", + "tests/apps/test_malu_landing_page.py", + ] + assert "class PhoneValidator" in source_target.read_text(encoding="utf-8") + assert "from 
apps.malu.lead_capture import PhoneValidator, capture_lead" in test_target.read_text(encoding="utf-8") diff --git a/tests/test_openhands_controlled_backend.py b/tests/test_openhands_controlled_backend.py index 2047080f..80e50ba1 100644 --- a/tests/test_openhands_controlled_backend.py +++ b/tests/test_openhands_controlled_backend.py @@ -80,6 +80,7 @@ def test_scope_violation_is_policy_blocked_and_never_promoted( _create_min_repo(repo_root) service = OpenHandsControlledBackendService(repo_root=repo_root, run_root=run_root) + monkeypatch.setattr(service, "_prepare_strict_workspace", lambda **_: None) def _bad_backend(*, prompt: str, workspace: Path, log_file: Path, allowed_paths: list[str]): _ = prompt, allowed_paths @@ -210,3 +211,86 @@ def test_openhands_cli_can_fallback_to_mock_patch(tmp_path: Path) -> None: assert result.iterations_used == 1 assert result.patch_result is not None assert result.patch_result.changed_files == ["src/openhands_demo_task.py"] + + +def test_strict_workspace_uses_overlay_for_allowed_file(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + run_root = tmp_path / "runs" + repo_root.mkdir() + _create_min_repo(repo_root) + blocked_file = repo_root / "src" / "models.py" + blocked_file.write_text("VALUE = 1\n", encoding="utf-8") + allowed_file = repo_root / "src" / "landing_pages.py" + allowed_file.write_text("VALUE = 1\n", encoding="utf-8") + + service = OpenHandsControlledBackendService(repo_root=repo_root, run_root=run_root) + + def _assert_permissions(*, prompt: str, workspace: Path, log_file: Path, allowed_paths: list[str]): + _ = prompt, allowed_paths + writable_target = workspace / "src" / "landing_pages.py" + blocked_target = workspace / "src" / "models.py" + assert writable_target.is_symlink() + assert blocked_target.is_symlink() is False + writable_target.write_text("VALUE = 2\n", encoding="utf-8") + with pytest.raises(PermissionError): + blocked_target.write_text("VALUE = 2\n", encoding="utf-8") + service._append_log(log_file, 
"[mock-backend] checked strict workspace\n") + return _BackendExecutionOutcome(exit_code=0, stdout="strict workspace ok\n") + + service._run_mock_backend = _assert_permissions # type: ignore[method-assign] + + request = ControlledExecutionRequest( + task_id="demo-strict-view", + prompt="Update landing page helper", + allowed_paths=["src/landing_pages.py"], + test_command=[sys.executable, "-m", "py_compile", "src/landing_pages.py"], + backend=ControlledBackend.MOCK, + ) + + result = service.run(request) + + assert result.status is ControlledRunStatus.READY_FOR_PROMOTION + assert result.changed_files == ["src/landing_pages.py"] + assert "src/models.py" not in result.changed_files + + +def test_fail_fast_probe_aborts_retry_loop_on_module_error( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + repo_root = tmp_path / "repo" + run_root = tmp_path / "runs" + repo_root.mkdir() + _create_min_repo(repo_root) + + service = OpenHandsControlledBackendService(repo_root=repo_root, run_root=run_root) + attempts = {"count": 0} + + def _broken_backend(*, prompt: str, workspace: Path, log_file: Path, allowed_paths: list[str]): + _ = prompt, log_file, allowed_paths + attempts["count"] += 1 + target = workspace / "src" / "broken.py" + target.write_text("import missing_module\n", encoding="utf-8") + return _BackendExecutionOutcome( + exit_code=1, + stderr="ModuleNotFoundError: No module named 'missing_module'\n", + ) + + monkeypatch.setattr(service, "_run_mock_backend", _broken_backend) + + request = ControlledExecutionRequest( + task_id="demo-fail-fast", + prompt="Write python helper", + allowed_paths=["src/broken.py"], + test_command=[sys.executable, "-c", "import sys; sys.exit(7)"], + backend=ControlledBackend.MOCK, + max_iterations=3, + keep_workspace_on_failure=False, + ) + + result = service.run(request) + + assert attempts["count"] == 1 + assert result.status is ControlledRunStatus.FAILED + assert result.validation_status is ValidationStatus.FAILED + assert 
"fail-fast probe" in (result.error or "") diff --git a/tests/test_openhands_launcher.py b/tests/test_openhands_launcher.py index bb6f0914..59279e72 100644 --- a/tests/test_openhands_launcher.py +++ b/tests/test_openhands_launcher.py @@ -32,6 +32,12 @@ def test_openhands_start_dry_run_prints_ai_lab_command() -> None: assert "launch_ai_lab.sh" in completed.stdout assert "/opt/workspace" in completed.stdout assert "EXTRA_VOLUME=" in completed.stdout + assert "--exp" in completed.stdout + assert "--headless" in completed.stdout + assert ' -t ' in completed.stdout + assert "runuser -u " in completed.stdout + assert "nobody" in completed.stdout + assert "/tmp/openhands-home/.openhands/agent_settings.json" in completed.stdout def test_openhands_start_defaults_audit_path_to_workspace_for_ai_lab(tmp_path: Path) -> None: @@ -58,3 +64,119 @@ def test_openhands_start_defaults_audit_path_to_workspace_for_ai_lab(tmp_path: P assert completed.returncode == 0 assert "/opt/workspace/.openhands-audit" in completed.stdout + assert "OPENHANDS_PERSISTENCE_DIR=/tmp/openhands-home/state/" in completed.stdout + + +def test_openhands_start_legacy_template_can_be_restored_explicitly(tmp_path: Path) -> None: + repo_root = Path(__file__).resolve().parents[1] + workspace = tmp_path / "worktree" + workspace.mkdir() + env = os.environ.copy() + env.update( + { + "OPENHANDS_DRY_RUN": "1", + "OPENHANDS_CMD": "openhands", + "OPENHANDS_HEADLESS": "0", + "OPENHANDS_WORKSPACE": str(workspace), + } + ) + + completed = subprocess.run( + ["bash", str(repo_root / "scripts" / "openhands_start.sh"), "Touch README.md."], + cwd=repo_root, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert "--headless" not in completed.stdout + assert "OPENHANDS_HEADLESS=0" in completed.stdout + assert 'OPENHANDS_CMD_TEMPLATE="${OPENHANDS_CMD}" "${OPENHANDS_PROMPT}"' in completed.stdout + + +def test_openhands_start_ai_lab_runtime_prefers_container_cli(tmp_path: Path) 
-> None: + repo_root = Path(__file__).resolve().parents[1] + workspace = tmp_path / "worktree" + workspace.mkdir() + fake_bin = tmp_path / "bin" / "openhands" + fake_bin.parent.mkdir() + fake_bin.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + fake_bin.chmod(0o755) + env = os.environ.copy() + env.update( + { + "OPENHANDS_DRY_RUN": "1", + "OPENHANDS_WORKSPACE": str(workspace), + "OPENHANDS_LOCAL_BIN": str(fake_bin), + } + ) + + completed = subprocess.run( + ["bash", str(repo_root / "scripts" / "openhands_start.sh"), "Touch README.md."], + cwd=repo_root, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert f"OPENHANDS_CMD={fake_bin}" not in completed.stdout + assert "OPENHANDS_CMD=openhands" in completed.stdout + + +def test_openhands_start_host_runtime_changes_into_workspace(tmp_path: Path) -> None: + repo_root = Path(__file__).resolve().parents[1] + workspace = tmp_path / "worktree" + workspace.mkdir() + env = os.environ.copy() + env.update( + { + "OPENHANDS_DRY_RUN": "1", + "OPENHANDS_RUNTIME": "host", + "OPENHANDS_CMD": "openhands", + "OPENHANDS_WORKSPACE": str(workspace), + } + ) + + completed = subprocess.run( + ["bash", str(repo_root / "scripts" / "openhands_start.sh"), "Touch README.md."], + cwd=repo_root, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert 'cd "${OPENHANDS_WORKSPACE}"' in completed.stdout + + +def test_openhands_start_can_disable_experimental_mode(tmp_path: Path) -> None: + repo_root = Path(__file__).resolve().parents[1] + workspace = tmp_path / "worktree" + workspace.mkdir() + env = os.environ.copy() + env.update( + { + "OPENHANDS_DRY_RUN": "1", + "OPENHANDS_CMD": "openhands", + "OPENHANDS_EXPERIMENTAL": "0", + "OPENHANDS_WORKSPACE": str(workspace), + } + ) + + completed = subprocess.run( + ["bash", str(repo_root / "scripts" / "openhands_start.sh"), "Touch README.md."], + cwd=repo_root, + env=env, + 
capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0 + assert "--exp" not in completed.stdout + assert "--headless" in completed.stdout diff --git a/tests/test_openhands_worker.py b/tests/test_openhands_worker.py index bd81e25b..fd77e63c 100644 --- a/tests/test_openhands_worker.py +++ b/tests/test_openhands_worker.py @@ -1,5 +1,7 @@ from __future__ import annotations +import sys + import pytest from autoresearch.core.services.openhands_worker import OpenHandsWorkerService @@ -21,14 +23,16 @@ def test_openhands_worker_builds_patch_only_agent_job_spec() -> None: assert job.agent_id == "openhands" assert job.mode == "patch_only" + assert job.policy.timeout_sec == 420 assert job.policy.allowed_paths == ["src/foo.py", "tests/test_foo.py"] - assert job.validators[0].command == "pytest tests/test_foo.py -q" + assert job.validators[0].command == f"{sys.executable} -m pytest tests/test_foo.py -q" assert job.metadata["worker_contract"] == "openhands-worker/v1" assert job.metadata["worker_output_mode"] == "patch" assert job.metadata["pipeline_target"] == "draft_pr" assert "Do not run git add, git commit, git push" in job.task assert "allowed_paths:" in job.task assert "test_command:" in job.task + assert f"{sys.executable} -m pytest tests/test_foo.py -q" in job.task def test_openhands_worker_builds_controlled_request_with_mock_fallback() -> None: @@ -48,7 +52,7 @@ def test_openhands_worker_builds_controlled_request_with_mock_fallback() -> None assert request.failure_strategy is FailureStrategy.FALLBACK assert request.allowed_paths == ["src/autoresearch/core/services/openhands_worker.py"] assert request.test_command == [ - "python", + sys.executable, "-m", "py_compile", "src/autoresearch/core/services/openhands_worker.py", @@ -82,3 +86,33 @@ def test_openhands_worker_can_target_patch_pipeline_explicitly() -> None: assert request.worker_output_mode == "patch" assert request.pipeline_target.value == "patch" + + +def 
test_openhands_worker_builds_controlled_request_with_pytest_via_active_python() -> None: + service = OpenHandsWorkerService() + spec = OpenHandsWorkerJobSpec( + job_id="job-5", + problem_statement="Keep pytest bound to the active interpreter.", + allowed_paths=["tests/test_worker.py"], + test_command="pytest -q tests/test_worker.py", + ) + + request = service.build_controlled_request(spec) + + assert request.test_command == [sys.executable, "-m", "pytest", "-q", "tests/test_worker.py"] + + +def test_openhands_worker_normalizes_python_commands_to_active_interpreter() -> None: + service = OpenHandsWorkerService() + spec = OpenHandsWorkerJobSpec( + job_id="job-6", + problem_statement="Compile a scoped business module with the active interpreter.", + allowed_paths=["apps/malu/lead_capture.py"], + test_command="python -m py_compile apps/malu/lead_capture.py", + ) + + job = service.build_agent_job_spec(spec) + request = service.build_controlled_request(spec) + + assert job.validators[0].command == f"{sys.executable} -m py_compile apps/malu/lead_capture.py" + assert request.test_command == [sys.executable, "-m", "py_compile", "apps/malu/lead_capture.py"] diff --git a/tests/test_openhands_worker_strict_chain.py b/tests/test_openhands_worker_strict_chain.py index 12a8f61b..e09b8f4a 100644 --- a/tests/test_openhands_worker_strict_chain.py +++ b/tests/test_openhands_worker_strict_chain.py @@ -1,9 +1,13 @@ from __future__ import annotations import json +import os from pathlib import Path import shutil import sys +import time + +import pytest from autoresearch.agent_protocol.models import ExecutionPolicy, JobSpec, ValidatorSpec from autoresearch.core.services.git_promotion_gate import GitPromotionGateService @@ -100,6 +104,13 @@ def _copy_worker_scripts(repo_root: Path) -> None: target.chmod(0o755) +def _write_adapter(repo_root: Path, relative: str, source: str) -> None: + target = repo_root / relative + target.parent.mkdir(parents=True, exist_ok=True) + 
target.write_text(source, encoding="utf-8") + target.chmod(0o755) + + def test_openhands_dry_run_emits_patch_candidate_and_reaches_draft_pr( tmp_path: Path, monkeypatch, @@ -163,3 +174,680 @@ def test_openhands_dry_run_emits_patch_candidate_and_reaches_draft_pr( patch_text = Path(summary.promotion_patch_uri or "").read_text(encoding="utf-8") assert "src/generated_worker.py" in patch_text + + +def test_runner_shadow_workspace_blocks_out_of_scope_write_with_permission_error( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "allowed.py").write_text("VALUE = 1\n", encoding="utf-8") + (repo_root / "src" / "forbidden.py").write_text("SECRET = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/shadow_probe.py", + """#!/usr/bin/env python3 +import json +import os +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +result_path = Path(os.environ["AEP_RESULT_PATH"]) +allowed = workspace / "src" / "allowed.py" +forbidden = workspace / "src" / "forbidden.py" + +error = "missing denial" +try: + forbidden.write_text("SECRET = 2\\n", encoding="utf-8") +except Exception as exc: # pragma: no cover - exercised via runner integration + error = f"{type(exc).__name__}: {exc}" + +allowed.write_text("VALUE = 2\\n", encoding="utf-8") +payload = { + "protocol_version": "aep/v0", + "run_id": "run-shadow-probe", + "agent_id": "openhands", + "attempt": 1, + "status": "succeeded", + "summary": error, + "changed_paths": ["src/allowed.py"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 0, "commands": 0, "prompt_tokens": None, "completion_tokens": None}, + "recommended_action": "promote", + "error": None, +} +result_path.write_text(json.dumps(payload), encoding="utf-8") +""", + ) + _write_manifest(repo_root, "drivers/shadow_probe.py") + + runner = 
AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + summary = runner.run_job( + JobSpec( + run_id="run-shadow-probe", + agent_id="openhands", + task="Only update src/allowed.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/allowed.py", + ) + ], + policy=ExecutionPolicy( + allowed_paths=["src/allowed.py"], + forbidden_paths=["src/forbidden.py", ".git/**", "logs/**", ".masfactory_runtime/**", "memory/**"], + cleanup_on_success=False, + ), + metadata={"pipeline_target": "patch"}, + ) + ) + + assert summary.driver_result.status == "succeeded" + assert "PermissionError" in summary.driver_result.summary + assert (repo_root / "src" / "forbidden.py").read_text(encoding="utf-8") == "SECRET = 1\n" + assert "src/forbidden.py" not in summary.driver_result.changed_paths + + repeated = runner.run_job( + JobSpec( + run_id="run-shadow-probe", + agent_id="openhands", + task="Only update src/allowed.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/allowed.py", + ) + ], + policy=ExecutionPolicy( + allowed_paths=["src/allowed.py"], + forbidden_paths=["src/forbidden.py", ".git/**", "logs/**", ".masfactory_runtime/**", "memory/**"], + cleanup_on_success=False, + ), + metadata={"pipeline_target": "patch"}, + ) + ) + + assert repeated.driver_result.status == "succeeded" + + +def test_runner_shadow_workspace_allows_creating_new_scoped_app_directory_without_unlocking_repo( + tmp_path: Path, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "forbidden.py").write_text("SECRET = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/new_surface_probe.py", + 
"""#!/usr/bin/env python3 +import json +import os +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +result_path = Path(os.environ["AEP_RESULT_PATH"]) +allowed = workspace / "apps" / "malu" / "lead_capture.py" +forbidden = workspace / "src" / "forbidden.py" + +allowed.parent.mkdir(parents=True, exist_ok=True) +allowed.write_text("PHONE_PATTERN = r'^1[3-9]\\\\d{9}$'\\n", encoding="utf-8") + +error = "missing denial" +try: + forbidden.write_text("SECRET = 2\\n", encoding="utf-8") +except Exception as exc: # pragma: no cover - exercised via runner integration + error = f"{type(exc).__name__}: {exc}" + +payload = { + "protocol_version": "aep/v0", + "run_id": "run-new-surface-probe", + "agent_id": "openhands", + "attempt": 1, + "status": "succeeded", + "summary": error, + "changed_paths": ["apps/malu/lead_capture.py"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 0, "commands": 0, "prompt_tokens": None, "completion_tokens": None}, + "recommended_action": "promote", + "error": None, +} +result_path.write_text(json.dumps(payload), encoding="utf-8") +""", + ) + _write_manifest(repo_root, "drivers/new_surface_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + summary = runner.run_job( + JobSpec( + run_id="run-new-surface-probe", + agent_id="openhands", + task="Create apps/malu/lead_capture.py without touching src/forbidden.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile apps/malu/lead_capture.py", + ) + ], + policy=ExecutionPolicy( + allowed_paths=["apps/malu/**"], + forbidden_paths=["src/forbidden.py", ".git/**", "logs/**", ".masfactory_runtime/**", "memory/**"], + cleanup_on_success=False, + ), + metadata={"pipeline_target": "patch"}, + ) + ) + + assert summary.driver_result.status == "succeeded" + assert 
summary.driver_result.changed_paths == ["apps/malu/lead_capture.py"] + assert "PermissionError" in summary.driver_result.summary + assert (repo_root / "src" / "forbidden.py").read_text(encoding="utf-8") == "SECRET = 1\n" + patch_text = Path(summary.promotion_patch_uri or "").read_text(encoding="utf-8") + assert "apps/malu/lead_capture.py" in patch_text + assert "PHONE_PATTERN = r'^1[3-9]\\d{9}$'" in patch_text + + +def test_runner_fast_fail_aborts_long_running_syntax_breakage(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "broken_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/slow_broken_probe.py", + """#!/usr/bin/env python3 +import os +import time +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +target = workspace / "src" / "broken_worker.py" +target.write_text("def broken(:\\n", encoding="utf-8") +time.sleep(30) +""", + ) + _write_manifest(repo_root, "drivers/slow_broken_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + started = time.perf_counter() + summary = runner.run_job( + JobSpec( + run_id="run-fast-fail-probe", + agent_id="openhands", + task="Update src/broken_worker.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/broken_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/broken_worker.py"], + cleanup_on_success=False, + ), + ) + ) + duration = time.perf_counter() - started + + assert duration < 15 + assert summary.final_status == "failed" + assert summary.driver_result.status == "failed" + assert summary.driver_result.summary == "adapter aborted by fast-fail probe" + 
assert "SyntaxError" in (summary.driver_result.error or "") + + +def test_runner_stall_watchdog_aborts_no_progress_adapter( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "idle_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/idle_probe.py", + """#!/usr/bin/env python3 +import time + +time.sleep(30) +""", + ) + _write_manifest(repo_root, "drivers/idle_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + started = time.perf_counter() + summary = runner.run_job( + JobSpec( + run_id="run-stall-probe", + agent_id="openhands", + task="Wait forever without writing any files.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/idle_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/idle_worker.py"], + cleanup_on_success=False, + ), + ) + ) + duration = time.perf_counter() - started + + assert duration < 10 + assert summary.final_status == "failed" + assert summary.driver_result.status == "stalled_no_progress" + assert summary.driver_result.summary == "adapter stalled after 2s without workspace progress" + assert summary.driver_result.error == "no workspace progress for 2s" + assert summary.driver_result.metrics.first_scoped_write_ms is None + + +def test_runner_records_first_progress_metrics_for_state_and_scoped_write(tmp_path: Path) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + 
(repo_root / "src" / "active_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/progress_probe.py", + """#!/usr/bin/env python3 +import json +import os +import time +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +result_path = Path(os.environ["AEP_RESULT_PATH"]) +state_dir = workspace / ".openhands-state" +target = workspace / "src" / "active_worker.py" + +time.sleep(1) +state_dir.mkdir(parents=True, exist_ok=True) +(state_dir / "heartbeat.json").write_text("{\\"ok\\": true}\\n", encoding="utf-8") +time.sleep(1.5) +target.write_text("VALUE = 2\\n", encoding="utf-8") +time.sleep(2.5) +payload = { + "protocol_version": "aep/v0", + "run_id": "run-progress-probe", + "agent_id": "openhands", + "attempt": 1, + "status": "succeeded", + "summary": "progress recorded", + "changed_paths": ["src/active_worker.py"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 1, "commands": 1, "prompt_tokens": None, "completion_tokens": None}, + "recommended_action": "promote", + "error": None, +} +result_path.write_text(json.dumps(payload), encoding="utf-8") +""", + ) + _write_manifest(repo_root, "drivers/progress_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + + summary = runner.run_job( + JobSpec( + run_id="run-progress-probe", + agent_id="openhands", + task="Touch .openhands-state first, then update src/active_worker.py.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/active_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/active_worker.py"], + cleanup_on_success=False, + ), + ) + ) + + assert summary.final_status == "blocked" + assert summary.driver_result.status == "policy_blocked" + assert summary.driver_result.metrics.first_progress_ms is not None + assert 
summary.driver_result.metrics.first_scoped_write_ms is not None + assert summary.driver_result.metrics.first_state_heartbeat_ms is not None + assert summary.driver_result.metrics.first_progress_ms <= summary.driver_result.metrics.first_state_heartbeat_ms + assert ( + summary.driver_result.metrics.first_state_heartbeat_ms + <= summary.driver_result.metrics.first_scoped_write_ms + ) + + +def test_runner_does_not_treat_stdout_as_runtime_heartbeat( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "chatty_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/output_heartbeat_probe.py", + """#!/usr/bin/env python3 +import json +import os +import sys +import time +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +result_path = Path(os.environ["AEP_RESULT_PATH"]) +target = workspace / "src" / "chatty_worker.py" + +for step in range(4): + print(f"heartbeat {step}", flush=True) + time.sleep(1) + +target.write_text("VALUE = 2\\n", encoding="utf-8") +time.sleep(2.5) +payload = { + "protocol_version": "aep/v0", + "run_id": "run-output-heartbeat-probe", + "agent_id": "openhands", + "attempt": 1, + "status": "succeeded", + "summary": "stdout heartbeat should not keep adapter alive", + "changed_paths": ["src/chatty_worker.py"], + "output_artifacts": [], + "metrics": {"duration_ms": 0, "steps": 1, "commands": 1, "prompt_tokens": None, "completion_tokens": None}, + "recommended_action": "promote", + "error": None, +} +result_path.write_text(json.dumps(payload), encoding="utf-8") +""", + ) + _write_manifest(repo_root, "drivers/output_heartbeat_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + 
monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + summary = runner.run_job( + JobSpec( + run_id="run-output-heartbeat-probe", + agent_id="openhands", + task="Emit stdout heartbeats before touching the workspace.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/chatty_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/chatty_worker.py"], + cleanup_on_success=False, + ), + ) + ) + + assert summary.final_status == "failed" + assert summary.driver_result.status == "stalled_no_progress" + assert summary.driver_result.metrics.first_state_heartbeat_ms is None + assert summary.driver_result.metrics.first_scoped_write_ms is None + + +def test_runner_ignores_agent_is_working_spinner_noise( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "chatty_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/agent_working_noise_probe.py", + """#!/usr/bin/env python3 +import time + +for _ in range(10): + print("Agent is working", flush=True) + time.sleep(0.5) +""", + ) + _write_manifest(repo_root, "drivers/agent_working_noise_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + started = time.perf_counter() + summary = runner.run_job( + JobSpec( + run_id="run-agent-working-noise-probe", + agent_id="openhands", + task="Emit only spinner-like Agent is working noise forever.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile 
src/chatty_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/chatty_worker.py"], + cleanup_on_success=False, + ), + ) + ) + duration = time.perf_counter() - started + + assert duration < 8 + assert summary.final_status == "failed" + assert summary.driver_result.status == "stalled_no_progress" + assert summary.driver_result.metrics.first_state_heartbeat_ms is None + assert summary.driver_result.metrics.first_scoped_write_ms is None + + +def test_runner_kills_process_group_and_persists_summary_for_invalid_log_hang( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "chatty_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/process_group_noise_probe.py", + """#!/usr/bin/env python3 +import os +import subprocess +import sys +import time +from pathlib import Path + +artifacts_dir = Path(os.environ["AEP_ARTIFACT_DIR"]) +pid_file = artifacts_dir / "child.pid" + +child = subprocess.Popen( + [ + sys.executable, + "-c", + "import time\\nwhile True:\\n print('Agent is working', flush=True)\\n time.sleep(0.5)\\n", + ] +) +pid_file.write_text(str(child.pid), encoding="utf-8") + +while True: + print("Agent is working", flush=True) + time.sleep(0.5) +""", + ) + _write_manifest(repo_root, "drivers/process_group_noise_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / "configs" / "agents", + ) + monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + summary = runner.run_job( + JobSpec( + run_id="run-process-group-noise-probe", + agent_id="openhands", + task="Spawn a child process that only emits invalid progress noise forever.", + validators=[ + ValidatorSpec( + id="worker.test_command", + 
kind="command", + command=f"{sys.executable} -m py_compile src/chatty_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/chatty_worker.py"], + cleanup_on_success=False, + ), + ) + ) + + run_dir = tmp_path / "runtime" / "run-process-group-noise-probe" + summary_path = run_dir / "summary.json" + pid_path = run_dir / "artifacts" / "child.pid" + + assert summary.final_status == "failed" + assert summary.driver_result.status == "stalled_no_progress" + assert summary_path.exists() + assert pid_path.exists() + + child_pid = int(pid_path.read_text(encoding="utf-8").strip()) + time.sleep(0.2) + with pytest.raises(ProcessLookupError): + os.kill(child_pid, 0) + + +def test_runner_ignores_log_heartbeat_after_first_scoped_write( + tmp_path: Path, + monkeypatch, +) -> None: + repo_root = tmp_path / "repo" + repo_root.mkdir() + (repo_root / "src").mkdir(parents=True, exist_ok=True) + (repo_root / "src" / "__init__.py").write_text("", encoding="utf-8") + (repo_root / "src" / "chatty_worker.py").write_text("VALUE = 1\n", encoding="utf-8") + _write_adapter( + repo_root, + "drivers/output_then_spin_probe.py", + """#!/usr/bin/env python3 +import os +import shutil +import time +from pathlib import Path + +workspace = Path(os.environ["AEP_WORKSPACE"]) +target = workspace / "src" / "chatty_worker.py" +state_dir = workspace / ".openhands-state" + +for step in range(3): + state_dir.mkdir(parents=True, exist_ok=True) + (state_dir / "heartbeat.json").write_text(f"{{\\"step\\": {step}}}\\n", encoding="utf-8") + print(f"warmup heartbeat {step}", flush=True) + time.sleep(1) + +target.write_text("VALUE = 2\\n", encoding="utf-8") +shutil.rmtree(state_dir) + +step = 0 +while True: + print(f"post-write heartbeat {step}", flush=True) + step += 1 + time.sleep(1) +""", + ) + _write_manifest(repo_root, "drivers/output_then_spin_probe.py") + + runner = AgentExecutionRunner( + repo_root=repo_root, + runtime_root=tmp_path / "runtime", + manifests_dir=repo_root / 
"configs" / "agents", + ) + monkeypatch.setattr(runner, "_stall_progress_timeout_sec", lambda timeout_sec: 2) + + started = time.perf_counter() + summary = runner.run_job( + JobSpec( + run_id="run-output-then-spin-probe", + agent_id="openhands", + task="Emit stdout heartbeats, write once, then spin forever without state updates.", + validators=[ + ValidatorSpec( + id="worker.test_command", + kind="command", + command=f"{sys.executable} -m py_compile src/chatty_worker.py", + ) + ], + policy=ExecutionPolicy( + timeout_sec=60, + allowed_paths=["src/chatty_worker.py"], + cleanup_on_success=False, + ), + ) + ) + duration = time.perf_counter() - started + + assert duration < 12 + assert summary.final_status == "failed" + assert summary.driver_result.status == "stalled_no_progress" + assert summary.driver_result.metrics.first_state_heartbeat_ms is not None + assert summary.driver_result.metrics.first_scoped_write_ms is not None diff --git a/tests/test_panel_security.py b/tests/test_panel_security.py index 9fd950f7..81a5eb55 100644 --- a/tests/test_panel_security.py +++ b/tests/test_panel_security.py @@ -11,8 +11,10 @@ from fastapi.testclient import TestClient import pytest +from autoresearch.agent_protocol.models import DriverResult, JobSpec, RunSummary, ValidationReport from autoresearch.api.dependencies import ( get_approval_store_service, + get_autoresearch_planner_service, get_capability_provider_registry, get_claude_agent_service, get_openclaw_compat_service, @@ -24,10 +26,12 @@ from autoresearch.core.adapters import CapabilityProviderDescriptorRead, CapabilityProviderRegistry from autoresearch.core.adapters.contracts import CapabilityDomain from autoresearch.core.services.approval_store import ApprovalStoreService +from autoresearch.core.services.autoresearch_planner import AutoResearchPlannerService from autoresearch.core.services.claude_agents import ClaudeAgentService from autoresearch.core.services.openclaw_compat import OpenClawCompatService from 
autoresearch.core.services.panel_access import PanelAccessService, assert_safe_bind_host from autoresearch.core.services.panel_audit import PanelAuditService +from autoresearch.shared.autoresearch_planner_contract import AutoResearchPlanRead, AutoResearchPlannerRequest from autoresearch.shared.models import ( ApprovalRequestCreateRequest, ApprovalRequestRead, @@ -42,6 +46,7 @@ class StubTelegramNotifier: def __init__(self) -> None: + self.messages: list[dict[str, object]] = [] self.manual_events: list[dict[str, str]] = [] self.status_events: list[dict[str, str]] = [] @@ -49,6 +54,24 @@ def __init__(self) -> None: def enabled(self) -> bool: return True + def send_message( + self, + *, + chat_id: str, + text: str, + disable_web_page_preview: bool = True, + reply_markup: dict[str, object] | None = None, + ) -> bool: + self.messages.append( + { + "chat_id": chat_id, + "text": text, + "disable_web_page_preview": disable_web_page_preview, + "reply_markup": reply_markup, + } + ) + return True + def notify_manual_action(self, *, chat_id: str, entry: PanelAuditLogRead, run_status: str) -> bool: self.manual_events.append( { @@ -97,9 +120,32 @@ def describe(self) -> CapabilityProviderDescriptorRead: return self._descriptor +def _write(path: Path, content: str) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + +def _successful_run_summary(job: JobSpec) -> RunSummary: + return RunSummary( + run_id=job.run_id, + final_status="ready_for_promotion", + driver_result=DriverResult( + run_id=job.run_id, + agent_id=job.agent_id, + status="succeeded", + summary="panel dispatch completed", + changed_paths=list(job.policy.allowed_paths), + recommended_action="promote", + ), + validation=ValidationReport(run_id=job.run_id, passed=True), + promotion_patch_uri="/tmp/panel-dispatch.patch", + ) + + @pytest.fixture def panel_client(tmp_path: Path) -> TestClient: db_path = tmp_path / "panel-security.sqlite3" + planner_repo_root = tmp_path 
/ "planner-repo" openclaw_service = OpenClawCompatService( repository=SQLiteModelRepository( db_path=db_path, @@ -141,6 +187,15 @@ def panel_client(tmp_path: Path) -> TestClient: capability_registry = CapabilityProviderRegistry() capability_registry.register(_StubCapabilityProvider("apple-calendar", CapabilityDomain.CALENDAR, "Apple Calendar")) capability_registry.register(_StubCapabilityProvider("openclaw-skills", CapabilityDomain.SKILL, "OpenClaw Skills")) + planner_service = AutoResearchPlannerService( + repository=SQLiteModelRepository( + db_path=db_path, + table_name="autoresearch_plans_panel_it", + model_cls=AutoResearchPlanRead, + ), + repo_root=planner_repo_root, + dispatch_runner=_successful_run_summary, + ) notifier = StubTelegramNotifier() app.dependency_overrides[get_openclaw_compat_service] = lambda: openclaw_service @@ -148,6 +203,7 @@ def panel_client(tmp_path: Path) -> TestClient: app.dependency_overrides[get_panel_access_service] = lambda: panel_access app.dependency_overrides[get_panel_audit_service] = lambda: panel_audit app.dependency_overrides[get_approval_store_service] = lambda: approval_service + app.dependency_overrides[get_autoresearch_planner_service] = lambda: planner_service app.dependency_overrides[get_capability_provider_registry] = lambda: capability_registry app.dependency_overrides[get_telegram_notifier_service] = lambda: notifier @@ -156,6 +212,8 @@ def panel_client(tmp_path: Path) -> TestClient: setattr(client, "_claude", claude_service) setattr(client, "_panel_access", panel_access) setattr(client, "_approval_store", approval_service) + setattr(client, "_planner", planner_service) + setattr(client, "_planner_repo_root", planner_repo_root) setattr(client, "_capability_registry", capability_registry) setattr(client, "_notifier", notifier) yield client @@ -185,6 +243,8 @@ def test_panel_view_contains_capability_section(panel_client: TestClient) -> Non assert "capability_providers" in response.text assert "待审批" in response.text 
assert "pending_approvals" in response.text + assert "AutoResearch Plans" in response.text + assert "pending_autoresearch_plans" in response.text def test_panel_state_is_scoped_by_telegram_uid(panel_client: TestClient) -> None: @@ -409,6 +469,63 @@ def test_panel_rejects_tampered_telegram_init_data(panel_client: TestClient) -> assert response.status_code == 401 +def test_panel_lists_and_dispatches_autoresearch_plans(panel_client: TestClient) -> None: + panel_access = getattr(panel_client, "_panel_access") + planner = getattr(panel_client, "_planner") + planner_repo_root = getattr(panel_client, "_planner_repo_root") + notifier = getattr(panel_client, "_notifier") + + _write( + planner_repo_root / "src" / "autoresearch" / "core" / "services" / "panel_target.py", + "\n".join( + [ + "def panel_target() -> bool:", + " # FIXME: add regression coverage for panel dispatch", + " return True", + "", + ] + ), + ) + plan = planner.create(AutoResearchPlannerRequest(telegram_uid="9527")) + + token = _token_from_magic_link(panel_access.create_magic_link("9527").url) + headers = {"x-autoresearch-panel-token": token} + + state = panel_client.get("/api/v1/panel/state", headers=headers) + assert state.status_code == 200 + pending_plans = state.json()["pending_autoresearch_plans"] + assert len(pending_plans) == 1 + assert pending_plans[0]["plan_id"] == plan.plan_id + assert pending_plans[0]["selected_candidate"]["source_path"] == ( + "src/autoresearch/core/services/panel_target.py" + ) + + dispatch = panel_client.post( + f"/api/v1/panel/autoresearch/plans/{plan.plan_id}/dispatch", + headers=headers, + json={"note": "ship it", "metadata": {"source": "panel-test"}}, + ) + assert dispatch.status_code == 200 + assert dispatch.json()["dispatch_status"] == "dispatching" + + stored = planner.get(plan.plan_id) + assert stored is not None + assert stored.dispatch_status.value == "dispatched" + assert stored.run_summary is not None + assert stored.run_summary.final_status == 
"ready_for_promotion" + + refreshed = panel_client.get("/api/v1/panel/state", headers=headers) + assert refreshed.status_code == 200 + assert refreshed.json()["pending_autoresearch_plans"] == [] + + audit = panel_client.get("/api/v1/panel/audit/logs?limit=20", headers=headers) + assert audit.status_code == 200 + assert any(item["action"] == "dispatch" for item in audit.json()) + + assert any("[AutoResearch Dispatch]" in str(message["text"]) for message in notifier.messages) + assert any(message["chat_id"] == "9527" for message in notifier.messages) + + def _token_from_magic_link(url: str) -> str: parsed = urlparse(url) return parse_qs(parsed.query)["token"][0] diff --git a/tests/test_remote_run_contract.py b/tests/test_remote_run_contract.py new file mode 100644 index 00000000..982d15fd --- /dev/null +++ b/tests/test_remote_run_contract.py @@ -0,0 +1,70 @@ +from __future__ import annotations + +import importlib.util +import json +import sys +from pathlib import Path + +import pytest +from pydantic import ValidationError + +from autoresearch.agent_protocol.models import JobSpec +from autoresearch.shared.remote_run_contract import DispatchLane, RemoteRunRecord, RemoteTaskSpec, RemoteRunSummary + + +def _load_export_module(): + script_path = Path(__file__).resolve().parents[1] / "scripts" / "export_remote_run_schemas.py" + spec = importlib.util.spec_from_file_location("export_remote_run_schemas", script_path) + assert spec is not None + assert spec.loader is not None + module = importlib.util.module_from_spec(spec) + sys.modules[spec.name] = module + spec.loader.exec_module(module) + return module + + +def test_remote_contract_rejects_absolute_artifact_paths() -> None: + with pytest.raises(ValidationError): + RemoteRunRecord( + run_id="run-contract", + artifact_paths={"summary": "/tmp/summary.json"}, + ) + + +def test_remote_contract_rejects_parent_traversal_artifact_paths() -> None: + with pytest.raises(ValidationError): + RemoteRunSummary( + run_id="run-contract", + 
artifact_paths={"summary": "../outside.json"}, + ) + + +def test_remote_task_spec_rejects_invalid_lane() -> None: + with pytest.raises(ValidationError): + RemoteTaskSpec( + run_id="run-contract", + requested_lane=DispatchLane.LOCAL, + lane="bogus", + runtime_mode="day", + job=JobSpec(run_id="run-contract", agent_id="openhands", task="demo"), + ) + + +def test_remote_task_spec_requires_run_id() -> None: + with pytest.raises(ValidationError): + RemoteTaskSpec( + run_id="", + job=JobSpec(run_id="run-contract", agent_id="openhands", task="demo"), + ) + + +def test_exported_remote_schemas_match_model_json_schema(tmp_path: Path) -> None: + module = _load_export_module() + written = module.export_schemas(tmp_path) + + task_schema = json.loads((tmp_path / "task_run.schema.json").read_text(encoding="utf-8")) + summary_schema = json.loads((tmp_path / "run_summary.schema.json").read_text(encoding="utf-8")) + + assert {path.name for path in written} == {"task_run.schema.json", "run_summary.schema.json"} + assert task_schema == RemoteTaskSpec.model_json_schema() + assert summary_schema == RemoteRunSummary.model_json_schema() diff --git a/tests/test_runtime_select_mode.py b/tests/test_runtime_select_mode.py new file mode 100644 index 00000000..fa932875 --- /dev/null +++ b/tests/test_runtime_select_mode.py @@ -0,0 +1,22 @@ +from __future__ import annotations + +from autoresearch.core.runtime.select_mode import load_mode_policy, select_mode +from autoresearch.shared.remote_run_contract import DispatchLane + + +def test_load_runtime_mode_policies() -> None: + day = load_mode_policy(mode_name="day") + night = load_mode_policy(mode_name="night") + + assert day.preferred_lane is DispatchLane.LOCAL + assert day.allow_draft_pr is False + assert night.preferred_lane is DispatchLane.REMOTE + assert night.allow_exploration is True + + +def test_select_mode_falls_back_to_local_when_remote_is_unavailable() -> None: + selected = select_mode(requested_mode="night", remote_available=False) + + 
assert selected.requested_lane is DispatchLane.REMOTE + assert selected.lane is DispatchLane.LOCAL + assert selected.fallback_reason is not None diff --git a/tests/test_sync_openclaw_upstream.py b/tests/test_sync_openclaw_upstream.py new file mode 100644 index 00000000..4ef8ca5b --- /dev/null +++ b/tests/test_sync_openclaw_upstream.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess + + +def _git(repo: Path, *args: str, cwd: Path | None = None) -> str: + env = os.environ.copy() + env.update( + { + "GIT_AUTHOR_NAME": "Codex Tests", + "GIT_AUTHOR_EMAIL": "codex-tests@example.com", + "GIT_COMMITTER_NAME": "Codex Tests", + "GIT_COMMITTER_EMAIL": "codex-tests@example.com", + } + ) + completed = subprocess.run( + ["git", *args], + cwd=str(cwd or repo), + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr or completed.stdout + return completed.stdout.strip() + + +def _commit(repo: Path, rel_path: str, content: str, message: str) -> None: + target = repo / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + _git(repo, "add", rel_path) + _git(repo, "commit", "-m", message) + + +def test_sync_openclaw_upstream_script_cleans_temp_worktree(tmp_path: Path) -> None: + repo_root = Path(__file__).resolve().parents[1] + upstream_repo = tmp_path / "upstream" + _git(tmp_path, "init", "-b", "main", str(upstream_repo), cwd=tmp_path) + _commit(upstream_repo, "README.md", "# upstream\n", "docs: bootstrap repo") + _commit(upstream_repo, "extensions/line/src/channel.ts", "export const ok = true;\n", "fix: line health check") + + workspace_root = tmp_path / "workspace" + env = os.environ.copy() + env.update( + { + "OPENCLAW_UPSTREAM_URL": upstream_repo.resolve().as_uri(), + "OPENCLAW_SYNC_WORKSPACE_ROOT": str(workspace_root), + "OPENCLAW_SYNC_MAX_COMMITS": "2", + } + ) + + completed = subprocess.run( + 
["bash", str(repo_root / "scripts" / "sync_openclaw_upstream.sh")], + cwd=repo_root, + env=env, + capture_output=True, + text=True, + check=False, + ) + + assert completed.returncode == 0, completed.stderr or completed.stdout + assert "[sync] latest commit" in completed.stdout + assert "fix: line health check" in completed.stdout + assert "analysis complete; cleaning" in completed.stdout + assert not list(workspace_root.glob("openclaw-upstream.*")) diff --git a/tests/test_upstream_watcher.py b/tests/test_upstream_watcher.py new file mode 100644 index 00000000..b00efcfa --- /dev/null +++ b/tests/test_upstream_watcher.py @@ -0,0 +1,94 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess + +from autoresearch.core.services.upstream_watcher import UpstreamWatcherService +from autoresearch.shared.autoresearch_planner_contract import UpstreamWatchDecision + + +def _git(repo: Path, *args: str, cwd: Path | None = None) -> str: + env = os.environ.copy() + env.update( + { + "GIT_AUTHOR_NAME": "Codex Tests", + "GIT_AUTHOR_EMAIL": "codex-tests@example.com", + "GIT_COMMITTER_NAME": "Codex Tests", + "GIT_COMMITTER_EMAIL": "codex-tests@example.com", + } + ) + completed = subprocess.run( + ["git", *args], + cwd=str(cwd or repo), + env=env, + capture_output=True, + text=True, + check=False, + ) + assert completed.returncode == 0, completed.stderr or completed.stdout + return completed.stdout.strip() + + +def _commit(repo: Path, rel_path: str, content: str, message: str) -> None: + target = repo / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + _git(repo, "add", rel_path) + _git(repo, "commit", "-m", message) + + +def _init_upstream_repo(tmp_path: Path) -> Path: + repo = tmp_path / "upstream" + _git(tmp_path, "init", "-b", "main", str(repo), cwd=tmp_path) + _commit(repo, "README.md", "# upstream\n", "docs: bootstrap repo") + return repo + + +def 
test_upstream_watcher_skips_channel_only_updates_and_cleans_workspace(tmp_path: Path) -> None: + upstream_repo = _init_upstream_repo(tmp_path) + _commit(upstream_repo, "extensions/line/src/channel.ts", "export const ok = true;\n", "fix: line health check") + _commit( + upstream_repo, + "test/helpers/extensions/zalo-lifecycle.ts", + "export const helper = true;\n", + "test: harden zalo lifecycle", + ) + _commit(upstream_repo, "CHANGELOG.md", "- line and zalo hardening\n", "docs: changelog refresh") + + workspace_root = tmp_path / "workspace" + service = UpstreamWatcherService( + upstream_url=upstream_repo.resolve().as_uri(), + workspace_root=workspace_root, + max_commits=3, + ) + + result = service.inspect() + + assert result.decision is UpstreamWatchDecision.SKIP + assert result.cleaned_up is True + assert result.latest_commit_title == "docs: changelog refresh" + assert "extension:line" in result.focus_areas + assert "extension:zalo-lifecycle.ts" not in result.focus_areas + assert result.relevant_paths == [] + assert not list(workspace_root.glob("openclaw-upstream.*")) + + +def test_upstream_watcher_flags_review_when_core_paths_change(tmp_path: Path) -> None: + upstream_repo = _init_upstream_repo(tmp_path) + _commit(upstream_repo, "src/runtime/core.py", "def boot() -> str:\n return 'ok'\n", "feat: tweak runtime core") + _commit(upstream_repo, "extensions/line/src/channel.ts", "export const ok = true;\n", "fix: line health check") + + workspace_root = tmp_path / "workspace" + service = UpstreamWatcherService( + upstream_url=upstream_repo.resolve().as_uri(), + workspace_root=workspace_root, + max_commits=2, + ) + + result = service.inspect() + + assert result.decision is UpstreamWatchDecision.REVIEW + assert "src/runtime/core.py" in result.relevant_paths + assert result.cleaned_up is True + assert not list(workspace_root.glob("openclaw-upstream.*"))