From ae397504db7374352ea5d0168d9f585e25787ca1 Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 11:11:58 +0800 Subject: [PATCH 01/11] feat(orchestrator): refactor docker-proxy into unified Worker lifecycle service Rename docker-proxy/ to orchestrator/ and restructure into a multi-package Go service that exposes both a unified Worker lifecycle REST API and the existing Docker API passthrough. - Add WorkerBackend/GatewayBackend interfaces for pluggable backends - Implement DockerBackend (Create/Delete/Start/Stop/Status/List via socket) - Add /workers/* REST API with proper HTTP status mapping (409/404/503) - Add /gateway/* API stubs (501, Phase 2 will implement APIG backend) - Preserve Docker API passthrough with SecurityValidator for backward compat - Add Backend Registry with auto-detection (Docker first, SAE in Phase 2) - Update Makefile, CI workflows, install scripts with new names - Comprehensive test coverage: backend, registry, handler, security Co-Authored-By: Claude Opus 4.6 --- .github/workflows/build-rc.yml | 6 +- .github/workflows/build.yml | 6 +- .github/workflows/release.yml | 4 +- .github/workflows/test-integration.yml | 18 +- CLAUDE.md | 100 +++++ Makefile | 50 +-- docker-proxy/go.mod | 3 - docker-proxy/main.go | 133 ------ install/hiclaw-install.ps1 | 24 +- install/hiclaw-install.sh | 60 +-- manager/scripts/lib/container-api.sh | 2 +- {docker-proxy => orchestrator}/Dockerfile | 9 +- orchestrator/api/gateway_handler.go | 30 ++ orchestrator/api/types.go | 58 +++ orchestrator/api/worker_handler.go | 212 ++++++++++ orchestrator/api/worker_handler_test.go | 377 ++++++++++++++++++ orchestrator/backend/backend.go | 72 ++++ orchestrator/backend/docker.go | 350 ++++++++++++++++ orchestrator/backend/docker_test.go | 362 +++++++++++++++++ orchestrator/backend/gateway.go | 43 ++ orchestrator/backend/registry.go | 93 +++++ orchestrator/backend/registry_test.go | 116 ++++++ orchestrator/config.go | 36 ++ orchestrator/go.mod | 3 + orchestrator/main.go | 
64 +++ orchestrator/proxy/proxy.go | 120 ++++++ .../proxy}/security.go | 2 +- .../proxy}/security_test.go | 2 +- 28 files changed, 2132 insertions(+), 223 deletions(-) create mode 100644 CLAUDE.md delete mode 100644 docker-proxy/go.mod delete mode 100644 docker-proxy/main.go rename {docker-proxy => orchestrator}/Dockerfile (53%) create mode 100644 orchestrator/api/gateway_handler.go create mode 100644 orchestrator/api/types.go create mode 100644 orchestrator/api/worker_handler.go create mode 100644 orchestrator/api/worker_handler_test.go create mode 100644 orchestrator/backend/backend.go create mode 100644 orchestrator/backend/docker.go create mode 100644 orchestrator/backend/docker_test.go create mode 100644 orchestrator/backend/gateway.go create mode 100644 orchestrator/backend/registry.go create mode 100644 orchestrator/backend/registry_test.go create mode 100644 orchestrator/config.go create mode 100644 orchestrator/go.mod create mode 100644 orchestrator/main.go create mode 100644 orchestrator/proxy/proxy.go rename {docker-proxy => orchestrator/proxy}/security.go (99%) rename {docker-proxy => orchestrator/proxy}/security_test.go (99%) diff --git a/.github/workflows/build-rc.yml b/.github/workflows/build-rc.yml index e5b7ec36..8078fa40 100644 --- a/.github/workflows/build-rc.yml +++ b/.github/workflows/build-rc.yml @@ -78,7 +78,7 @@ jobs: - name: Build and push multi-arch images run: | - make push-manager push-manager-aliyun push-worker push-copaw-worker push-docker-proxy \ + make push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator \ VERSION=${{ inputs.version }} \ OPENCLAW_BASE_VERSION=${{ inputs.version }} \ REGISTRY=${{ env.REGISTRY }} \ @@ -93,7 +93,7 @@ jobs: MANAGER_ALIYUN_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-manager-aliyun WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-worker COPAW_WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-copaw-worker - DOCKER_PROXY_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO 
}}/hiclaw-docker-proxy + ORCHESTRATOR_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-orchestrator run: | STABLE_TAG="${{ steps.meta.outputs.base_stable_tag }}" echo "### RC Build Summary" >> $GITHUB_STEP_SUMMARY @@ -103,7 +103,7 @@ jobs: echo "- Manager Aliyun: \`${MANAGER_ALIYUN_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Worker: \`${WORKER_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- CoPaw Worker: \`${COPAW_WORKER_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY - echo "- Docker Proxy: \`${DOCKER_PROXY_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY + echo "- Orchestrator: \`${ORCHESTRATOR_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base (RC): \`${BASE_IMAGE}:${{ inputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base (stable): \`${BASE_IMAGE}:${STABLE_TAG}\`" >> $GITHUB_STEP_SUMMARY echo "- Platforms: \`linux/amd64, linux/arm64\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e361fe1d..a3709ec3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -63,7 +63,7 @@ jobs: # also pushes :latest in the same buildx call, so no separate step needed. 
- name: Build and push multi-arch images run: | - make push-manager push-manager-aliyun push-worker push-copaw-worker push-docker-proxy \ + make push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator \ VERSION=${{ steps.meta.outputs.version }} \ REGISTRY=${{ env.REGISTRY }} \ REPO=${{ env.REPO }} \ @@ -77,13 +77,13 @@ jobs: MANAGER_ALIYUN_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-manager-aliyun WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-worker COPAW_WORKER_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-copaw-worker - DOCKER_PROXY_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-docker-proxy + ORCHESTRATOR_IMAGE: ${{ env.REGISTRY }}/${{ env.REPO }}/hiclaw-orchestrator run: | echo "### Build Summary" >> $GITHUB_STEP_SUMMARY echo "- Manager: \`${MANAGER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Manager Aliyun: \`${MANAGER_ALIYUN_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Worker: \`${WORKER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- CoPaw Worker: \`${COPAW_WORKER_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY - echo "- Docker Proxy: \`${DOCKER_PROXY_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY + echo "- Orchestrator: \`${ORCHESTRATOR_IMAGE}:${{ steps.meta.outputs.version }}\`" >> $GITHUB_STEP_SUMMARY echo "- Base: \`${OPENCLAW_BASE_IMAGE}:latest\` (pre-built, not rebuilt here)" >> $GITHUB_STEP_SUMMARY echo "- Platforms: \`linux/amd64, linux/arm64\`" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 068cb7a7..5426ea6b 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -91,8 +91,8 @@ jobs: # Worker docker pull ${REGISTRY}/${REPO}/hiclaw-worker:${VERSION} - # Docker Proxy - docker pull ${REGISTRY}/${REPO}/hiclaw-docker-proxy:${VERSION} + # Orchestrator + docker pull 
${REGISTRY}/${REPO}/hiclaw-orchestrator:${VERSION} \`\`\` --- diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml index 0e5bc419..04647ebc 100644 --- a/.github/workflows/test-integration.yml +++ b/.github/workflows/test-integration.yml @@ -6,7 +6,7 @@ on: - main paths: - 'manager/**' - - 'docker-proxy/**' + - 'orchestrator/**' - 'hiclaw-controller/**' - 'tests/**' - '.github/workflows/test-integration.yml' @@ -17,7 +17,7 @@ on: - 'v*' paths: - 'manager/**' - - 'docker-proxy/**' + - 'orchestrator/**' - 'hiclaw-controller/**' - 'tests/**' workflow_dispatch: @@ -47,7 +47,7 @@ env: MANAGER_IMAGE: hiclaw/manager-agent:ci-test WORKER_IMAGE: hiclaw/worker-agent:ci-test COPAW_WORKER_IMAGE: hiclaw/copaw-worker:ci-test - DOCKER_PROXY_IMAGE: hiclaw/docker-proxy:ci-test + ORCHESTRATOR_IMAGE: hiclaw/orchestrator:ci-test # Tests that do not require a GitHub token NON_GITHUB_TESTS: "01 02 03 04 05 06 14 15 17 18 19 20 100" @@ -87,10 +87,10 @@ jobs: run: | RUNTIME="${{ inputs.worker_runtime || 'openclaw' }}" if [ "$RUNTIME" = "copaw" ]; then - make build-manager build-copaw-worker build-docker-proxy VERSION=ci-test HIGRESS_REGISTRY=higress-registry.us-west-1.cr.aliyuncs.com \ + make build-manager build-copaw-worker build-orchestrator VERSION=ci-test HIGRESS_REGISTRY=higress-registry.us-west-1.cr.aliyuncs.com \ DOCKER_BUILD_ARGS="--build-arg APT_MIRROR= --build-arg PIP_INDEX_URL=https://pypi.org/simple/" else - make build-manager build-worker build-docker-proxy VERSION=ci-test HIGRESS_REGISTRY=higress-registry.us-west-1.cr.aliyuncs.com + make build-manager build-worker build-orchestrator VERSION=ci-test HIGRESS_REGISTRY=higress-registry.us-west-1.cr.aliyuncs.com fi - name: Install dependencies @@ -116,7 +116,7 @@ jobs: HICLAW_DEFAULT_MODEL="$MODEL" \ HICLAW_INSTALL_MANAGER_IMAGE=${{ env.MANAGER_IMAGE }} \ HICLAW_INSTALL_WORKER_IMAGE="$WORKER_IMG" \ - HICLAW_INSTALL_DOCKER_PROXY_IMAGE=${{ env.DOCKER_PROXY_IMAGE }} \ + 
HICLAW_INSTALL_ORCHESTRATOR_IMAGE=${{ env.ORCHESTRATOR_IMAGE }} \ bash ./install/hiclaw-install.sh manager - name: Wait for Manager to be ready @@ -394,10 +394,10 @@ jobs: REGISTRY=higress-registry.cn-hangzhou.cr.aliyuncs.com/higress docker pull ${REGISTRY}/hiclaw-manager:${VERSION} docker pull ${REGISTRY}/hiclaw-worker:${VERSION} - docker pull ${REGISTRY}/hiclaw-docker-proxy:${VERSION} + docker pull ${REGISTRY}/hiclaw-orchestrator:${VERSION} docker tag ${REGISTRY}/hiclaw-manager:${VERSION} hiclaw/manager-agent:${VERSION} docker tag ${REGISTRY}/hiclaw-worker:${VERSION} hiclaw/worker-agent:${VERSION} - docker tag ${REGISTRY}/hiclaw-docker-proxy:${VERSION} hiclaw/docker-proxy:${VERSION} + docker tag ${REGISTRY}/hiclaw-orchestrator:${VERSION} hiclaw/orchestrator:${VERSION} - name: Install HiClaw env: @@ -411,7 +411,7 @@ jobs: HICLAW_LLM_PROVIDER=qwen \ HICLAW_INSTALL_MANAGER_IMAGE=hiclaw/manager-agent:${VERSION} \ HICLAW_INSTALL_WORKER_IMAGE=hiclaw/worker-agent:${VERSION} \ - HICLAW_INSTALL_DOCKER_PROXY_IMAGE=hiclaw/docker-proxy:${VERSION} \ + HICLAW_INSTALL_ORCHESTRATOR_IMAGE=hiclaw/orchestrator:${VERSION} \ bash ./install/hiclaw-install.sh manager - name: Wait for Manager to be ready diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..43bd350b --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,100 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What is HiClaw + +HiClaw is an open-source Collaborative Multi-Agent OS using Matrix protocol for human-in-the-loop task coordination. A Manager Agent coordinates Worker Agents, with all communication visible in Matrix rooms. Infrastructure: Higress AI Gateway, Tuwunel Matrix Server, MinIO file storage, Element Web client. 
+ +## Build & Test Commands + +```bash +make build # Build all images (native arch) +make build-manager # Build Manager image only +make build-worker # Build Worker image only +make build-copaw-worker # Build CoPaw Worker image only +make build-orchestrator # Build Orchestrator image only (Go) +make build-openclaw-base # Build base image (rarely needed) + +make test # Build + install + run all integration tests +make test SKIP_INSTALL=1 # Run tests against existing Manager +make test TEST_FILTER="01 02" # Run specific tests only +make test-quick # Smoke test (test-01 only) + +make install # Build + install Manager locally +make uninstall # Stop + remove all containers + +make status # Show all hiclaw container statuses +make logs # Show recent logs (LINES=N to override) +``` + +Orchestrator has its own Go test suite: +```bash +cd orchestrator && go test ./... # Run Go unit tests +cd orchestrator && go test ./backend/... # Run backend tests only +cd orchestrator && go test ./proxy/... # Run security validation tests only +``` + +## Local Full Build (from modified openclaw-base) + +Image dependency: `openclaw-base` → `manager` / `worker`. CoPaw and orchestrator are independent. + +When building from a locally modified openclaw-base, you must override both variables: +```bash +make build-openclaw-base +make build-manager build-worker OPENCLAW_BASE_IMAGE=hiclaw/openclaw-base OPENCLAW_BASE_VERSION=latest +``` + +Without `OPENCLAW_BASE_IMAGE=hiclaw/openclaw-base`, it pulls from the remote registry instead of using your local build. 
+ +## Architecture + +``` +manager/ # All-in-one container: Higress + Tuwunel + MinIO + Element Web + OpenClaw Agent + agent/ # Agent personality (SOUL.md), skills, tools — read by Agent at runtime + scripts/init/ # Supervisord startup scripts for each service + configs/ # Configuration templates (rendered at container start) + supervisord.conf # Process orchestration + +worker/ # OpenClaw Worker container (Node.js 22) +copaw/ # CoPaw Worker container (Python 3.11, alternative runtime) +orchestrator/ # Go-based Worker lifecycle service (unified API + Docker proxy) +openclaw-base/ # Shared base image for manager + worker +shared/lib/ # Shared shell libraries (env bootstrap, credential mgmt, mc wrapper) +install/ # One-click installation scripts (bash + PowerShell) +tests/ # Integration test suite (14 cases) + lib/ # Test helpers: assertions, Matrix client, Higress client, MinIO client +``` + +## Key Conventions + +**Agent-facing content** (`manager/agent/**`): Written in second-person voice addressing the Agent directly ("You are...", "Your responsibilities..."). Never use third-person ("The Manager does X"). This applies to SOUL.md, AGENTS.md, HEARTBEAT.md, SKILL.md, TOOLS.md, and all worker-agent configs. + +**Changelog policy**: Any change to `manager/`, `worker/`, `copaw/`, or `openclaw-base/` must be recorded in `changelog/current.md` before committing. Format: one bullet per logical change with linked commit hash. + +**Shared build context**: Manager, Worker, and CoPaw Dockerfiles use `--build-context shared=./shared/lib` for shared shell libraries. The Makefile handles this automatically. + +**Worker container naming**: All Worker containers must be prefixed `hiclaw-worker-` (enforced by orchestrator security validation). + +## Integration Tests + +Tests live in `tests/` and use bash-based helpers (`tests/lib/`). Each test is a standalone script (`tests/test-NN-*.sh`) that communicates with the Manager via Matrix API. 
Tests require a running Manager container with all services healthy. + +Key test helpers: +- `tests/lib/test-helpers.sh` — assertions, lifecycle, logging +- `tests/lib/matrix-client.sh` — Matrix API wrapper (send messages, read rooms) +- `tests/lib/higress-client.sh` — Higress Console API wrapper +- `tests/lib/minio-client.sh` — MinIO verification + +## Deployment Modes + +- **Local**: All-in-one container with supervisord, Docker socket mounted for Worker management +- **Cloud (Alibaba SAE)**: Distributed containers, STS credential management, orchestrator for secure container API access + +## Verified Technical Details + +- Tuwunel uses `CONDUWUIT_` env prefix (not `TUWUNEL_`) +- Higress Console uses Session Cookie auth (not Basic Auth) +- MCP Server created via `PUT` (not `POST`) +- Auth plugin takes ~40s to activate after first configuration +- OpenClaw Skills auto-load from `workspace/skills//SKILL.md` diff --git a/Makefile b/Makefile index cad87ad1..cb1dbeb4 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,7 @@ MANAGER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-manager MANAGER_ALIYUN_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-manager-aliyun WORKER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-worker COPAW_WORKER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-copaw-worker -DOCKER_PROXY_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-docker-proxy +ORCHESTRATOR_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-orchestrator OPENCLAW_BASE_IMAGE ?= $(REGISTRY)/$(REPO)/openclaw-base CONTROLLER_IMAGE ?= $(REGISTRY)/$(REPO)/hiclaw-controller @@ -36,7 +36,7 @@ MANAGER_TAG ?= $(MANAGER_IMAGE):$(VERSION) MANAGER_ALIYUN_TAG ?= $(MANAGER_ALIYUN_IMAGE):$(VERSION) WORKER_TAG ?= $(WORKER_IMAGE):$(VERSION) COPAW_WORKER_TAG ?= $(COPAW_WORKER_IMAGE):$(VERSION) -DOCKER_PROXY_TAG ?= $(DOCKER_PROXY_IMAGE):$(VERSION) +ORCHESTRATOR_TAG ?= $(ORCHESTRATOR_IMAGE):$(VERSION) OPENCLAW_BASE_TAG ?= $(OPENCLAW_BASE_IMAGE):$(VERSION) CONTROLLER_TAG ?= $(CONTROLLER_IMAGE):$(VERSION) @@ -45,7 +45,7 @@ LOCAL_MANAGER = hiclaw/manager-agent:$(VERSION) 
LOCAL_MANAGER_ALIYUN = hiclaw/manager-aliyun:$(VERSION) LOCAL_WORKER = hiclaw/worker-agent:$(VERSION) LOCAL_COPAW_WORKER = hiclaw/copaw-worker:$(VERSION) -LOCAL_DOCKER_PROXY = hiclaw/docker-proxy:$(VERSION) +LOCAL_ORCHESTRATOR = hiclaw/orchestrator:$(VERSION) LOCAL_OPENCLAW_BASE = hiclaw/openclaw-base:$(VERSION) LOCAL_CONTROLLER = hiclaw/hiclaw-controller:$(VERSION) @@ -95,8 +95,8 @@ LINES ?= 50 # ---------- Phony targets ---------- -.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-worker build-copaw-worker build-docker-proxy \ - tag push push-openclaw-base push-hiclaw-controller push-manager push-manager-aliyun push-worker push-copaw-worker push-docker-proxy \ +.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-worker build-copaw-worker build-orchestrator \ + tag push push-openclaw-base push-hiclaw-controller push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator \ push-native push-native-manager push-native-worker push-native-copaw-worker \ buildx-setup \ test test-quick test-installed \ @@ -111,7 +111,7 @@ all: build # ---------- Build ---------- -build: build-manager build-manager-aliyun build-worker build-copaw-worker build-docker-proxy ## Build all images (base image pulled from registry, not rebuilt locally) +build: build-manager build-manager-aliyun build-worker build-copaw-worker build-orchestrator ## Build all images (base image pulled from registry, not rebuilt locally) build-openclaw-base: ## Build OpenClaw base image @echo "==> Building OpenClaw base image: $(LOCAL_OPENCLAW_BASE) (registry: $(HIGRESS_REGISTRY))" @@ -158,11 +158,11 @@ build-copaw-worker: ## Build CoPaw Worker image -t $(LOCAL_COPAW_WORKER) \ ./copaw/ -build-docker-proxy: ## Build Docker API proxy image - @echo "==> Building Docker Proxy image: $(LOCAL_DOCKER_PROXY)" +build-orchestrator: ## Build Orchestrator image + @echo "==> Building Orchestrator image: 
$(LOCAL_ORCHESTRATOR)" docker build $(PLATFORM_FLAG) $(REGISTRY_ARG) $(DOCKER_BUILD_ARGS) \ - -t $(LOCAL_DOCKER_PROXY) \ - ./docker-proxy/ + -t $(LOCAL_ORCHESTRATOR) \ + ./orchestrator/ # ---------- Tag ---------- @@ -171,13 +171,13 @@ tag: build ## Tag images for registry push docker tag $(LOCAL_MANAGER_ALIYUN) $(MANAGER_ALIYUN_TAG) docker tag $(LOCAL_WORKER) $(WORKER_TAG) docker tag $(LOCAL_COPAW_WORKER) $(COPAW_WORKER_TAG) - docker tag $(LOCAL_DOCKER_PROXY) $(DOCKER_PROXY_TAG) + docker tag $(LOCAL_ORCHESTRATOR) $(ORCHESTRATOR_TAG) ifeq ($(PUSH_LATEST),yes) docker tag $(LOCAL_MANAGER) $(MANAGER_IMAGE):latest docker tag $(LOCAL_MANAGER_ALIYUN) $(MANAGER_ALIYUN_IMAGE):latest docker tag $(LOCAL_WORKER) $(WORKER_IMAGE):latest docker tag $(LOCAL_COPAW_WORKER) $(COPAW_WORKER_IMAGE):latest - docker tag $(LOCAL_DOCKER_PROXY) $(DOCKER_PROXY_IMAGE):latest + docker tag $(LOCAL_ORCHESTRATOR) $(ORCHESTRATOR_IMAGE):latest @echo "==> Images tagged as $(VERSION) and latest" else @echo "==> Images tagged as $(VERSION) (latest not pushed for pre-release)" @@ -205,7 +205,7 @@ else fi endif -push: push-manager push-manager-aliyun push-worker push-copaw-worker push-docker-proxy ## Build + push multi-arch images (amd64 + arm64); base image built separately via build-base.yml +push: push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator ## Build + push multi-arch images (amd64 + arm64); base image built separately via build-base.yml push-openclaw-base: buildx-setup ## Build + push multi-arch OpenClaw base image @echo "==> Building + pushing multi-arch OpenClaw base: $(OPENCLAW_BASE_TAG) [$(MULTIARCH_PLATFORMS)]" @@ -361,29 +361,29 @@ else ./copaw/ endif -push-docker-proxy: buildx-setup ## Build + push multi-arch Docker Proxy image - @echo "==> Building + pushing multi-arch Docker Proxy: $(DOCKER_PROXY_TAG) [$(MULTIARCH_PLATFORMS)]" +push-orchestrator: buildx-setup ## Build + push multi-arch Orchestrator image + @echo "==> Building + pushing multi-arch 
Orchestrator: $(ORCHESTRATOR_TAG) [$(MULTIARCH_PLATFORMS)]" ifeq ($(IS_PODMAN),1) - -podman manifest rm $(DOCKER_PROXY_TAG) 2>/dev/null + -podman manifest rm $(ORCHESTRATOR_TAG) 2>/dev/null $(foreach plat,$(subst $(comma), ,$(MULTIARCH_PLATFORMS)), \ - echo " -> Building Docker Proxy for $(plat)..." && \ + echo " -> Building Orchestrator for $(plat)..." && \ podman build --platform $(plat) \ $(DOCKER_BUILD_ARGS) \ - --manifest $(DOCKER_PROXY_TAG) \ - ./docker-proxy/ && ) true - podman manifest push --all $(DOCKER_PROXY_TAG) docker://$(DOCKER_PROXY_TAG) + --manifest $(ORCHESTRATOR_TAG) \ + ./orchestrator/ && ) true + podman manifest push --all $(ORCHESTRATOR_TAG) docker://$(ORCHESTRATOR_TAG) $(if $(PUSH_LATEST), \ - podman manifest push --all $(DOCKER_PROXY_TAG) docker://$(DOCKER_PROXY_IMAGE):latest && \ + podman manifest push --all $(ORCHESTRATOR_TAG) docker://$(ORCHESTRATOR_IMAGE):latest && \ echo " -> Also pushed :latest tag") else docker buildx build \ --builder $(BUILDX_BUILDER) \ --platform $(MULTIARCH_PLATFORMS) \ $(DOCKER_BUILD_ARGS) \ - -t $(DOCKER_PROXY_TAG) \ - $(if $(PUSH_LATEST),-t $(DOCKER_PROXY_IMAGE):latest) \ + -t $(ORCHESTRATOR_TAG) \ + $(if $(PUSH_LATEST),-t $(ORCHESTRATOR_IMAGE):latest) \ --push \ - ./docker-proxy/ + ./orchestrator/ endif # ---------- Push native-arch only (dev use) ---------- @@ -475,7 +475,7 @@ endif HICLAW_INSTALL_MANAGER_IMAGE=$(LOCAL_MANAGER) \ HICLAW_INSTALL_WORKER_IMAGE=$(LOCAL_WORKER) \ HICLAW_INSTALL_COPAW_WORKER_IMAGE=$(LOCAL_COPAW_WORKER) \ - HICLAW_INSTALL_DOCKER_PROXY_IMAGE=$(LOCAL_DOCKER_PROXY) \ + HICLAW_INSTALL_ORCHESTRATOR_IMAGE=$(LOCAL_ORCHESTRATOR) \ bash ./install/hiclaw-install.sh manager install-interactive: ## Install Manager interactively (prompts for config) diff --git a/docker-proxy/go.mod b/docker-proxy/go.mod deleted file mode 100644 index 6d5005fc..00000000 --- a/docker-proxy/go.mod +++ /dev/null @@ -1,3 +0,0 @@ -module github.com/alibaba/hiclaw/docker-proxy - -go 1.23 diff --git 
a/docker-proxy/main.go b/docker-proxy/main.go deleted file mode 100644 index 79008562..00000000 --- a/docker-proxy/main.go +++ /dev/null @@ -1,133 +0,0 @@ -package main - -import ( - "bytes" - "context" - "encoding/json" - "fmt" - "io" - "log" - "net" - "net/http" - "net/http/httputil" - "os" - "regexp" -) - -var ( - // URL patterns for POST/DELETE allowlist - containerAction = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/(start|stop|kill|restart|wait|resize|attach|logs)$`) - containerExec = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/exec$`) - containerCreate = regexp.MustCompile(`^(/v[\d.]+)?/containers/create$`) - containerDelete = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+$`) - execStart = regexp.MustCompile(`^(/v[\d.]+)?/exec/[a-zA-Z0-9]+/(start|resize|json)$`) - imageCreate = regexp.MustCompile(`^(/v[\d.]+)?/images/create$`) -) - -func main() { - socketPath := os.Getenv("HICLAW_PROXY_SOCKET") - if socketPath == "" { - socketPath = "/var/run/docker.sock" - } - - listenAddr := os.Getenv("HICLAW_PROXY_LISTEN") - if listenAddr == "" { - listenAddr = ":2375" - } - - validator := NewSecurityValidator() - - transport := &http.Transport{ - DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) - }, - } - - proxy := &httputil.ReverseProxy{ - Director: func(req *http.Request) { - req.URL.Scheme = "http" - req.URL.Host = "localhost" - }, - Transport: transport, - } - - handler := http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { - path := r.URL.Path - - // GET requests are read-only, always allow - if r.Method == http.MethodGet || r.Method == http.MethodHead { - proxy.ServeHTTP(w, r) - return - } - - // POST/DELETE allowlist - switch { - case r.Method == http.MethodPost && containerCreate.MatchString(path): - handleContainerCreate(w, r, proxy, validator) - return - - case r.Method == http.MethodPost && containerAction.MatchString(path): - // 
start/stop/kill/restart/wait/resize/attach/logs — allow - case r.Method == http.MethodPost && containerExec.MatchString(path): - // exec create — allow - case r.Method == http.MethodPost && execStart.MatchString(path): - // exec start — allow - case r.Method == http.MethodPost && imageCreate.MatchString(path): - // image pull — allow - case r.Method == http.MethodDelete && containerDelete.MatchString(path): - // container remove — allow - - default: - log.Printf("[DENIED] %s %s", r.Method, r.URL.String()) - http.Error(w, fmt.Sprintf(`{"message":"hiclaw-docker-proxy: %s %s is not allowed"}`, r.Method, path), http.StatusForbidden) - return - } - - proxy.ServeHTTP(w, r) - }) - - log.Printf("hiclaw-docker-proxy listening on %s, backend: %s", listenAddr, socketPath) - if len(validator.AllowedRegistries) > 0 { - log.Printf("Allowed registries: %v", validator.AllowedRegistries) - } - if err := http.ListenAndServe(listenAddr, handler); err != nil { - log.Fatalf("Failed to start server: %v", err) - } -} - -func handleContainerCreate(w http.ResponseWriter, r *http.Request, proxy *httputil.ReverseProxy, v *SecurityValidator) { - // Read body - body, err := io.ReadAll(r.Body) - r.Body.Close() - if err != nil { - http.Error(w, `{"message":"hiclaw-docker-proxy: failed to read request body"}`, http.StatusBadRequest) - return - } - - // Parse container name from query param - containerName := r.URL.Query().Get("name") - - // Parse request - var req ContainerCreateRequest - if err := json.Unmarshal(body, &req); err != nil { - http.Error(w, `{"message":"hiclaw-docker-proxy: invalid JSON in request body"}`, http.StatusBadRequest) - return - } - - // Validate - if err := v.ValidateContainerCreate(req, containerName); err != nil { - log.Printf("[BLOCKED] POST /containers/create name=%s: %s", containerName, err) - msg, _ := json.Marshal(map[string]string{"message": fmt.Sprintf("hiclaw-docker-proxy: %s", err)}) - w.Header().Set("Content-Type", "application/json") - 
w.WriteHeader(http.StatusForbidden) - w.Write(msg) - return - } - - log.Printf("[ALLOWED] POST /containers/create name=%s image=%s", containerName, req.Image) - - // Restore body and forward - r.Body = io.NopCloser(bytes.NewReader(body)) - r.ContentLength = int64(len(body)) - proxy.ServeHTTP(w, r) -} diff --git a/install/hiclaw-install.ps1 b/install/hiclaw-install.ps1 index b70b134b..7c84c79a 100644 --- a/install/hiclaw-install.ps1 +++ b/install/hiclaw-install.ps1 @@ -1797,10 +1797,10 @@ function Install-Manager { "$($script:HICLAW_REGISTRY)/higress/hiclaw-copaw-worker:$($script:HICLAW_VERSION)" } - $script:DOCKER_PROXY_IMAGE = if ($env:HICLAW_INSTALL_DOCKER_PROXY_IMAGE) { - $env:HICLAW_INSTALL_DOCKER_PROXY_IMAGE + $script:ORCHESTRATOR_IMAGE = if ($env:HICLAW_INSTALL_ORCHESTRATOR_IMAGE) { + $env:HICLAW_INSTALL_ORCHESTRATOR_IMAGE } else { - "$($script:HICLAW_REGISTRY)/higress/hiclaw-docker-proxy:$($script:HICLAW_VERSION)" + "$($script:HICLAW_REGISTRY)/higress/hiclaw-orchestrator:$($script:HICLAW_VERSION)" } Write-Log (Get-Msg "install.registry" -f $script:HICLAW_REGISTRY) @@ -2005,17 +2005,21 @@ function Install-Manager { # Start Docker API proxy if enabled if ($config.DOCKER_PROXY -eq "1") { - $proxyImage = $script:DOCKER_PROXY_IMAGE + $proxyImage = $script:ORCHESTRATOR_IMAGE + # Ensure Docker network exists (reuse if already present) + docker network inspect hiclaw-net *>$null + if ($LASTEXITCODE -ne 0) { docker network create hiclaw-net *>$null } Write-Log "Starting Docker API proxy..." 
- docker rm -f hiclaw-docker-proxy *>$null - docker run -d --name hiclaw-docker-proxy ` + docker rm -f hiclaw-orchestrator *>$null + docker run -d --name hiclaw-orchestrator ` --network hiclaw-net ` -v "//var/run/docker.sock:/var/run/docker.sock" ` --security-opt label=disable ` $(if ($config.PROXY_ALLOWED_REGISTRIES) { @("-e", "HICLAW_PROXY_ALLOWED_REGISTRIES=$($config.PROXY_ALLOWED_REGISTRIES)") }) ` --restart unless-stopped ` $proxyImage - $dockerArgs += @("-e", "HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375") + $dockerArgs += @("-e", "HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375") + $dockerArgs += @("--network", "hiclaw-net") Write-Log (Get-Msg "docker_proxy.selected_enabled") } else { $dockerArgs += @("-v", "//var/run/docker.sock:/var/run/docker.sock") @@ -2124,10 +2128,10 @@ function Install-Manager { # Stop and remove existing containers (deferred until after all # configuration is collected and images are pulled successfully) - $existingProxy = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-docker-proxy$" + $existingProxy = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-orchestrator$" if ($existingProxy) { - docker stop hiclaw-docker-proxy *>$null - docker rm hiclaw-docker-proxy *>$null + docker stop hiclaw-orchestrator *>$null + docker rm hiclaw-orchestrator *>$null } $existingContainer = docker ps -a --format "{{.Names}}" 2>$null | Select-String "^hiclaw-manager$" if ($existingContainer) { diff --git a/install/hiclaw-install.sh b/install/hiclaw-install.sh index 6374d655..da503805 100644 --- a/install/hiclaw-install.sh +++ b/install/hiclaw-install.sh @@ -287,8 +287,8 @@ msg() { "install.reinstall.warn_workspace.en") text=" - Manager workspace: %s" ;; "install.reinstall.warn_workers.zh") text=" - 所有 worker 容器" ;; "install.reinstall.warn_workers.en") text=" - All worker containers" ;; - "install.reinstall.warn_proxy.zh") text=" - Docker API 代理容器: hiclaw-docker-proxy" ;; - 
"install.reinstall.warn_proxy.en") text=" - Docker API proxy container: hiclaw-docker-proxy" ;; + "install.reinstall.warn_proxy.zh") text=" - Docker API 代理容器: hiclaw-orchestrator" ;; + "install.reinstall.warn_proxy.en") text=" - Docker API proxy container: hiclaw-orchestrator" ;; "install.reinstall.warn_network.zh") text=" - Docker 网络: hiclaw-net" ;; "install.reinstall.warn_network.en") text=" - Docker network: hiclaw-net" ;; "install.reinstall.confirm_type.zh") text="请输入工作空间路径以确认删除(或按 Ctrl+C 取消):" ;; @@ -305,8 +305,8 @@ msg() { "install.reinstall.removing_volume.en") text="Removing Docker volume: hiclaw-data" ;; "install.reinstall.warn_volume_fail.zh") text=" 警告: 无法移除卷(可能有引用)" ;; "install.reinstall.warn_volume_fail.en") text=" Warning: Could not remove volume (may have references)" ;; - "install.reinstall.removing_proxy.zh") text="正在移除 Docker API 代理容器: hiclaw-docker-proxy" ;; - "install.reinstall.removing_proxy.en") text="Removing Docker API proxy container: hiclaw-docker-proxy" ;; + "install.reinstall.removing_proxy.zh") text="正在移除 Docker API 代理容器: hiclaw-orchestrator" ;; + "install.reinstall.removing_proxy.en") text="Removing Docker API proxy container: hiclaw-orchestrator" ;; "install.reinstall.removing_network.zh") text="正在移除 Docker 网络: hiclaw-net" ;; "install.reinstall.removing_network.en") text="Removing Docker network: hiclaw-net" ;; "install.reinstall.removing_workspace.zh") text="正在移除工作空间目录: %s" ;; @@ -842,39 +842,39 @@ HICLAW_REGISTRY="${HICLAW_REGISTRY:-$(detect_registry)}" MANAGER_IMAGE="${HICLAW_INSTALL_MANAGER_IMAGE:-}" WORKER_IMAGE="${HICLAW_INSTALL_WORKER_IMAGE:-}" COPAW_WORKER_IMAGE="${HICLAW_INSTALL_COPAW_WORKER_IMAGE:-}" -DOCKER_PROXY_IMAGE="${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}" +ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-}" resolve_image_tags() { MANAGER_IMAGE="${HICLAW_INSTALL_MANAGER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-manager:${HICLAW_VERSION}}" 
WORKER_IMAGE="${HICLAW_INSTALL_WORKER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-worker:${HICLAW_VERSION}}" COPAW_WORKER_IMAGE="${HICLAW_INSTALL_COPAW_WORKER_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-copaw-worker:${HICLAW_VERSION}}" - # docker-proxy: prefer versioned tag, fall back to :latest at pull time - # via resolve_docker_proxy_image(). - DOCKER_PROXY_IMAGE="${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:${HICLAW_VERSION}}" + # orchestrator: prefer versioned tag, fall back to :latest at pull time + # via resolve_orchestrator_image(). + ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:${HICLAW_VERSION}}" } -# Resolve the docker-proxy image: try the versioned tag first; if the registry +# Resolve the orchestrator image: try the versioned tag first; if the registry # doesn't have it (component didn't exist yet in that release), fall back to :latest. -# Sets DOCKER_PROXY_IMAGE to the tag that will actually be pulled. -resolve_docker_proxy_image() { +# Sets ORCHESTRATOR_IMAGE to the tag that will actually be pulled. +resolve_orchestrator_image() { # If the user explicitly overrode the image, respect it as-is. - [ -n "${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}" ] && return 0 + [ -n "${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-}" ] && return 0 - local _versioned="${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:${HICLAW_VERSION}" - local _latest="${HICLAW_REGISTRY}/higress/hiclaw-docker-proxy:latest" + local _versioned="${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:${HICLAW_VERSION}" + local _latest="${HICLAW_REGISTRY}/higress/hiclaw-orchestrator:latest" # Skip probe when HICLAW_VERSION is "latest" — no point trying the same tag twice. 
if [ "${HICLAW_VERSION}" = "latest" ]; then - DOCKER_PROXY_IMAGE="${_latest}" + ORCHESTRATOR_IMAGE="${_latest}" return 0 fi if ${DOCKER_CMD} pull "${_versioned}" >/dev/null 2>&1; then - DOCKER_PROXY_IMAGE="${_versioned}" + ORCHESTRATOR_IMAGE="${_versioned}" else - log "docker-proxy ${HICLAW_VERSION} not found, using latest" + log "orchestrator ${HICLAW_VERSION} not found, using latest" ${DOCKER_CMD} pull "${_latest}" >/dev/null 2>&1 || true - DOCKER_PROXY_IMAGE="${_latest}" + ORCHESTRATOR_IMAGE="${_latest}" fi } @@ -1448,10 +1448,10 @@ step_existing() { ${DOCKER_CMD} rm "${w}" 2>/dev/null || true log "$(msg install.reinstall.removed_worker "${w}")" done - if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-docker-proxy$"; then + if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-orchestrator$"; then log "$(msg install.reinstall.removing_proxy)" - ${DOCKER_CMD} stop hiclaw-docker-proxy 2>/dev/null || true - ${DOCKER_CMD} rm hiclaw-docker-proxy 2>/dev/null || true + ${DOCKER_CMD} stop hiclaw-orchestrator 2>/dev/null || true + ${DOCKER_CMD} rm hiclaw-orchestrator 2>/dev/null || true fi if ${DOCKER_CMD} network ls --format '{{.Name}}' | grep -q "^hiclaw-net$"; then log "$(msg install.reinstall.removing_network)" @@ -2243,16 +2243,16 @@ EOF fi fi - # Resolve and pull docker-proxy image (probes versioned tag, falls back to latest) + # Resolve and pull orchestrator image (probes versioned tag, falls back to latest) if [ "${HICLAW_DOCKER_PROXY:-0}" = "1" ]; then - resolve_docker_proxy_image + resolve_orchestrator_image fi # Stop and remove existing containers (deferred from upgrade detection # so that all configuration is collected and images are pulled first) - if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-docker-proxy$"; then - ${DOCKER_CMD} stop hiclaw-docker-proxy 2>/dev/null || true - ${DOCKER_CMD} rm hiclaw-docker-proxy 2>/dev/null || true + if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-orchestrator$"; then 
+ ${DOCKER_CMD} stop hiclaw-orchestrator 2>/dev/null || true + ${DOCKER_CMD} rm hiclaw-orchestrator 2>/dev/null || true fi if ${DOCKER_CMD} ps -a --format '{{.Names}}' | grep -q "^hiclaw-manager$"; then log "$(msg install.removing_existing)" @@ -2295,17 +2295,19 @@ EOF # Start Docker API proxy if enabled (security layer between Manager and Docker daemon) PROXY_ARGS="" if [ "${HICLAW_DOCKER_PROXY:-0}" = "1" ] && [ -n "${CONTAINER_SOCK:-}" ]; then - local _proxy_image="${DOCKER_PROXY_IMAGE}" + local _proxy_image="${ORCHESTRATOR_IMAGE}" + # Ensure Docker network exists (reuse if already present) + ${DOCKER_CMD} network inspect hiclaw-net >/dev/null 2>&1 || ${DOCKER_CMD} network create hiclaw-net log "Starting Docker API proxy..." ${DOCKER_CMD} run -d \ - --name hiclaw-docker-proxy \ + --name hiclaw-orchestrator \ --network hiclaw-net \ -v "${CONTAINER_SOCK}:/var/run/docker.sock" \ --security-opt label=disable \ ${HICLAW_PROXY_ALLOWED_REGISTRIES:+-e HICLAW_PROXY_ALLOWED_REGISTRIES="${HICLAW_PROXY_ALLOWED_REGISTRIES}"} \ --restart unless-stopped \ "${_proxy_image}" - PROXY_ARGS="-e HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375" + PROXY_ARGS="-e HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375 --network hiclaw-net" SOCKET_MOUNT_ARGS="" # Manager no longer needs direct socket access fi diff --git a/manager/scripts/lib/container-api.sh b/manager/scripts/lib/container-api.sh index 961c7179..53296d91 100755 --- a/manager/scripts/lib/container-api.sh +++ b/manager/scripts/lib/container-api.sh @@ -4,7 +4,7 @@ # container runtime socket (Docker or Podman compatible). # # Supports two modes: -# 1. HTTP proxy mode: set HICLAW_CONTAINER_API=http://hiclaw-docker-proxy:2375 +# 1. HTTP proxy mode: set HICLAW_CONTAINER_API=http://hiclaw-orchestrator:2375 # 2. 
Unix socket mode (legacy): mount docker.sock into the container # # Usage: diff --git a/docker-proxy/Dockerfile b/orchestrator/Dockerfile similarity index 53% rename from docker-proxy/Dockerfile rename to orchestrator/Dockerfile index a04f7acb..ee5584b4 100644 --- a/docker-proxy/Dockerfile +++ b/orchestrator/Dockerfile @@ -4,9 +4,12 @@ FROM ${HIGRESS_REGISTRY}/higress/golang:1.23-alpine AS builder WORKDIR /app COPY go.mod ./ COPY *.go ./ -RUN CGO_ENABLED=0 go build -o /hiclaw-docker-proxy . +COPY proxy/ ./proxy/ +COPY backend/ ./backend/ +COPY api/ ./api/ +RUN CGO_ENABLED=0 go build -o /hiclaw-orchestrator . FROM ${HIGRESS_REGISTRY}/higress/alpine:3.20 -COPY --from=builder /hiclaw-docker-proxy /usr/local/bin/ +COPY --from=builder /hiclaw-orchestrator /usr/local/bin/ EXPOSE 2375 -CMD ["hiclaw-docker-proxy"] +CMD ["hiclaw-orchestrator"] diff --git a/orchestrator/api/gateway_handler.go b/orchestrator/api/gateway_handler.go new file mode 100644 index 00000000..6eb9a2cf --- /dev/null +++ b/orchestrator/api/gateway_handler.go @@ -0,0 +1,30 @@ +package api + +import ( + "net/http" +) + +// GatewayHandler handles /gateway/* HTTP requests. +// Phase 1: all endpoints return 501 Not Implemented. +// Phase 2: will delegate to GatewayBackend (Higress local, APIG cloud). +type GatewayHandler struct{} + +// NewGatewayHandler creates a GatewayHandler. +func NewGatewayHandler() *GatewayHandler { + return &GatewayHandler{} +} + +// CreateConsumer handles POST /gateway/consumers. +func (h *GatewayHandler) CreateConsumer(w http.ResponseWriter, r *http.Request) { + writeError(w, http.StatusNotImplemented, "gateway consumer management not yet implemented (Phase 2)") +} + +// BindConsumer handles POST /gateway/consumers/{id}/bind. +func (h *GatewayHandler) BindConsumer(w http.ResponseWriter, r *http.Request) { + writeError(w, http.StatusNotImplemented, "gateway consumer binding not yet implemented (Phase 2)") +} + +// DeleteConsumer handles DELETE /gateway/consumers/{id}. 
+func (h *GatewayHandler) DeleteConsumer(w http.ResponseWriter, r *http.Request) { + writeError(w, http.StatusNotImplemented, "gateway consumer deletion not yet implemented (Phase 2)") +} diff --git a/orchestrator/api/types.go b/orchestrator/api/types.go new file mode 100644 index 00000000..1ad0a22b --- /dev/null +++ b/orchestrator/api/types.go @@ -0,0 +1,58 @@ +package api + +import "github.com/alibaba/hiclaw/orchestrator/backend" + +// --- Worker API types --- + +// CreateWorkerRequest is the JSON body for POST /workers. +type CreateWorkerRequest struct { + Name string `json:"name"` + Image string `json:"image,omitempty"` + Runtime string `json:"runtime,omitempty"` + Env map[string]string `json:"env,omitempty"` + Network string `json:"network,omitempty"` + ExtraHosts []string `json:"extra_hosts,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` + Backend string `json:"backend,omitempty"` // override auto-detection +} + +// WorkerResponse is the JSON response for worker operations. +type WorkerResponse struct { + Name string `json:"name"` + Backend string `json:"backend"` + Status backend.WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` +} + +// WorkerListResponse is the JSON response for GET /workers. +type WorkerListResponse struct { + Workers []WorkerResponse `json:"workers"` +} + +// --- Gateway API types --- + +// CreateConsumerRequest is the JSON body for POST /gateway/consumers. +type CreateConsumerRequest struct { + Name string `json:"name"` +} + +// ConsumerResponse is the JSON response for consumer operations. +type ConsumerResponse struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id"` + APIKey string `json:"api_key,omitempty"` + Status string `json:"status"` +} + +// BindConsumerRequest is the JSON body for POST /gateway/consumers/{id}/bind. 
+type BindConsumerRequest struct { + ModelAPIID string `json:"model_api_id"` + EnvID string `json:"env_id"` +} + +// ErrorResponse is the JSON error response. +type ErrorResponse struct { + Message string `json:"message"` +} diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go new file mode 100644 index 00000000..cc6b031c --- /dev/null +++ b/orchestrator/api/worker_handler.go @@ -0,0 +1,212 @@ +package api + +import ( + "encoding/json" + "errors" + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/backend" +) + +// WorkerHandler handles /workers/* HTTP requests. +type WorkerHandler struct { + registry *backend.Registry +} + +// NewWorkerHandler creates a WorkerHandler with the given backend registry. +func NewWorkerHandler(registry *backend.Registry) *WorkerHandler { + return &WorkerHandler{registry: registry} +} + +// Create handles POST /workers. +func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { + var req CreateWorkerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + writeError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + if req.Name == "" { + writeError(w, http.StatusBadRequest, "name is required") + return + } + if req.Image == "" { + writeError(w, http.StatusBadRequest, "image is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), req.Backend) + if err != nil { + writeError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + result, err := b.Create(r.Context(), backend.CreateRequest{ + Name: req.Name, + Image: req.Image, + Runtime: req.Runtime, + Env: req.Env, + Network: req.Network, + ExtraHosts: req.ExtraHosts, + WorkingDir: req.WorkingDir, + }) + if err != nil { + log.Printf("[ERROR] create worker %s: %v", req.Name, err) + writeBackendError(w, err) + return + } + + writeJSON(w, http.StatusCreated, toWorkerResponse(result)) +} + +// List handles GET /workers. 
+func (h *WorkerHandler) List(w http.ResponseWriter, r *http.Request) { + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + writeJSON(w, http.StatusOK, WorkerListResponse{Workers: []WorkerResponse{}}) + return + } + + results, err := b.List(r.Context()) + if err != nil { + log.Printf("[ERROR] list workers: %v", err) + writeBackendError(w, err) + return + } + + workers := make([]WorkerResponse, 0, len(results)) + for _, r := range results { + workers = append(workers, toWorkerResponse(&r)) + } + writeJSON(w, http.StatusOK, WorkerListResponse{Workers: workers}) +} + +// Status handles GET /workers/{name}. +func (h *WorkerHandler) Status(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + writeError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + writeError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + result, err := b.Status(r.Context(), name) + if err != nil { + log.Printf("[ERROR] status worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + writeJSON(w, http.StatusOK, toWorkerResponse(result)) +} + +// Start handles POST /workers/{name}/start. +func (h *WorkerHandler) Start(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + writeError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + writeError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + if err := b.Start(r.Context(), name); err != nil { + log.Printf("[ERROR] start worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// Stop handles POST /workers/{name}/stop. 
+func (h *WorkerHandler) Stop(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + writeError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + writeError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + if err := b.Stop(r.Context(), name); err != nil { + log.Printf("[ERROR] stop worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// Delete handles DELETE /workers/{name}. +func (h *WorkerHandler) Delete(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + writeError(w, http.StatusBadRequest, "worker name is required") + return + } + + b, err := h.registry.GetWorkerBackend(r.Context(), "") + if err != nil { + writeError(w, http.StatusServiceUnavailable, err.Error()) + return + } + + if err := b.Delete(r.Context(), name); err != nil { + log.Printf("[ERROR] delete worker %s: %v", name, err) + writeBackendError(w, err) + return + } + + w.WriteHeader(http.StatusNoContent) +} + +// --- helpers --- + +func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { + return WorkerResponse{ + Name: r.Name, + Backend: r.Backend, + Status: r.Status, + ContainerID: r.ContainerID, + AppID: r.AppID, + RawStatus: r.RawStatus, + } +} + +func writeJSON(w http.ResponseWriter, status int, v interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(v); err != nil { + log.Printf("[WARN] failed to write JSON response: %v", err) + } +} + +func writeError(w http.ResponseWriter, status int, message string) { + writeJSON(w, status, ErrorResponse{Message: message}) +} + +// writeBackendError maps typed backend errors to appropriate HTTP status codes. 
+func writeBackendError(w http.ResponseWriter, err error) { + switch { + case errors.Is(err, backend.ErrConflict): + writeError(w, http.StatusConflict, err.Error()) + case errors.Is(err, backend.ErrNotFound): + writeError(w, http.StatusNotFound, err.Error()) + default: + writeError(w, http.StatusInternalServerError, err.Error()) + } +} diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go new file mode 100644 index 00000000..145469fa --- /dev/null +++ b/orchestrator/api/worker_handler_test.go @@ -0,0 +1,377 @@ +package api + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "net/http" + "net/http/httptest" + "testing" + + "github.com/alibaba/hiclaw/orchestrator/backend" +) + +// mockBackend implements backend.WorkerBackend for handler tests. +type mockBackend struct { + name string + available bool + workers map[string]*backend.WorkerResult + + createErr error + startErr error + stopErr error + deleteErr error +} + +func newMockBackend() *mockBackend { + return &mockBackend{ + name: "mock", + available: true, + workers: map[string]*backend.WorkerResult{}, + } +} + +func (m *mockBackend) Name() string { return m.name } +func (m *mockBackend) Available(_ context.Context) bool { return m.available } + +func (m *mockBackend) Create(_ context.Context, req backend.CreateRequest) (*backend.WorkerResult, error) { + if m.createErr != nil { + return nil, m.createErr + } + r := &backend.WorkerResult{ + Name: req.Name, + Backend: "mock", + Status: backend.StatusRunning, + ContainerID: "mock-" + req.Name, + RawStatus: "running", + } + m.workers[req.Name] = r + return r, nil +} + +func (m *mockBackend) Delete(_ context.Context, name string) error { + if m.deleteErr != nil { + return m.deleteErr + } + delete(m.workers, name) + return nil +} + +func (m *mockBackend) Start(_ context.Context, name string) error { + if m.startErr != nil { + return m.startErr + } + if w, ok := m.workers[name]; ok { + w.Status = 
backend.StatusRunning + return nil + } + return backend.ErrNotFound +} + +func (m *mockBackend) Stop(_ context.Context, name string) error { + if m.stopErr != nil { + return m.stopErr + } + if w, ok := m.workers[name]; ok { + w.Status = backend.StatusStopped + return nil + } + return backend.ErrNotFound +} + +func (m *mockBackend) Status(_ context.Context, name string) (*backend.WorkerResult, error) { + if w, ok := m.workers[name]; ok { + return w, nil + } + return &backend.WorkerResult{ + Name: name, + Backend: "mock", + Status: backend.StatusNotFound, + }, nil +} + +func (m *mockBackend) List(_ context.Context) ([]backend.WorkerResult, error) { + results := make([]backend.WorkerResult, 0, len(m.workers)) + for _, w := range m.workers { + results = append(results, *w) + } + return results, nil +} + +func setupHandler(mb *mockBackend) (*WorkerHandler, *http.ServeMux) { + reg := backend.NewRegistry([]backend.WorkerBackend{mb}, nil) + h := NewWorkerHandler(reg) + mux := http.NewServeMux() + mux.HandleFunc("POST /workers", h.Create) + mux.HandleFunc("GET /workers", h.List) + mux.HandleFunc("GET /workers/{name}", h.Status) + mux.HandleFunc("POST /workers/{name}/start", h.Start) + mux.HandleFunc("POST /workers/{name}/stop", h.Stop) + mux.HandleFunc("DELETE /workers/{name}", h.Delete) + return h, mux +} + +func TestCreateWorker(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{ + Name: "alice", + Image: "hiclaw/worker-agent:latest", + }) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusCreated { + t.Fatalf("expected 201, got %d: %s", w.Code, w.Body.String()) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Name != "alice" { + t.Errorf("expected name alice, got %s", resp.Name) + } + if resp.Status != backend.StatusRunning { + t.Errorf("expected status running, 
got %s", resp.Status) + } + if resp.Backend != "mock" { + t.Errorf("expected backend mock, got %s", resp.Backend) + } +} + +func TestCreateWorkerMissingName(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateWorkerMissingImage(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateWorkerInvalidJSON(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader([]byte("not json"))) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} + +func TestCreateWorkerConflict(t *testing.T) { + mb := newMockBackend() + mb.createErr = backend.ErrConflict + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusConflict { + t.Errorf("expected 409, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestCreateWorkerBackendUnavailable(t *testing.T) { + mb := newMockBackend() + mb.available = false + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := 
httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +func TestListWorkersEmpty(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerListResponse + json.NewDecoder(w.Body).Decode(&resp) + if len(resp.Workers) != 0 { + t.Errorf("expected empty list, got %d", len(resp.Workers)) + } +} + +func TestListWorkersNoBackend(t *testing.T) { + mb := newMockBackend() + mb.available = false + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200 even with no backend, got %d", w.Code) + } + + var resp WorkerListResponse + json.NewDecoder(w.Body).Decode(&resp) + if len(resp.Workers) != 0 { + t.Errorf("expected empty list, got %d", len(resp.Workers)) + } +} + +func TestStatusWorker(t *testing.T) { + mb := newMockBackend() + mb.workers["alice"] = &backend.WorkerResult{ + Name: "alice", Backend: "mock", Status: backend.StatusRunning, ContainerID: "mock-alice", + } + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running, got %s", resp.Status) + } +} + +func TestStatusWorkerNotFound(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodGet, "/workers/ghost", nil) + w := 
httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d", w.Code) + } + + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusNotFound { + t.Errorf("expected not_found, got %s", resp.Status) + } +} + +func TestStartWorkerNotFound(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers/ghost/start", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestStopWorkerNotFound(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodPost, "/workers/ghost/stop", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNotFound { + t.Errorf("expected 404, got %d: %s", w.Code, w.Body.String()) + } +} + +func TestDeleteWorker(t *testing.T) { + mb := newMockBackend() + mb.workers["alice"] = &backend.WorkerResult{Name: "alice"} + _, mux := setupHandler(mb) + + req := httptest.NewRequest(http.MethodDelete, "/workers/alice", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusNoContent { + t.Errorf("expected 204, got %d", w.Code) + } + if _, exists := mb.workers["alice"]; exists { + t.Error("expected worker to be deleted") + } +} + +func TestCreateWorkerGenericError(t *testing.T) { + mb := newMockBackend() + mb.createErr = errors.New("something broke") + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + if w.Code != http.StatusInternalServerError { + t.Errorf("expected 500, got %d", w.Code) + } +} + +func TestGatewayStubs(t *testing.T) { + h := NewGatewayHandler() + 
mux := http.NewServeMux() + mux.HandleFunc("POST /gateway/consumers", h.CreateConsumer) + mux.HandleFunc("POST /gateway/consumers/{id}/bind", h.BindConsumer) + mux.HandleFunc("DELETE /gateway/consumers/{id}", h.DeleteConsumer) + + endpoints := []struct { + method string + path string + }{ + {http.MethodPost, "/gateway/consumers"}, + {http.MethodPost, "/gateway/consumers/test-id/bind"}, + {http.MethodDelete, "/gateway/consumers/test-id"}, + } + + for _, ep := range endpoints { + req := httptest.NewRequest(ep.method, ep.path, nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusNotImplemented { + t.Errorf("%s %s: expected 501, got %d", ep.method, ep.path, w.Code) + } + } +} diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go new file mode 100644 index 00000000..4bc53562 --- /dev/null +++ b/orchestrator/backend/backend.go @@ -0,0 +1,72 @@ +package backend + +import ( + "context" + "errors" +) + +// Typed errors for backend operations. +var ( + ErrConflict = errors.New("resource already exists") + ErrNotFound = errors.New("resource not found") +) + +// WorkerStatus represents normalized worker status across backends. +type WorkerStatus string + +const ( + StatusRunning WorkerStatus = "running" + StatusStopped WorkerStatus = "stopped" + StatusStarting WorkerStatus = "starting" + StatusNotFound WorkerStatus = "not_found" + StatusUnknown WorkerStatus = "unknown" +) + +// CreateRequest holds parameters for creating a worker container/instance. +type CreateRequest struct { + Name string `json:"name"` + Image string `json:"image,omitempty"` + Env map[string]string `json:"env,omitempty"` + Runtime string `json:"runtime,omitempty"` // "openclaw" | "copaw" + Network string `json:"network,omitempty"` + ExtraHosts []string `json:"extra_hosts,omitempty"` + WorkingDir string `json:"working_dir,omitempty"` +} + +// WorkerResult holds the result of a worker operation. 
+type WorkerResult struct { + Name string `json:"name"` + Backend string `json:"backend"` + Status WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` +} + +// WorkerBackend defines the interface for worker lifecycle operations. +// Implementations: DockerBackend (local), SAEBackend (Alibaba Cloud), future K8s/ACS. +type WorkerBackend interface { + // Name returns the backend identifier (e.g. "docker", "sae"). + Name() string + + // Available reports whether this backend is usable in the current environment. + Available(ctx context.Context) bool + + // Create creates and starts a new worker. + Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) + + // Delete removes a worker. + Delete(ctx context.Context, name string) error + + // Start starts a stopped worker. + Start(ctx context.Context, name string) error + + // Stop stops a running worker. + Stop(ctx context.Context, name string) error + + // Status returns the current status of a worker. + Status(ctx context.Context, name string) (*WorkerResult, error) + + // List returns all workers managed by this backend. + List(ctx context.Context) ([]WorkerResult, error) +} diff --git a/orchestrator/backend/docker.go b/orchestrator/backend/docker.go new file mode 100644 index 00000000..7fd6b272 --- /dev/null +++ b/orchestrator/backend/docker.go @@ -0,0 +1,350 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "net/url" + "sort" + "strings" + "time" +) + +// DockerBackend manages worker containers via the Docker Engine API over a Unix socket. +type DockerBackend struct { + socketPath string + client *http.Client + containerPrefix string +} + +// NewDockerBackend creates a DockerBackend that talks to the given Docker socket. 
+func NewDockerBackend(socketPath string, containerPrefix string) *DockerBackend { + if containerPrefix == "" { + containerPrefix = "hiclaw-worker-" + } + transport := &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) + }, + } + return &DockerBackend{ + socketPath: socketPath, + client: &http.Client{Transport: transport}, + containerPrefix: containerPrefix, + } +} + +func (d *DockerBackend) Name() string { return "docker" } + +func (d *DockerBackend) Available(ctx context.Context) bool { + if !DockerSocketAvailable(d.socketPath) { + return false + } + // Ping the Docker daemon to verify it's actually responding. + pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second) + defer cancel() + req, err := http.NewRequestWithContext(pingCtx, http.MethodGet, "http://localhost/_ping", nil) + if err != nil { + return false + } + resp, err := d.client.Do(req) + if err != nil { + return false + } + resp.Body.Close() + return resp.StatusCode == http.StatusOK +} + +func (d *DockerBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { + containerName := d.containerPrefix + req.Name + + payload := d.buildCreatePayload(req) + body, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("marshal create payload: %w", err) + } + + u := fmt.Sprintf("http://localhost/containers/create?name=%s", url.QueryEscape(containerName)) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, strings.NewReader(string(body))) + if err != nil { + return nil, fmt.Errorf("build create request: %w", err) + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := d.client.Do(httpReq) + if err != nil { + return nil, fmt.Errorf("docker create: %w", err) + } + defer resp.Body.Close() + + respBody, _ := io.ReadAll(resp.Body) + + if resp.StatusCode == http.StatusConflict { + return nil, fmt.Errorf("%w: container %q", ErrConflict, containerName) + } + if 
resp.StatusCode != http.StatusCreated { + return nil, fmt.Errorf("docker create failed (status %d): %s", resp.StatusCode, string(respBody)) + } + + var createResp struct { + ID string `json:"Id"` + } + if err := json.Unmarshal(respBody, &createResp); err != nil { + return nil, fmt.Errorf("parse create response: %w", err) + } + + if err := d.startContainer(ctx, createResp.ID); err != nil { + return nil, fmt.Errorf("start after create: %w", err) + } + + return &WorkerResult{ + Name: req.Name, + Backend: "docker", + Status: StatusRunning, + ContainerID: createResp.ID, + RawStatus: "running", + }, nil +} + +func (d *DockerBackend) Delete(ctx context.Context, name string) error { + containerName := d.containerPrefix + name + u := fmt.Sprintf("http://localhost/containers/%s?force=true", url.PathEscape(containerName)) + req, err := http.NewRequestWithContext(ctx, http.MethodDelete, u, nil) + if err != nil { + return err + } + resp, err := d.client.Do(req) + if err != nil { + return fmt.Errorf("docker delete: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return nil // already gone + } + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("docker delete failed (status %d): %s", resp.StatusCode, string(body)) + } + return nil +} + +func (d *DockerBackend) Start(ctx context.Context, name string) error { + containerName := d.containerPrefix + name + if err := d.startContainer(ctx, containerName); err != nil { + if strings.Contains(err.Error(), "status 404") { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + return err + } + return nil +} + +func (d *DockerBackend) Stop(ctx context.Context, name string) error { + containerName := d.containerPrefix + name + u := fmt.Sprintf("http://localhost/containers/%s/stop?t=10", url.PathEscape(containerName)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, nil) + if err != nil { + 
return err + } + resp, err := d.client.Do(req) + if err != nil { + return fmt.Errorf("docker stop: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + if resp.StatusCode == http.StatusNotModified { + return nil // already stopped + } + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("docker stop failed (status %d): %s", resp.StatusCode, string(body)) + } + return nil +} + +func (d *DockerBackend) Status(ctx context.Context, name string) (*WorkerResult, error) { + containerName := d.containerPrefix + name + u := fmt.Sprintf("http://localhost/containers/%s/json", url.PathEscape(containerName)) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, err + } + resp, err := d.client.Do(req) + if err != nil { + return nil, fmt.Errorf("docker inspect: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return &WorkerResult{ + Name: name, + Backend: "docker", + Status: StatusNotFound, + }, nil + } + + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("docker inspect failed (status %d): %s", resp.StatusCode, string(body)) + } + + var inspectResp struct { + ID string `json:"Id"` + State struct { + Status string `json:"Status"` + } `json:"State"` + } + if err := json.Unmarshal(body, &inspectResp); err != nil { + return nil, fmt.Errorf("parse inspect response: %w", err) + } + + return &WorkerResult{ + Name: name, + Backend: "docker", + Status: normalizeDockerStatus(inspectResp.State.Status), + ContainerID: inspectResp.ID, + RawStatus: inspectResp.State.Status, + }, nil +} + +func (d *DockerBackend) List(ctx context.Context) ([]WorkerResult, error) { + filters, _ := json.Marshal(map[string][]string{ + "name": {d.containerPrefix}, + }) + u := 
fmt.Sprintf("http://localhost/containers/json?all=true&filters=%s", url.QueryEscape(string(filters))) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return nil, err + } + resp, err := d.client.Do(req) + if err != nil { + return nil, fmt.Errorf("docker list: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("docker list failed (status %d): %s", resp.StatusCode, string(body)) + } + + var containers []struct { + ID string `json:"Id"` + Names []string `json:"Names"` + State string `json:"State"` + } + if err := json.Unmarshal(body, &containers); err != nil { + return nil, fmt.Errorf("parse list response: %w", err) + } + + results := make([]WorkerResult, 0, len(containers)) + for _, c := range containers { + name := "" + for _, n := range c.Names { + n = strings.TrimPrefix(n, "/") + if strings.HasPrefix(n, d.containerPrefix) { + name = strings.TrimPrefix(n, d.containerPrefix) + break + } + } + if name == "" { + continue + } + results = append(results, WorkerResult{ + Name: name, + Backend: "docker", + Status: normalizeDockerStatus(c.State), + ContainerID: c.ID, + RawStatus: c.State, + }) + } + return results, nil +} + +// --- internal helpers --- + +func (d *DockerBackend) startContainer(ctx context.Context, nameOrID string) error { + u := fmt.Sprintf("http://localhost/containers/%s/start", url.PathEscape(nameOrID)) + req, err := http.NewRequestWithContext(ctx, http.MethodPost, u, nil) + if err != nil { + return err + } + resp, err := d.client.Do(req) + if err != nil { + return fmt.Errorf("docker start: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotModified { + return nil // already running + } + if resp.StatusCode == http.StatusNotFound { + return fmt.Errorf("docker start failed (status 404): container not found") + } + if resp.StatusCode != http.StatusNoContent && resp.StatusCode != http.StatusOK { + 
body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("docker start failed (status %d): %s", resp.StatusCode, string(body)) + } + return nil +} + +// dockerCreatePayload is the Docker Engine API container create body. +type dockerCreatePayload struct { + Image string `json:"Image"` + Env []string `json:"Env,omitempty"` + WorkingDir string `json:"WorkingDir,omitempty"` + HostConfig *dockerHostConfig `json:"HostConfig,omitempty"` +} + +type dockerHostConfig struct { + NetworkMode string `json:"NetworkMode,omitempty"` + ExtraHosts []string `json:"ExtraHosts,omitempty"` +} + +func (d *DockerBackend) buildCreatePayload(req CreateRequest) dockerCreatePayload { + // Sort env keys for deterministic output + keys := make([]string, 0, len(req.Env)) + for k := range req.Env { + keys = append(keys, k) + } + sort.Strings(keys) + + envList := make([]string, 0, len(req.Env)) + for _, k := range keys { + envList = append(envList, k+"="+req.Env[k]) + } + + p := dockerCreatePayload{ + Image: req.Image, + Env: envList, + WorkingDir: req.WorkingDir, + } + + if req.Network != "" || len(req.ExtraHosts) > 0 { + p.HostConfig = &dockerHostConfig{ + NetworkMode: req.Network, + ExtraHosts: req.ExtraHosts, + } + } + + return p +} + +func normalizeDockerStatus(status string) WorkerStatus { + switch strings.ToLower(status) { + case "running": + return StatusRunning + case "exited", "dead": + return StatusStopped + case "created", "restarting": + return StatusStarting + default: + return StatusUnknown + } +} diff --git a/orchestrator/backend/docker_test.go b/orchestrator/backend/docker_test.go new file mode 100644 index 00000000..7b00665f --- /dev/null +++ b/orchestrator/backend/docker_test.go @@ -0,0 +1,362 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "strings" + "testing" +) + +// mockDockerAPI creates a test HTTP server that simulates Docker Engine API responses. 
+func mockDockerAPI(t *testing.T) *httptest.Server { + t.Helper() + + // In-memory container store + containers := map[string]map[string]interface{}{} + + mux := http.NewServeMux() + + // POST /containers/create?name=xxx + mux.HandleFunc("POST /containers/create", func(w http.ResponseWriter, r *http.Request) { + name := r.URL.Query().Get("name") + if _, exists := containers[name]; exists { + w.WriteHeader(http.StatusConflict) + json.NewEncoder(w).Encode(map[string]string{"message": "conflict"}) + return + } + var body map[string]interface{} + json.NewDecoder(r.Body).Decode(&body) + id := fmt.Sprintf("sha256-%s", name) + containers[name] = map[string]interface{}{ + "Id": id, + "Name": "/" + name, + "State": map[string]interface{}{"Status": "created"}, + "Image": body["Image"], + } + w.WriteHeader(http.StatusCreated) + json.NewEncoder(w).Encode(map[string]string{"Id": id}) + }) + + // POST /containers/{id}/start + mux.HandleFunc("POST /containers/{id}/start", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for _, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + state := c["State"].(map[string]interface{}) + state["Status"] = "running" + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // POST /containers/{id}/stop + mux.HandleFunc("POST /containers/{id}/stop", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for _, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + state := c["State"].(map[string]interface{}) + state["Status"] = "exited" + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // GET /containers/{id}/json + mux.HandleFunc("GET /containers/{id}/json", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for 
_, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + json.NewEncoder(w).Encode(c) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // DELETE /containers/{id} + mux.HandleFunc("DELETE /containers/{id}", func(w http.ResponseWriter, r *http.Request) { + id := r.PathValue("id") + for name, c := range containers { + if c["Id"] == id || c["Name"] == "/"+id { + delete(containers, name) + w.WriteHeader(http.StatusNoContent) + return + } + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // GET /containers/json (list) + mux.HandleFunc("GET /containers/json", func(w http.ResponseWriter, r *http.Request) { + var result []map[string]interface{} + for name, c := range containers { + state := c["State"].(map[string]interface{}) + result = append(result, map[string]interface{}{ + "Id": c["Id"], + "Names": []string{"/" + name}, + "State": state["Status"], + }) + } + if result == nil { + result = []map[string]interface{}{} + } + json.NewEncoder(w).Encode(result) + }) + + return httptest.NewServer(mux) +} + +func newTestDockerBackend(t *testing.T, serverURL string) *DockerBackend { + t.Helper() + client := &http.Client{} + b := &DockerBackend{ + client: client, + containerPrefix: "hiclaw-worker-", + } + // Patch all requests to go to the test server instead of Unix socket + b.client = &http.Client{ + Transport: &testTransport{serverURL: serverURL}, + } + return b +} + +// testTransport redirects requests from http://localhost/... to the test server. 
+type testTransport struct { + serverURL string +} + +func (t *testTransport) RoundTrip(req *http.Request) (*http.Response, error) { + req.URL.Scheme = "http" + req.URL.Host = strings.TrimPrefix(t.serverURL, "http://") + return http.DefaultTransport.RoundTrip(req) +} + +func TestDockerCreate(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + result, err := b.Create(context.Background(), CreateRequest{ + Name: "alice", + Image: "hiclaw/worker-agent:latest", + Network: "hiclaw-net", + Env: map[string]string{"HICLAW_WORKER_NAME": "alice"}, + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected name alice, got %s", result.Name) + } + if result.Backend != "docker" { + t.Errorf("expected backend docker, got %s", result.Backend) + } + if result.Status != StatusRunning { + t.Errorf("expected status running, got %s", result.Status) + } + if result.ContainerID == "" { + t.Error("expected non-empty container ID") + } +} + +func TestDockerCreateConflict(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) + if err != nil { + t.Fatalf("first create failed: %v", err) + } + + _, err = b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) + if err == nil { + t.Error("expected conflict error on duplicate create") + } +} + +func TestDockerStatus(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Create a worker first + _, err := b.Create(context.Background(), CreateRequest{Name: "bob", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + result, err := b.Status(context.Background(), "bob") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected 
running, got %s", result.Status) + } +} + +func TestDockerStatusNotFound(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + result, err := b.Status(context.Background(), "nonexistent") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusNotFound { + t.Errorf("expected not_found, got %s", result.Status) + } +} + +func TestDockerStop(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "carol", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + if err := b.Stop(context.Background(), "carol"); err != nil { + t.Fatalf("Stop failed: %v", err) + } + + result, err := b.Status(context.Background(), "carol") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusStopped { + t.Errorf("expected stopped, got %s", result.Status) + } +} + +func TestDockerStartStopped(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "dave", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + b.Stop(context.Background(), "dave") + + if err := b.Start(context.Background(), "dave"); err != nil { + t.Fatalf("Start failed: %v", err) + } + + result, err := b.Status(context.Background(), "dave") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected running after start, got %s", result.Status) + } +} + +func TestDockerDelete(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + _, err := b.Create(context.Background(), CreateRequest{Name: "eve", Image: "img:latest"}) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + if err := b.Delete(context.Background(), "eve"); err 
!= nil { + t.Fatalf("Delete failed: %v", err) + } + + result, err := b.Status(context.Background(), "eve") + if err != nil { + t.Fatalf("Status failed: %v", err) + } + if result.Status != StatusNotFound { + t.Errorf("expected not_found after delete, got %s", result.Status) + } +} + +func TestDockerDeleteNotFound(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Deleting a non-existent container should not error + if err := b.Delete(context.Background(), "ghost"); err != nil { + t.Errorf("Delete of non-existent should not error, got: %v", err) + } +} + +func TestDockerList(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Empty list + workers, err := b.List(context.Background()) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(workers) != 0 { + t.Errorf("expected empty list, got %d", len(workers)) + } + + // Create two workers + b.Create(context.Background(), CreateRequest{Name: "w1", Image: "img:latest"}) + b.Create(context.Background(), CreateRequest{Name: "w2", Image: "img:latest"}) + + workers, err = b.List(context.Background()) + if err != nil { + t.Fatalf("List failed: %v", err) + } + if len(workers) != 2 { + t.Errorf("expected 2 workers, got %d", len(workers)) + } + + names := map[string]bool{} + for _, w := range workers { + names[w.Name] = true + if w.Backend != "docker" { + t.Errorf("expected backend docker, got %s", w.Backend) + } + } + if !names["w1"] || !names["w2"] { + t.Errorf("expected workers w1 and w2, got %v", names) + } +} + +func TestNormalizeDockerStatus(t *testing.T) { + cases := []struct { + input string + expected WorkerStatus + }{ + {"running", StatusRunning}, + {"Running", StatusRunning}, + {"exited", StatusStopped}, + {"dead", StatusStopped}, + {"created", StatusStarting}, + {"restarting", StatusStarting}, + {"paused", StatusUnknown}, + {"", StatusUnknown}, + } + for _, tc := range cases { + got := 
normalizeDockerStatus(tc.input) + if got != tc.expected { + t.Errorf("normalizeDockerStatus(%q) = %s, want %s", tc.input, got, tc.expected) + } + } +} diff --git a/orchestrator/backend/gateway.go b/orchestrator/backend/gateway.go new file mode 100644 index 00000000..c29ec854 --- /dev/null +++ b/orchestrator/backend/gateway.go @@ -0,0 +1,43 @@ +package backend + +import "context" + +// ConsumerRequest holds parameters for creating a gateway consumer. +type ConsumerRequest struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id,omitempty"` +} + +// ConsumerResult holds the result of a consumer operation. +type ConsumerResult struct { + Name string `json:"name"` + ConsumerID string `json:"consumer_id"` + APIKey string `json:"api_key"` + Status string `json:"status"` // "created" | "exists" +} + +// BindRequest holds parameters for binding a consumer to a model API. +type BindRequest struct { + ConsumerID string `json:"consumer_id"` + ModelAPIID string `json:"model_api_id"` + EnvID string `json:"env_id"` +} + +// GatewayBackend defines the interface for AI Gateway consumer management. +// Implementations: HigressBackend (local), APIGBackend (Alibaba Cloud). +type GatewayBackend interface { + // Name returns the backend identifier (e.g. "higress", "apig"). + Name() string + + // Available reports whether this backend is usable in the current environment. + Available(ctx context.Context) bool + + // CreateConsumer creates a gateway consumer with key-auth credentials. + CreateConsumer(ctx context.Context, req ConsumerRequest) (*ConsumerResult, error) + + // BindConsumer binds a consumer to a model API resource. + BindConsumer(ctx context.Context, req BindRequest) error + + // DeleteConsumer removes a gateway consumer. 
+ DeleteConsumer(ctx context.Context, consumerID string) error +} diff --git a/orchestrator/backend/registry.go b/orchestrator/backend/registry.go new file mode 100644 index 00000000..7265981a --- /dev/null +++ b/orchestrator/backend/registry.go @@ -0,0 +1,93 @@ +package backend + +import ( + "context" + "fmt" + "log" + "os" +) + +// Registry holds all available backends and provides auto-detection. +type Registry struct { + workerBackends []WorkerBackend + gatewayBackends []GatewayBackend +} + +// NewRegistry creates a Registry with the given backends. +func NewRegistry(workers []WorkerBackend, gateways []GatewayBackend) *Registry { + return &Registry{ + workerBackends: workers, + gatewayBackends: gateways, + } +} + +// DetectWorkerBackend returns the first available worker backend. +// Priority matches _detect_worker_backend() in container-api.sh: +// 1. Docker backend (socket available) +// 2. SAE backend (HICLAW_RUNTIME=aliyun) +// 3. nil +func (r *Registry) DetectWorkerBackend(ctx context.Context) WorkerBackend { + for _, b := range r.workerBackends { + if b.Available(ctx) { + log.Printf("Auto-detected worker backend: %s", b.Name()) + return b + } + } + return nil +} + +// GetWorkerBackend returns a specific worker backend by name, or auto-detects if name is empty. +func (r *Registry) GetWorkerBackend(ctx context.Context, name string) (WorkerBackend, error) { + if name == "" { + b := r.DetectWorkerBackend(ctx) + if b == nil { + return nil, fmt.Errorf("no worker backend available") + } + return b, nil + } + for _, b := range r.workerBackends { + if b.Name() == name { + return b, nil + } + } + return nil, fmt.Errorf("unknown worker backend: %q", name) +} + +// DetectGatewayBackend returns the first available gateway backend. 
+func (r *Registry) DetectGatewayBackend(ctx context.Context) GatewayBackend { + for _, b := range r.gatewayBackends { + if b.Available(ctx) { + log.Printf("Auto-detected gateway backend: %s", b.Name()) + return b + } + } + return nil +} + +// GetGatewayBackend returns a specific gateway backend by name, or auto-detects if name is empty. +func (r *Registry) GetGatewayBackend(ctx context.Context, name string) (GatewayBackend, error) { + if name == "" { + b := r.DetectGatewayBackend(ctx) + if b == nil { + return nil, fmt.Errorf("no gateway backend available") + } + return b, nil + } + for _, b := range r.gatewayBackends { + if b.Name() == name { + return b, nil + } + } + return nil, fmt.Errorf("unknown gateway backend: %q", name) +} + +// DockerSocketAvailable checks if the Docker socket is accessible. +func DockerSocketAvailable(socketPath string) bool { + _, err := os.Stat(socketPath) + return err == nil +} + +// IsAliyunRuntime checks if HICLAW_RUNTIME is set to "aliyun". +func IsAliyunRuntime() bool { + return os.Getenv("HICLAW_RUNTIME") == "aliyun" +} diff --git a/orchestrator/backend/registry_test.go b/orchestrator/backend/registry_test.go new file mode 100644 index 00000000..5c7fde22 --- /dev/null +++ b/orchestrator/backend/registry_test.go @@ -0,0 +1,116 @@ +package backend + +import ( + "context" + "testing" +) + +// mockWorkerBackend implements WorkerBackend for testing. 
+type mockWorkerBackend struct { + name string + available bool +} + +func (m *mockWorkerBackend) Name() string { return m.name } +func (m *mockWorkerBackend) Available(_ context.Context) bool { return m.available } +func (m *mockWorkerBackend) Create(_ context.Context, _ CreateRequest) (*WorkerResult, error) { return nil, nil } +func (m *mockWorkerBackend) Delete(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Start(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Stop(_ context.Context, _ string) error { return nil } +func (m *mockWorkerBackend) Status(_ context.Context, _ string) (*WorkerResult, error) { return nil, nil } +func (m *mockWorkerBackend) List(_ context.Context) ([]WorkerResult, error) { return nil, nil } + +// mockGatewayBackend implements GatewayBackend for testing. +type mockGatewayBackend struct { + name string + available bool +} + +func (m *mockGatewayBackend) Name() string { return m.name } +func (m *mockGatewayBackend) Available(_ context.Context) bool { return m.available } +func (m *mockGatewayBackend) CreateConsumer(_ context.Context, _ ConsumerRequest) (*ConsumerResult, error) { return nil, nil } +func (m *mockGatewayBackend) BindConsumer(_ context.Context, _ BindRequest) error { return nil } +func (m *mockGatewayBackend) DeleteConsumer(_ context.Context, _ string) error { return nil } + +func TestDetectWorkerBackend_Priority(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + sae := &mockWorkerBackend{name: "sae", available: true} + + reg := NewRegistry([]WorkerBackend{docker, sae}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got == nil || got.Name() != "docker" { + t.Errorf("expected docker backend (first available), got %v", got) + } +} + +func TestDetectWorkerBackend_Fallback(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: false} + sae := &mockWorkerBackend{name: "sae", available: true} + + reg := 
NewRegistry([]WorkerBackend{docker, sae}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got == nil || got.Name() != "sae" { + t.Errorf("expected sae backend (fallback), got %v", got) + } +} + +func TestDetectWorkerBackend_None(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: false} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + got := reg.DetectWorkerBackend(context.Background()) + if got != nil { + t.Errorf("expected nil when no backend available, got %v", got.Name()) + } +} + +func TestGetWorkerBackend_ByName(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + sae := &mockWorkerBackend{name: "sae", available: false} + + reg := NewRegistry([]WorkerBackend{docker, sae}, nil) + + got, err := reg.GetWorkerBackend(context.Background(), "sae") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.Name() != "sae" { + t.Errorf("expected sae, got %s", got.Name()) + } +} + +func TestGetWorkerBackend_UnknownName(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + + _, err := reg.GetWorkerBackend(context.Background(), "k8s") + if err == nil { + t.Error("expected error for unknown backend") + } +} + +func TestGetWorkerBackend_AutoDetect(t *testing.T) { + docker := &mockWorkerBackend{name: "docker", available: true} + + reg := NewRegistry([]WorkerBackend{docker}, nil) + + got, err := reg.GetWorkerBackend(context.Background(), "") + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got.Name() != "docker" { + t.Errorf("expected docker, got %s", got.Name()) + } +} + +func TestDetectGatewayBackend(t *testing.T) { + higress := &mockGatewayBackend{name: "higress", available: false} + apig := &mockGatewayBackend{name: "apig", available: true} + + reg := NewRegistry(nil, []GatewayBackend{higress, apig}) + got := reg.DetectGatewayBackend(context.Background()) + if got == nil || got.Name() != 
"apig" { + t.Errorf("expected apig backend, got %v", got) + } +} diff --git a/orchestrator/config.go b/orchestrator/config.go new file mode 100644 index 00000000..0349ab7d --- /dev/null +++ b/orchestrator/config.go @@ -0,0 +1,36 @@ +package main + +import "os" + +// Config holds all configuration for the orchestrator service. +type Config struct { + // ListenAddr is the address to listen on (default ":2375"). + ListenAddr string + + // SocketPath is the Docker socket path (default "/var/run/docker.sock"). + SocketPath string + + // ContainerPrefix is the required prefix for worker container names (default "hiclaw-worker-"). + ContainerPrefix string + + // Runtime is the deployment runtime ("aliyun" for cloud, empty for local). + Runtime string +} + +// LoadConfig reads configuration from environment variables. +func LoadConfig() *Config { + c := &Config{ + ListenAddr: envOrDefault("HICLAW_PROXY_LISTEN", ":2375"), + SocketPath: envOrDefault("HICLAW_PROXY_SOCKET", "/var/run/docker.sock"), + ContainerPrefix: envOrDefault("HICLAW_PROXY_CONTAINER_PREFIX", "hiclaw-worker-"), + Runtime: os.Getenv("HICLAW_RUNTIME"), + } + return c +} + +func envOrDefault(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} diff --git a/orchestrator/go.mod b/orchestrator/go.mod new file mode 100644 index 00000000..e825d291 --- /dev/null +++ b/orchestrator/go.mod @@ -0,0 +1,3 @@ +module github.com/alibaba/hiclaw/orchestrator + +go 1.23 diff --git a/orchestrator/main.go b/orchestrator/main.go new file mode 100644 index 00000000..4d3db865 --- /dev/null +++ b/orchestrator/main.go @@ -0,0 +1,64 @@ +package main + +import ( + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/api" + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/proxy" +) + +func main() { + cfg := LoadConfig() + + // --- Security validator (for Docker API passthrough) --- + validator := proxy.NewSecurityValidator() + + // 
--- Docker API passthrough handler --- + proxyHandler := proxy.NewHandler(cfg.SocketPath, validator) + + // --- Backend registry --- + var workerBackends []backend.WorkerBackend + + // Docker backend (always registered; Available() checks socket at runtime) + dockerBackend := backend.NewDockerBackend(cfg.SocketPath, cfg.ContainerPrefix) + workerBackends = append(workerBackends, dockerBackend) + + // Future: SAE backend (Phase 2) + // if cfg.Runtime == "aliyun" { ... } + + registry := backend.NewRegistry(workerBackends, nil) + + // --- API handlers --- + workerHandler := api.NewWorkerHandler(registry) + gatewayHandler := api.NewGatewayHandler() + + // --- Route registration --- + mux := http.NewServeMux() + + // Worker lifecycle API + mux.HandleFunc("POST /workers", workerHandler.Create) + mux.HandleFunc("GET /workers", workerHandler.List) + mux.HandleFunc("GET /workers/{name}", workerHandler.Status) + mux.HandleFunc("POST /workers/{name}/start", workerHandler.Start) + mux.HandleFunc("POST /workers/{name}/stop", workerHandler.Stop) + mux.HandleFunc("DELETE /workers/{name}", workerHandler.Delete) + + // Gateway API (Phase 1: 501 stubs) + mux.HandleFunc("POST /gateway/consumers", gatewayHandler.CreateConsumer) + mux.HandleFunc("POST /gateway/consumers/{id}/bind", gatewayHandler.BindConsumer) + mux.HandleFunc("DELETE /gateway/consumers/{id}", gatewayHandler.DeleteConsumer) + + // Docker API passthrough (catch-all, existing behavior) + mux.Handle("/", proxyHandler) + + // --- Start server --- + log.Printf("hiclaw-orchestrator listening on %s, docker socket: %s", cfg.ListenAddr, cfg.SocketPath) + if len(validator.AllowedRegistries) > 0 { + log.Printf("Allowed registries: %v", validator.AllowedRegistries) + } + if err := http.ListenAndServe(cfg.ListenAddr, mux); err != nil { + log.Fatalf("Failed to start server: %v", err) + } +} diff --git a/orchestrator/proxy/proxy.go b/orchestrator/proxy/proxy.go new file mode 100644 index 00000000..5cd1a2dd --- /dev/null +++ 
b/orchestrator/proxy/proxy.go @@ -0,0 +1,120 @@ +package proxy + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "io" + "log" + "net" + "net/http" + "net/http/httputil" + "regexp" +) + +var ( + // URL patterns for POST/DELETE allowlist + containerAction = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/(start|stop|kill|restart|wait|resize|attach|logs)$`) + containerExec = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+/exec$`) + containerCreate = regexp.MustCompile(`^(/v[\d.]+)?/containers/create$`) + containerDelete = regexp.MustCompile(`^(/v[\d.]+)?/containers/[a-zA-Z0-9_.-]+$`) + execStart = regexp.MustCompile(`^(/v[\d.]+)?/exec/[a-zA-Z0-9]+/(start|resize|json)$`) + imageCreate = regexp.MustCompile(`^(/v[\d.]+)?/images/create$`) +) + +// Handler is a Docker API reverse proxy with security validation. +type Handler struct { + proxy *httputil.ReverseProxy + validator *SecurityValidator +} + +// NewHandler creates a Docker API proxy handler that forwards to the given socket. +func NewHandler(socketPath string, validator *SecurityValidator) *Handler { + transport := &http.Transport{ + DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { + return net.Dial("unix", socketPath) + }, + } + + proxy := &httputil.ReverseProxy{ + Director: func(req *http.Request) { + req.URL.Scheme = "http" + req.URL.Host = "localhost" + }, + Transport: transport, + } + + return &Handler{ + proxy: proxy, + validator: validator, + } +} + +// ServeHTTP handles Docker API requests with security filtering. 
+func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + path := r.URL.Path + + // GET/HEAD requests are read-only, always allow + if r.Method == http.MethodGet || r.Method == http.MethodHead { + h.proxy.ServeHTTP(w, r) + return + } + + // POST/DELETE allowlist + switch { + case r.Method == http.MethodPost && containerCreate.MatchString(path): + h.handleContainerCreate(w, r) + return + + case r.Method == http.MethodPost && containerAction.MatchString(path): + // start/stop/kill/restart/wait/resize/attach/logs — allow + case r.Method == http.MethodPost && containerExec.MatchString(path): + // exec create — allow + case r.Method == http.MethodPost && execStart.MatchString(path): + // exec start — allow + case r.Method == http.MethodPost && imageCreate.MatchString(path): + // image pull — allow + case r.Method == http.MethodDelete && containerDelete.MatchString(path): + // container remove — allow + + default: + log.Printf("[DENIED] %s %s", r.Method, r.URL.String()) + http.Error(w, fmt.Sprintf(`{"message":"hiclaw-orchestrator: %s %s is not allowed"}`, r.Method, path), http.StatusForbidden) + return + } + + h.proxy.ServeHTTP(w, r) +} + +func (h *Handler) handleContainerCreate(w http.ResponseWriter, r *http.Request) { + body, err := io.ReadAll(r.Body) + r.Body.Close() + if err != nil { + http.Error(w, `{"message":"hiclaw-orchestrator: failed to read request body"}`, http.StatusBadRequest) + return + } + + containerName := r.URL.Query().Get("name") + + var req ContainerCreateRequest + if err := json.Unmarshal(body, &req); err != nil { + http.Error(w, `{"message":"hiclaw-orchestrator: invalid JSON in request body"}`, http.StatusBadRequest) + return + } + + if err := h.validator.ValidateContainerCreate(req, containerName); err != nil { + log.Printf("[BLOCKED] POST /containers/create name=%s: %s", containerName, err) + msg, _ := json.Marshal(map[string]string{"message": fmt.Sprintf("hiclaw-orchestrator: %s", err)}) + w.Header().Set("Content-Type", 
"application/json") + w.WriteHeader(http.StatusForbidden) + w.Write(msg) + return + } + + log.Printf("[ALLOWED] POST /containers/create name=%s image=%s", containerName, req.Image) + + r.Body = io.NopCloser(bytes.NewReader(body)) + r.ContentLength = int64(len(body)) + h.proxy.ServeHTTP(w, r) +} diff --git a/docker-proxy/security.go b/orchestrator/proxy/security.go similarity index 99% rename from docker-proxy/security.go rename to orchestrator/proxy/security.go index ab38f43c..ca166f6d 100644 --- a/docker-proxy/security.go +++ b/orchestrator/proxy/security.go @@ -1,4 +1,4 @@ -package main +package proxy import ( "fmt" diff --git a/docker-proxy/security_test.go b/orchestrator/proxy/security_test.go similarity index 99% rename from docker-proxy/security_test.go rename to orchestrator/proxy/security_test.go index 5260e246..75713c3e 100644 --- a/docker-proxy/security_test.go +++ b/orchestrator/proxy/security_test.go @@ -1,4 +1,4 @@ -package main +package proxy import ( "testing" From 9ef6a829aadf7721a73143d4046f7545697ab651 Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 11:17:11 +0800 Subject: [PATCH 02/11] chore: remove CLAUDE.md from tracking and add to .gitignore Co-Authored-By: Claude Opus 4.6 --- .gitignore | 3 ++ CLAUDE.md | 100 ----------------------------------------------------- 2 files changed, 3 insertions(+), 100 deletions(-) delete mode 100644 CLAUDE.md diff --git a/.gitignore b/.gitignore index 4c109145..9e43d3b4 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,6 @@ manager/copaw-agent/ node_modules/ package.json pnpm-lock.yaml + +# Claude Code +CLAUDE.md diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 43bd350b..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,100 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. 
- -## What is HiClaw - -HiClaw is an open-source Collaborative Multi-Agent OS using Matrix protocol for human-in-the-loop task coordination. A Manager Agent coordinates Worker Agents, with all communication visible in Matrix rooms. Infrastructure: Higress AI Gateway, Tuwunel Matrix Server, MinIO file storage, Element Web client. - -## Build & Test Commands - -```bash -make build # Build all images (native arch) -make build-manager # Build Manager image only -make build-worker # Build Worker image only -make build-copaw-worker # Build CoPaw Worker image only -make build-orchestrator # Build Orchestrator image only (Go) -make build-openclaw-base # Build base image (rarely needed) - -make test # Build + install + run all integration tests -make test SKIP_INSTALL=1 # Run tests against existing Manager -make test TEST_FILTER="01 02" # Run specific tests only -make test-quick # Smoke test (test-01 only) - -make install # Build + install Manager locally -make uninstall # Stop + remove all containers - -make status # Show all hiclaw container statuses -make logs # Show recent logs (LINES=N to override) -``` - -Orchestrator has its own Go test suite: -```bash -cd orchestrator && go test ./... # Run Go unit tests -cd orchestrator && go test ./backend/... # Run backend tests only -cd orchestrator && go test ./proxy/... # Run security validation tests only -``` - -## Local Full Build (from modified openclaw-base) - -Image dependency: `openclaw-base` → `manager` / `worker`. CoPaw and orchestrator are independent. - -When building from a locally modified openclaw-base, you must override both variables: -```bash -make build-openclaw-base -make build-manager build-worker OPENCLAW_BASE_IMAGE=hiclaw/openclaw-base OPENCLAW_BASE_VERSION=latest -``` - -Without `OPENCLAW_BASE_IMAGE=hiclaw/openclaw-base`, it pulls from the remote registry instead of using your local build. 
- -## Architecture - -``` -manager/ # All-in-one container: Higress + Tuwunel + MinIO + Element Web + OpenClaw Agent - agent/ # Agent personality (SOUL.md), skills, tools — read by Agent at runtime - scripts/init/ # Supervisord startup scripts for each service - configs/ # Configuration templates (rendered at container start) - supervisord.conf # Process orchestration - -worker/ # OpenClaw Worker container (Node.js 22) -copaw/ # CoPaw Worker container (Python 3.11, alternative runtime) -orchestrator/ # Go-based Worker lifecycle service (unified API + Docker proxy) -openclaw-base/ # Shared base image for manager + worker -shared/lib/ # Shared shell libraries (env bootstrap, credential mgmt, mc wrapper) -install/ # One-click installation scripts (bash + PowerShell) -tests/ # Integration test suite (14 cases) - lib/ # Test helpers: assertions, Matrix client, Higress client, MinIO client -``` - -## Key Conventions - -**Agent-facing content** (`manager/agent/**`): Written in second-person voice addressing the Agent directly ("You are...", "Your responsibilities..."). Never use third-person ("The Manager does X"). This applies to SOUL.md, AGENTS.md, HEARTBEAT.md, SKILL.md, TOOLS.md, and all worker-agent configs. - -**Changelog policy**: Any change to `manager/`, `worker/`, `copaw/`, or `openclaw-base/` must be recorded in `changelog/current.md` before committing. Format: one bullet per logical change with linked commit hash. - -**Shared build context**: Manager, Worker, and CoPaw Dockerfiles use `--build-context shared=./shared/lib` for shared shell libraries. The Makefile handles this automatically. - -**Worker container naming**: All Worker containers must be prefixed `hiclaw-worker-` (enforced by orchestrator security validation). - -## Integration Tests - -Tests live in `tests/` and use bash-based helpers (`tests/lib/`). Each test is a standalone script (`tests/test-NN-*.sh`) that communicates with the Manager via Matrix API. 
Tests require a running Manager container with all services healthy. - -Key test helpers: -- `tests/lib/test-helpers.sh` — assertions, lifecycle, logging -- `tests/lib/matrix-client.sh` — Matrix API wrapper (send messages, read rooms) -- `tests/lib/higress-client.sh` — Higress Console API wrapper -- `tests/lib/minio-client.sh` — MinIO verification - -## Deployment Modes - -- **Local**: All-in-one container with supervisord, Docker socket mounted for Worker management -- **Cloud (Alibaba SAE)**: Distributed containers, STS credential management, orchestrator for secure container API access - -## Verified Technical Details - -- Tuwunel uses `CONDUWUIT_` env prefix (not `TUWUNEL_`) -- Higress Console uses Session Cookie auth (not Basic Auth) -- MCP Server created via `PUT` (not `POST`) -- Auth plugin takes ~40s to activate after first configuration -- OpenClaw Skills auto-load from `workspace/skills//SKILL.md` From a45b66241105b2aa798a01441868f3a5b5eddfa7 Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 14:09:45 +0800 Subject: [PATCH 03/11] feat(orchestrator): add SAE backend, APIG gateway, auth, STS token service MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the orchestrator refactoring. Transforms the service from a Docker-only proxy into a full cloud-capable control plane. - SAE Backend: manage worker lifecycle via Alibaba Cloud SAE API (Go SDK v4) - APIG Backend: manage AI Gateway consumers (Go SDK v6) - Auth middleware: two-tier auth with static manager key + per-worker API keys - STS Token Service: centralized credential issuance with per-worker OSS policy - OSS key persistence: worker API keys stored in OSS for recovery across restarts - Worker shell rewrite: oss-credentials.sh now uses orchestrator-mediated STS refresh - Shared httputil package: consolidated writeJSON/writeError across packages Workers have no OIDC capability — orchestrator is the sole credential issuer. 
Co-Authored-By: Claude Opus 4.6 --- orchestrator/Dockerfile | 6 +- orchestrator/api/gateway_handler.go | 96 +++++- orchestrator/api/types.go | 6 +- orchestrator/api/worker_handler.go | 87 +++--- orchestrator/api/worker_handler_test.go | 9 +- orchestrator/auth/keys.go | 146 +++++++++ orchestrator/auth/keys_persist.go | 159 ++++++++++ orchestrator/auth/middleware.go | 88 ++++++ orchestrator/auth/middleware_test.go | 214 +++++++++++++ orchestrator/backend/apig.go | 254 ++++++++++++++++ orchestrator/backend/apig_test.go | 237 +++++++++++++++ orchestrator/backend/cloud_credentials.go | 57 ++++ orchestrator/backend/sae.go | 336 +++++++++++++++++++++ orchestrator/backend/sae_test.go | 260 ++++++++++++++++ orchestrator/config.go | 107 ++++++- orchestrator/credentials/handler.go | 46 +++ orchestrator/credentials/handler_test.go | 101 +++++++ orchestrator/credentials/sts.go | 149 +++++++++ orchestrator/credentials/sts_test.go | 136 +++++++++ orchestrator/credentials/types.go | 12 + orchestrator/go.mod | 21 ++ orchestrator/go.sum | 251 +++++++++++++++ orchestrator/internal/httputil/response.go | 26 ++ orchestrator/main.go | 106 +++++-- shared/lib/oss-credentials.sh | 93 ++---- 25 files changed, 2859 insertions(+), 144 deletions(-) create mode 100644 orchestrator/auth/keys.go create mode 100644 orchestrator/auth/keys_persist.go create mode 100644 orchestrator/auth/middleware.go create mode 100644 orchestrator/auth/middleware_test.go create mode 100644 orchestrator/backend/apig.go create mode 100644 orchestrator/backend/apig_test.go create mode 100644 orchestrator/backend/cloud_credentials.go create mode 100644 orchestrator/backend/sae.go create mode 100644 orchestrator/backend/sae_test.go create mode 100644 orchestrator/credentials/handler.go create mode 100644 orchestrator/credentials/handler_test.go create mode 100644 orchestrator/credentials/sts.go create mode 100644 orchestrator/credentials/sts_test.go create mode 100644 orchestrator/credentials/types.go create mode 100644 
orchestrator/go.sum create mode 100644 orchestrator/internal/httputil/response.go diff --git a/orchestrator/Dockerfile b/orchestrator/Dockerfile index ee5584b4..44be780c 100644 --- a/orchestrator/Dockerfile +++ b/orchestrator/Dockerfile @@ -2,11 +2,15 @@ ARG HIGRESS_REGISTRY=higress-registry.cn-hangzhou.cr.aliyuncs.com FROM ${HIGRESS_REGISTRY}/higress/golang:1.23-alpine AS builder WORKDIR /app -COPY go.mod ./ +COPY go.mod go.sum ./ +RUN go mod download COPY *.go ./ COPY proxy/ ./proxy/ COPY backend/ ./backend/ COPY api/ ./api/ +COPY auth/ ./auth/ +COPY credentials/ ./credentials/ +COPY internal/ ./internal/ RUN CGO_ENABLED=0 go build -o /hiclaw-orchestrator . FROM ${HIGRESS_REGISTRY}/higress/alpine:3.20 diff --git a/orchestrator/api/gateway_handler.go b/orchestrator/api/gateway_handler.go index 6eb9a2cf..0aa93e6d 100644 --- a/orchestrator/api/gateway_handler.go +++ b/orchestrator/api/gateway_handler.go @@ -1,30 +1,110 @@ package api import ( + "encoding/json" + "log" "net/http" + + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" ) // GatewayHandler handles /gateway/* HTTP requests. -// Phase 1: all endpoints return 501 Not Implemented. -// Phase 2: will delegate to GatewayBackend (Higress local, APIG cloud). -type GatewayHandler struct{} +type GatewayHandler struct { + registry *backend.Registry +} // NewGatewayHandler creates a GatewayHandler. -func NewGatewayHandler() *GatewayHandler { - return &GatewayHandler{} +func NewGatewayHandler(registry *backend.Registry) *GatewayHandler { + return &GatewayHandler{registry: registry} } // CreateConsumer handles POST /gateway/consumers. 
func (h *GatewayHandler) CreateConsumer(w http.ResponseWriter, r *http.Request) { - writeError(w, http.StatusNotImplemented, "gateway consumer management not yet implemented (Phase 2)") + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + var req CreateConsumerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + if req.Name == "" { + httputil.WriteError(w, http.StatusBadRequest, "name is required") + return + } + + result, err := b.CreateConsumer(r.Context(), backend.ConsumerRequest{Name: req.Name}) + if err != nil { + log.Printf("[ERROR] create consumer %s: %v", req.Name, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + httputil.WriteJSON(w, http.StatusCreated, ConsumerResponse{ + Name: result.Name, + ConsumerID: result.ConsumerID, + APIKey: result.APIKey, + Status: result.Status, + }) } // BindConsumer handles POST /gateway/consumers/{id}/bind. 
func (h *GatewayHandler) BindConsumer(w http.ResponseWriter, r *http.Request) { - writeError(w, http.StatusNotImplemented, "gateway consumer binding not yet implemented (Phase 2)") + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + consumerID := r.PathValue("id") + if consumerID == "" { + httputil.WriteError(w, http.StatusBadRequest, "consumer ID is required") + return + } + + var req BindConsumerRequest + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + return + } + + err = b.BindConsumer(r.Context(), backend.BindRequest{ + ConsumerID: consumerID, + ModelAPIID: req.ModelAPIID, + EnvID: req.EnvID, + }) + if err != nil { + log.Printf("[ERROR] bind consumer %s: %v", consumerID, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + w.WriteHeader(http.StatusNoContent) } // DeleteConsumer handles DELETE /gateway/consumers/{id}. 
func (h *GatewayHandler) DeleteConsumer(w http.ResponseWriter, r *http.Request) { - writeError(w, http.StatusNotImplemented, "gateway consumer deletion not yet implemented (Phase 2)") + b, err := h.registry.GetGatewayBackend(r.Context(), "") + if err != nil { + httputil.WriteError(w, http.StatusNotImplemented, "no gateway backend available") + return + } + + consumerID := r.PathValue("id") + if consumerID == "" { + httputil.WriteError(w, http.StatusBadRequest, "consumer ID is required") + return + } + + if err := b.DeleteConsumer(r.Context(), consumerID); err != nil { + log.Printf("[ERROR] delete consumer %s: %v", consumerID, err) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) + return + } + + w.WriteHeader(http.StatusNoContent) } diff --git a/orchestrator/api/types.go b/orchestrator/api/types.go index 1ad0a22b..c6a4a44e 100644 --- a/orchestrator/api/types.go +++ b/orchestrator/api/types.go @@ -24,6 +24,7 @@ type WorkerResponse struct { ContainerID string `json:"container_id,omitempty"` AppID string `json:"app_id,omitempty"` RawStatus string `json:"raw_status,omitempty"` + APIKey string `json:"api_key,omitempty"` } // WorkerListResponse is the JSON response for GET /workers. @@ -51,8 +52,3 @@ type BindConsumerRequest struct { ModelAPIID string `json:"model_api_id"` EnvID string `json:"env_id"` } - -// ErrorResponse is the JSON error response. -type ErrorResponse struct { - Message string `json:"message"` -} diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index cc6b031c..2bb659e6 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -6,41 +6,58 @@ import ( "log" "net/http" + "github.com/alibaba/hiclaw/orchestrator/auth" "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" ) // WorkerHandler handles /workers/* HTTP requests. 
type WorkerHandler struct { - registry *backend.Registry + registry *backend.Registry + keyStore *auth.KeyStore + orchestratorURL string } -// NewWorkerHandler creates a WorkerHandler with the given backend registry. -func NewWorkerHandler(registry *backend.Registry) *WorkerHandler { - return &WorkerHandler{registry: registry} +// NewWorkerHandler creates a WorkerHandler. +func NewWorkerHandler(registry *backend.Registry, keyStore *auth.KeyStore, orchestratorURL string) *WorkerHandler { + return &WorkerHandler{registry: registry, keyStore: keyStore, orchestratorURL: orchestratorURL} } // Create handles POST /workers. func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { var req CreateWorkerRequest if err := json.NewDecoder(r.Body).Decode(&req); err != nil { - writeError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) + httputil.WriteError(w, http.StatusBadRequest, "invalid JSON: "+err.Error()) return } if req.Name == "" { - writeError(w, http.StatusBadRequest, "name is required") + httputil.WriteError(w, http.StatusBadRequest, "name is required") return } if req.Image == "" { - writeError(w, http.StatusBadRequest, "image is required") + httputil.WriteError(w, http.StatusBadRequest, "image is required") return } b, err := h.registry.GetWorkerBackend(r.Context(), req.Backend) if err != nil { - writeError(w, http.StatusServiceUnavailable, err.Error()) + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) return } + // For SAE backend: generate per-worker API key and inject into env + var apiKey string + if b.Name() == "sae" && h.keyStore != nil && h.keyStore.AuthEnabled() { + apiKey = h.keyStore.GenerateWorkerKey(req.Name) + if req.Env == nil { + req.Env = make(map[string]string) + } + req.Env["HICLAW_WORKER_API_KEY"] = apiKey + if h.orchestratorURL != "" { + req.Env["HICLAW_ORCHESTRATOR_URL"] = h.orchestratorURL + } + } + result, err := b.Create(r.Context(), backend.CreateRequest{ Name: req.Name, Image: req.Image, @@ -52,18 
+69,23 @@ func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { }) if err != nil { log.Printf("[ERROR] create worker %s: %v", req.Name, err) + if apiKey != "" { + h.keyStore.RemoveWorkerKey(req.Name) + } writeBackendError(w, err) return } - writeJSON(w, http.StatusCreated, toWorkerResponse(result)) + resp := toWorkerResponse(result) + resp.APIKey = apiKey + httputil.WriteJSON(w, http.StatusCreated, resp) } // List handles GET /workers. func (h *WorkerHandler) List(w http.ResponseWriter, r *http.Request) { b, err := h.registry.GetWorkerBackend(r.Context(), "") if err != nil { - writeJSON(w, http.StatusOK, WorkerListResponse{Workers: []WorkerResponse{}}) + httputil.WriteJSON(w, http.StatusOK, WorkerListResponse{Workers: []WorkerResponse{}}) return } @@ -78,20 +100,20 @@ func (h *WorkerHandler) List(w http.ResponseWriter, r *http.Request) { for _, r := range results { workers = append(workers, toWorkerResponse(&r)) } - writeJSON(w, http.StatusOK, WorkerListResponse{Workers: workers}) + httputil.WriteJSON(w, http.StatusOK, WorkerListResponse{Workers: workers}) } // Status handles GET /workers/{name}. func (h *WorkerHandler) Status(w http.ResponseWriter, r *http.Request) { name := r.PathValue("name") if name == "" { - writeError(w, http.StatusBadRequest, "worker name is required") + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") return } b, err := h.registry.GetWorkerBackend(r.Context(), "") if err != nil { - writeError(w, http.StatusServiceUnavailable, err.Error()) + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) return } @@ -102,20 +124,20 @@ func (h *WorkerHandler) Status(w http.ResponseWriter, r *http.Request) { return } - writeJSON(w, http.StatusOK, toWorkerResponse(result)) + httputil.WriteJSON(w, http.StatusOK, toWorkerResponse(result)) } // Start handles POST /workers/{name}/start. 
func (h *WorkerHandler) Start(w http.ResponseWriter, r *http.Request) { name := r.PathValue("name") if name == "" { - writeError(w, http.StatusBadRequest, "worker name is required") + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") return } b, err := h.registry.GetWorkerBackend(r.Context(), "") if err != nil { - writeError(w, http.StatusServiceUnavailable, err.Error()) + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) return } @@ -132,13 +154,13 @@ func (h *WorkerHandler) Start(w http.ResponseWriter, r *http.Request) { func (h *WorkerHandler) Stop(w http.ResponseWriter, r *http.Request) { name := r.PathValue("name") if name == "" { - writeError(w, http.StatusBadRequest, "worker name is required") + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") return } b, err := h.registry.GetWorkerBackend(r.Context(), "") if err != nil { - writeError(w, http.StatusServiceUnavailable, err.Error()) + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) return } @@ -155,13 +177,13 @@ func (h *WorkerHandler) Stop(w http.ResponseWriter, r *http.Request) { func (h *WorkerHandler) Delete(w http.ResponseWriter, r *http.Request) { name := r.PathValue("name") if name == "" { - writeError(w, http.StatusBadRequest, "worker name is required") + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") return } b, err := h.registry.GetWorkerBackend(r.Context(), "") if err != nil { - writeError(w, http.StatusServiceUnavailable, err.Error()) + httputil.WriteError(w, http.StatusServiceUnavailable, err.Error()) return } @@ -171,11 +193,13 @@ func (h *WorkerHandler) Delete(w http.ResponseWriter, r *http.Request) { return } + if h.keyStore != nil { + h.keyStore.RemoveWorkerKey(name) + } + w.WriteHeader(http.StatusNoContent) } -// --- helpers --- - func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { return WorkerResponse{ Name: r.Name, @@ -187,26 +211,13 @@ func toWorkerResponse(r 
*backend.WorkerResult) WorkerResponse { } } -func writeJSON(w http.ResponseWriter, status int, v interface{}) { - w.Header().Set("Content-Type", "application/json") - w.WriteHeader(status) - if err := json.NewEncoder(w).Encode(v); err != nil { - log.Printf("[WARN] failed to write JSON response: %v", err) - } -} - -func writeError(w http.ResponseWriter, status int, message string) { - writeJSON(w, status, ErrorResponse{Message: message}) -} - -// writeBackendError maps typed backend errors to appropriate HTTP status codes. func writeBackendError(w http.ResponseWriter, err error) { switch { case errors.Is(err, backend.ErrConflict): - writeError(w, http.StatusConflict, err.Error()) + httputil.WriteError(w, http.StatusConflict, err.Error()) case errors.Is(err, backend.ErrNotFound): - writeError(w, http.StatusNotFound, err.Error()) + httputil.WriteError(w, http.StatusNotFound, err.Error()) default: - writeError(w, http.StatusInternalServerError, err.Error()) + httputil.WriteError(w, http.StatusInternalServerError, err.Error()) } } diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go index 145469fa..f6cdafdd 100644 --- a/orchestrator/api/worker_handler_test.go +++ b/orchestrator/api/worker_handler_test.go @@ -9,6 +9,7 @@ import ( "net/http/httptest" "testing" + "github.com/alibaba/hiclaw/orchestrator/auth" "github.com/alibaba/hiclaw/orchestrator/backend" ) @@ -101,7 +102,8 @@ func (m *mockBackend) List(_ context.Context) ([]backend.WorkerResult, error) { func setupHandler(mb *mockBackend) (*WorkerHandler, *http.ServeMux) { reg := backend.NewRegistry([]backend.WorkerBackend{mb}, nil) - h := NewWorkerHandler(reg) + ks := auth.NewKeyStore("", nil) // auth disabled for handler tests + h := NewWorkerHandler(reg, ks, "") mux := http.NewServeMux() mux.HandleFunc("POST /workers", h.Create) mux.HandleFunc("GET /workers", h.List) @@ -350,8 +352,9 @@ func TestCreateWorkerGenericError(t *testing.T) { } } -func TestGatewayStubs(t *testing.T) 
{ - h := NewGatewayHandler() +func TestGatewayNoBackend(t *testing.T) { + reg := backend.NewRegistry(nil, nil) // no gateway backends + h := NewGatewayHandler(reg) mux := http.NewServeMux() mux.HandleFunc("POST /gateway/consumers", h.CreateConsumer) mux.HandleFunc("POST /gateway/consumers/{id}/bind", h.BindConsumer) diff --git a/orchestrator/auth/keys.go b/orchestrator/auth/keys.go new file mode 100644 index 00000000..40252dba --- /dev/null +++ b/orchestrator/auth/keys.go @@ -0,0 +1,146 @@ +package auth + +import ( + "context" + "crypto/rand" + "encoding/hex" + "log" + "sync" +) + +// CallerIdentity represents the authenticated caller. +type CallerIdentity struct { + Role string // "manager" | "worker" + WorkerName string // non-empty only when Role == "worker" +} + +// KeyStore manages API keys for manager and workers. +type KeyStore struct { + mu sync.RWMutex + managerKey string // immutable after construction + workerKeys map[string]string // workerName -> apiKey + keyIndex map[string]string // apiKey -> workerName (reverse index) + persister KeyPersister // nil in local mode +} + +// NewKeyStore creates a KeyStore with the given static manager key and optional persister. +func NewKeyStore(managerKey string, persister KeyPersister) *KeyStore { + return &KeyStore{ + managerKey: managerKey, + workerKeys: make(map[string]string), + keyIndex: make(map[string]string), + persister: persister, + } +} + +// AuthEnabled returns true if authentication is configured. +func (ks *KeyStore) AuthEnabled() bool { + return ks.managerKey != "" +} + +// Recover loads worker keys from the persister (called at startup). 
+func (ks *KeyStore) Recover(ctx context.Context) error { + if ks.persister == nil { + return nil + } + keys, err := ks.persister.Load(ctx) + if err != nil { + return err + } + + ks.mu.Lock() + defer ks.mu.Unlock() + + for name, key := range keys { + ks.workerKeys[name] = key + ks.keyIndex[key] = name + } + if len(keys) > 0 { + log.Printf("[KeyStore] Recovered %d worker keys", len(keys)) + } + return nil +} + +// GenerateWorkerKey creates a cryptographically random API key for a worker. +func (ks *KeyStore) GenerateWorkerKey(workerName string) string { + b := make([]byte, 32) + rand.Read(b) + key := hex.EncodeToString(b) + + ks.mu.Lock() + if oldKey, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, oldKey) + } + ks.workerKeys[workerName] = key + ks.keyIndex[key] = workerName + snapshot := ks.snapshotLocked() + ks.mu.Unlock() + + ks.persist(snapshot) + return key +} + +// SetWorkerKey sets a known API key for a worker (used during recovery). +func (ks *KeyStore) SetWorkerKey(workerName, key string) { + ks.mu.Lock() + defer ks.mu.Unlock() + + if oldKey, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, oldKey) + } + ks.workerKeys[workerName] = key + ks.keyIndex[key] = workerName +} + +// RemoveWorkerKey removes a worker's API key. +func (ks *KeyStore) RemoveWorkerKey(workerName string) { + ks.mu.Lock() + if key, exists := ks.workerKeys[workerName]; exists { + delete(ks.keyIndex, key) + delete(ks.workerKeys, workerName) + } + snapshot := ks.snapshotLocked() + ks.mu.Unlock() + + ks.persist(snapshot) +} + +// ValidateKey checks a key and returns the caller identity. 
+func (ks *KeyStore) ValidateKey(key string) (*CallerIdentity, bool) { + if key == "" { + return nil, false + } + + // managerKey is immutable after construction, no lock needed + if key == ks.managerKey { + return &CallerIdentity{Role: RoleManager}, true + } + + ks.mu.RLock() + defer ks.mu.RUnlock() + + if workerName, exists := ks.keyIndex[key]; exists { + return &CallerIdentity{Role: RoleWorker, WorkerName: workerName}, true + } + + return nil, false +} + +// snapshotLocked returns a copy of workerKeys. Must be called with mu held. +func (ks *KeyStore) snapshotLocked() map[string]string { + cp := make(map[string]string, len(ks.workerKeys)) + for k, v := range ks.workerKeys { + cp[k] = v + } + return cp +} + +// persist saves the current keys to the persister (best-effort, logs on error). +func (ks *KeyStore) persist(keys map[string]string) { + if ks.persister == nil { + return + } + if err := ks.persister.Save(context.Background(), keys); err != nil { + log.Printf("[KeyStore] WARNING: failed to persist keys: %v", err) + } +} diff --git a/orchestrator/auth/keys_persist.go b/orchestrator/auth/keys_persist.go new file mode 100644 index 00000000..b7f87cfb --- /dev/null +++ b/orchestrator/auth/keys_persist.go @@ -0,0 +1,159 @@ +package auth + +import ( + "bytes" + "context" + "crypto/hmac" + "crypto/sha1" + "encoding/base64" + "encoding/json" + "fmt" + "io" + "log" + "net/http" + "time" +) + +// KeyPersister abstracts key storage for persistence across restarts. +type KeyPersister interface { + Save(ctx context.Context, keys map[string]string) error + Load(ctx context.Context) (map[string]string, error) +} + +// OSSCredentialProvider provides credentials for OSS access. +type OSSCredentialProvider interface { + GetAccessKeyId() (*string, error) + GetAccessKeySecret() (*string, error) + GetSecurityToken() (*string, error) +} + +// OSSKeyPersister persists worker keys to an OSS JSON file. +type OSSKeyPersister struct { + endpoint string // e.g. 
"oss-cn-hangzhou-internal.aliyuncs.com" + bucket string + key string // object key, e.g. "manager/orchestrator-worker-keys.json" + creds OSSCredentialProvider + client *http.Client +} + +// NewOSSKeyPersister creates a persister that stores keys in OSS. +func NewOSSKeyPersister(region, bucket string, creds OSSCredentialProvider) *OSSKeyPersister { + return &OSSKeyPersister{ + endpoint: fmt.Sprintf("oss-%s-internal.aliyuncs.com", region), + bucket: bucket, + key: "manager/orchestrator-worker-keys.json", + creds: creds, + client: &http.Client{Timeout: 30 * time.Second}, + } +} + +func (p *OSSKeyPersister) Save(ctx context.Context, keys map[string]string) error { + data, err := json.Marshal(keys) + if err != nil { + return fmt.Errorf("marshal keys: %w", err) + } + + ossURL := fmt.Sprintf("https://%s.%s/%s", p.bucket, p.endpoint, p.key) + req, err := http.NewRequestWithContext(ctx, http.MethodPut, ossURL, bytes.NewReader(data)) + if err != nil { + return fmt.Errorf("build OSS PUT request: %w", err) + } + req.Header.Set("Content-Type", "application/json") + + if err := p.signRequest(req); err != nil { + return fmt.Errorf("sign OSS request: %w", err) + } + + resp, err := p.client.Do(req) + if err != nil { + return fmt.Errorf("OSS PUT: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("OSS PUT failed (status %d): %s", resp.StatusCode, string(body)) + } + + log.Printf("[KeyPersister] Saved %d worker keys to OSS", len(keys)) + return nil +} + +func (p *OSSKeyPersister) Load(ctx context.Context) (map[string]string, error) { + ossURL := fmt.Sprintf("https://%s.%s/%s", p.bucket, p.endpoint, p.key) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, ossURL, nil) + if err != nil { + return nil, fmt.Errorf("build OSS GET request: %w", err) + } + + if err := p.signRequest(req); err != nil { + return nil, fmt.Errorf("sign OSS request: %w", err) + } + + resp, err := p.client.Do(req) + if 
err != nil { + return nil, fmt.Errorf("OSS GET: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode == http.StatusNotFound { + return map[string]string{}, nil + } + if resp.StatusCode != http.StatusOK { + body, _ := io.ReadAll(resp.Body) + return nil, fmt.Errorf("OSS GET failed (status %d): %s", resp.StatusCode, string(body)) + } + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("read OSS response: %w", err) + } + + var keys map[string]string + if err := json.Unmarshal(body, &keys); err != nil { + return nil, fmt.Errorf("parse keys JSON: %w", err) + } + + log.Printf("[KeyPersister] Loaded %d worker keys from OSS", len(keys)) + return keys, nil +} + +// signRequest adds OSS V1 signature headers using STS credentials. +func (p *OSSKeyPersister) signRequest(req *http.Request) error { + ak, err := p.creds.GetAccessKeyId() + if err != nil || ak == nil { + return fmt.Errorf("get access key ID: %w", err) + } + sk, err := p.creds.GetAccessKeySecret() + if err != nil || sk == nil { + return fmt.Errorf("get access key secret: %w", err) + } + token, err := p.creds.GetSecurityToken() + if err != nil { + return fmt.Errorf("get security token: %w", err) + } + + date := time.Now().UTC().Format(http.TimeFormat) + req.Header.Set("Date", date) + if token != nil && *token != "" { + req.Header.Set("x-oss-security-token", *token) + } + + contentType := req.Header.Get("Content-Type") + resource := fmt.Sprintf("/%s/%s", p.bucket, p.key) + + canonicalHeaders := "" + if token != nil && *token != "" { + canonicalHeaders = "x-oss-security-token:" + *token + "\n" + } + + stringToSign := fmt.Sprintf("%s\n\n%s\n%s\n%s%s", + req.Method, contentType, date, canonicalHeaders, resource) + + mac := hmac.New(sha1.New, []byte(*sk)) + mac.Write([]byte(stringToSign)) + signature := base64.StdEncoding.EncodeToString(mac.Sum(nil)) + + req.Header.Set("Authorization", fmt.Sprintf("OSS %s:%s", *ak, signature)) + return nil +} diff --git 
a/orchestrator/auth/middleware.go b/orchestrator/auth/middleware.go new file mode 100644 index 00000000..f032f073 --- /dev/null +++ b/orchestrator/auth/middleware.go @@ -0,0 +1,88 @@ +package auth + +import ( + "context" + "net/http" + "strings" + + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// Role constants. +const ( + RoleManager = "manager" + RoleWorker = "worker" +) + +type contextKey string + +const callerKey contextKey = "caller" + +// CallerFromContext extracts the CallerIdentity from the request context. +func CallerFromContext(ctx context.Context) *CallerIdentity { + if v := ctx.Value(callerKey); v != nil { + return v.(*CallerIdentity) + } + return nil +} + +// CallerKeyForTest returns the context key for injecting CallerIdentity in tests. +func CallerKeyForTest() contextKey { + return callerKey +} + +// Middleware provides HTTP authentication middleware. +type Middleware struct { + keyStore *KeyStore +} + +// NewMiddleware creates an auth Middleware. +func NewMiddleware(keyStore *KeyStore) *Middleware { + return &Middleware{keyStore: keyStore} +} + +// RequireManager returns middleware that only allows manager callers. +func (m *Middleware) RequireManager(next http.Handler) http.Handler { + return m.requireRole(RoleManager, next) +} + +// RequireWorker returns middleware that only allows worker callers. 
+func (m *Middleware) RequireWorker(next http.Handler) http.Handler { + return m.requireRole(RoleWorker, next) +} + +func (m *Middleware) requireRole(role string, next http.Handler) http.Handler { + return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !m.keyStore.AuthEnabled() { + next.ServeHTTP(w, r) + return + } + + identity, ok := m.authenticate(r) + if !ok { + httputil.WriteError(w, http.StatusUnauthorized, "invalid or missing API key") + return + } + if identity.Role != role { + httputil.WriteError(w, http.StatusForbidden, role+" access required") + return + } + + ctx := context.WithValue(r.Context(), callerKey, identity) + next.ServeHTTP(w, r.WithContext(ctx)) + }) +} + +func (m *Middleware) authenticate(r *http.Request) (*CallerIdentity, bool) { + authHeader := r.Header.Get("Authorization") + if authHeader == "" { + return nil, false + } + + key := strings.TrimPrefix(authHeader, "Bearer ") + if key == authHeader { + return nil, false + } + + return m.keyStore.ValidateKey(key) +} diff --git a/orchestrator/auth/middleware_test.go b/orchestrator/auth/middleware_test.go new file mode 100644 index 00000000..1ee87c73 --- /dev/null +++ b/orchestrator/auth/middleware_test.go @@ -0,0 +1,214 @@ +package auth + +import ( + "net/http" + "net/http/httptest" + "testing" +) + +func TestGenerateWorkerKey(t *testing.T) { + ks := NewKeyStore("manager-secret", nil) + + k1 := ks.GenerateWorkerKey("alice") + k2 := ks.GenerateWorkerKey("bob") + + if k1 == k2 { + t.Error("expected unique keys") + } + if len(k1) != 64 { // 32 bytes hex + t.Errorf("expected 64 char hex key, got %d", len(k1)) + } +} + +func TestGenerateWorkerKeyOverwrite(t *testing.T) { + ks := NewKeyStore("mgr", nil) + + old := ks.GenerateWorkerKey("alice") + new := ks.GenerateWorkerKey("alice") + + if old == new { + t.Error("regenerated key should differ") + } + + // Old key should no longer validate + if _, ok := ks.ValidateKey(old); ok { + t.Error("old key should be invalid after 
regeneration") + } + id, ok := ks.ValidateKey(new) + if !ok || id.WorkerName != "alice" { + t.Error("new key should validate as alice") + } +} + +func TestValidateManagerKey(t *testing.T) { + ks := NewKeyStore("mgr-key", nil) + + id, ok := ks.ValidateKey("mgr-key") + if !ok || id.Role != "manager" { + t.Error("expected manager identity") + } +} + +func TestValidateWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + key := ks.GenerateWorkerKey("bob") + + id, ok := ks.ValidateKey(key) + if !ok || id.Role != "worker" || id.WorkerName != "bob" { + t.Errorf("expected worker bob, got %+v", id) + } +} + +func TestValidateInvalidKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + + if _, ok := ks.ValidateKey("bad-key"); ok { + t.Error("expected invalid key to fail") + } + if _, ok := ks.ValidateKey(""); ok { + t.Error("expected empty key to fail") + } +} + +func TestRemoveWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + key := ks.GenerateWorkerKey("alice") + + ks.RemoveWorkerKey("alice") + + if _, ok := ks.ValidateKey(key); ok { + t.Error("removed key should be invalid") + } +} + +func TestSetWorkerKey(t *testing.T) { + ks := NewKeyStore("mgr", nil) + ks.SetWorkerKey("carol", "known-key-123") + + id, ok := ks.ValidateKey("known-key-123") + if !ok || id.WorkerName != "carol" { + t.Error("expected SetWorkerKey to work") + } +} + +func TestAuthDisabled(t *testing.T) { + ks := NewKeyStore("", nil) // empty = auth disabled + if ks.AuthEnabled() { + t.Error("expected auth disabled with empty manager key") + } +} + +func TestMiddlewareSkipsWhenDisabled(t *testing.T) { + ks := NewKeyStore("", nil) + mw := NewMiddleware(ks) + + called := false + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + w.WriteHeader(http.StatusOK) + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if !called { + t.Error("handler should be called 
when auth disabled") + } + if w.Code != http.StatusOK { + t.Errorf("expected 200, got %d", w.Code) + } +} + +func TestMiddlewareRequireManagerValid(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + called := false + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + req.Header.Set("Authorization", "Bearer mgr-secret") + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if !called { + t.Error("handler should be called for valid manager key") + } +} + +func TestMiddlewareRequireManagerRejectsWorker(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + workerKey := ks.GenerateWorkerKey("alice") + + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + req.Header.Set("Authorization", "Bearer "+workerKey) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusForbidden { + t.Errorf("expected 403, got %d", w.Code) + } +} + +func TestMiddlewareRequireManagerRejectsNoAuth(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + handler := mw.RequireManager(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodGet, "/workers", nil) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusUnauthorized { + t.Errorf("expected 401, got %d", w.Code) + } +} + +func TestMiddlewareRequireWorkerValid(t *testing.T) { + ks := NewKeyStore("mgr", nil) + mw := NewMiddleware(ks) + key := ks.GenerateWorkerKey("bob") + + var gotIdentity *CallerIdentity + handler := mw.RequireWorker(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + 
gotIdentity = CallerFromContext(r.Context()) + })) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + req.Header.Set("Authorization", "Bearer "+key) + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if gotIdentity == nil || gotIdentity.WorkerName != "bob" { + t.Errorf("expected worker bob in context, got %+v", gotIdentity) + } +} + +func TestMiddlewareRequireWorkerRejectsManager(t *testing.T) { + ks := NewKeyStore("mgr-secret", nil) + mw := NewMiddleware(ks) + + handler := mw.RequireWorker(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + t.Error("handler should not be called") + })) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + req.Header.Set("Authorization", "Bearer mgr-secret") + w := httptest.NewRecorder() + handler.ServeHTTP(w, req) + + if w.Code != http.StatusForbidden { + t.Errorf("expected 403, got %d", w.Code) + } +} diff --git a/orchestrator/backend/apig.go b/orchestrator/backend/apig.go new file mode 100644 index 00000000..d891b1d5 --- /dev/null +++ b/orchestrator/backend/apig.go @@ -0,0 +1,254 @@ +package backend + +import ( + "context" + "fmt" + "log" + "strings" + + openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client" + apig "github.com/alibabacloud-go/apig-20240327/v6/client" + "github.com/alibabacloud-go/tea/tea" +) + +// APIGClient abstracts the APIG SDK client for testability. 
+type APIGClient interface { + CreateConsumer(req *apig.CreateConsumerRequest) (*apig.CreateConsumerResponse, error) + GetConsumer(consumerId *string) (*apig.GetConsumerResponse, error) + DeleteConsumer(consumerId *string) (*apig.DeleteConsumerResponse, error) + ListConsumers(req *apig.ListConsumersRequest) (*apig.ListConsumersResponse, error) + CreateConsumerAuthorizationRules(req *apig.CreateConsumerAuthorizationRulesRequest) (*apig.CreateConsumerAuthorizationRulesResponse, error) + QueryConsumerAuthorizationRules(req *apig.QueryConsumerAuthorizationRulesRequest) (*apig.QueryConsumerAuthorizationRulesResponse, error) +} + +// APIGConfig holds APIG backend configuration. +type APIGConfig struct { + Region string + GatewayID string + ModelAPIID string + EnvID string +} + +// APIGBackend manages AI Gateway consumers via Alibaba Cloud APIG. +type APIGBackend struct { + client APIGClient + config APIGConfig +} + +// NewAPIGBackend creates an APIGBackend with auto-configured SDK client. +func NewAPIGBackend(creds CloudCredentialProvider, config APIGConfig) (*APIGBackend, error) { + cred, err := creds.GetCredential() + if err != nil { + return nil, fmt.Errorf("build APIG credentials: %w", err) + } + + endpoint := fmt.Sprintf("apig.%s.aliyuncs.com", config.Region) + apiConfig := &openapi.Config{} + apiConfig.SetCredential(cred). + SetRegionId(config.Region). + SetEndpoint(endpoint) + + client, err := apig.NewClient(apiConfig) + if err != nil { + return nil, fmt.Errorf("create APIG client: %w", err) + } + + return &APIGBackend{client: client, config: config}, nil +} + +// NewAPIGBackendWithClient creates an APIGBackend with a custom client (for testing). 
+func NewAPIGBackendWithClient(client APIGClient, config APIGConfig) *APIGBackend { + return &APIGBackend{client: client, config: config} +} + +func (a *APIGBackend) Name() string { return "apig" } + +func (a *APIGBackend) Available(_ context.Context) bool { + return IsAliyunRuntime() && a.config.GatewayID != "" +} + +func (a *APIGBackend) CreateConsumer(_ context.Context, req ConsumerRequest) (*ConsumerResult, error) { + // Prefix consumer name with gateway ID to avoid cross-gateway collisions + consumerName := req.Name + if a.config.GatewayID != "" { + consumerName = a.config.GatewayID + "-" + req.Name + } + + // Check if already exists + existingID, existingKey, err := a.findConsumer(consumerName) + if err != nil { + return nil, err + } + if existingID != "" { + return &ConsumerResult{ + Name: req.Name, + ConsumerID: existingID, + APIKey: existingKey, + Status: "exists", + }, nil + } + + // Create consumer + createReq := &apig.CreateConsumerRequest{} + createReq.SetName(consumerName). + SetGatewayType("AI"). + SetEnable(true). + SetDescription(fmt.Sprintf("HiClaw Worker: %s", req.Name)). 
+ SetApikeyIdentityConfig(&apig.ApiKeyIdentityConfig{ + Type: tea.String("Apikey"), + ApikeySource: &apig.ApiKeyIdentityConfigApikeySource{ + Source: tea.String("Default"), + Value: tea.String("Authorization"), + }, + Credentials: []*apig.ApiKeyIdentityConfigCredentials{ + {GenerateMode: tea.String("System")}, + }, + }) + + resp, err := a.client.CreateConsumer(createReq) + if err != nil { + // Handle 409 race condition + if strings.Contains(err.Error(), "ConsumerNameDuplicate") || strings.Contains(err.Error(), "409") { + log.Printf("[APIG] Consumer creation returned 409, re-querying...") + existingID, existingKey, err = a.findConsumer(consumerName) + if err != nil { + return nil, err + } + if existingID != "" { + return &ConsumerResult{ + Name: req.Name, + ConsumerID: existingID, + APIKey: existingKey, + Status: "exists", + }, nil + } + return nil, fmt.Errorf("consumer 409 but not found on re-query") + } + return nil, fmt.Errorf("APIG CreateConsumer: %w", err) + } + + consumerID := "" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.ConsumerId != nil { + consumerID = *resp.Body.Data.ConsumerId + } + + // Fetch API key from detail + apiKey, err := a.getConsumerAPIKey(consumerID) + if err != nil { + log.Printf("[APIG] Warning: created consumer %s but failed to get API key: %v", consumerID, err) + } + + log.Printf("[APIG] Created consumer %s (%s)", consumerName, consumerID) + + return &ConsumerResult{ + Name: req.Name, + ConsumerID: consumerID, + APIKey: apiKey, + Status: "created", + }, nil +} + +func (a *APIGBackend) BindConsumer(_ context.Context, req BindRequest) error { + // Check if already bound + queryReq := &apig.QueryConsumerAuthorizationRulesRequest{} + queryReq.SetConsumerId(req.ConsumerID). + SetResourceId(req.ModelAPIID). + SetEnvironmentId(req.EnvID). + SetResourceType("LLM"). + SetPageNumber(1). 
+ SetPageSize(100) + + queryResp, err := a.client.QueryConsumerAuthorizationRules(queryReq) + if err == nil && queryResp.Body != nil && queryResp.Body.Data != nil && + queryResp.Body.Data.Items != nil && len(queryResp.Body.Data.Items) > 0 { + log.Printf("[APIG] Consumer %s already bound (%d rules)", req.ConsumerID, len(queryResp.Body.Data.Items)) + return nil + } + + // Create authorization rule + createReq := &apig.CreateConsumerAuthorizationRulesRequest{} + createReq.SetAuthorizationRules([]*apig.CreateConsumerAuthorizationRulesRequestAuthorizationRules{ + { + ConsumerId: tea.String(req.ConsumerID), + ResourceType: tea.String("LLM"), + ExpireMode: tea.String("LongTerm"), + ResourceIdentifier: &apig.CreateConsumerAuthorizationRulesRequestAuthorizationRulesResourceIdentifier{ + ResourceId: tea.String(req.ModelAPIID), + EnvironmentId: tea.String(req.EnvID), + }, + }, + }) + + _, err = a.client.CreateConsumerAuthorizationRules(createReq) + if err != nil { + return fmt.Errorf("APIG CreateConsumerAuthorizationRules: %w", err) + } + + log.Printf("[APIG] Consumer %s bound to API %s", req.ConsumerID, req.ModelAPIID) + return nil +} + +func (a *APIGBackend) DeleteConsumer(_ context.Context, consumerID string) error { + _, err := a.client.DeleteConsumer(tea.String(consumerID)) + if err != nil { + return fmt.Errorf("APIG DeleteConsumer: %w", err) + } + log.Printf("[APIG] Deleted consumer %s", consumerID) + return nil +} + +// --- internal helpers --- + +func (a *APIGBackend) findConsumer(consumerName string) (string, string, error) { + page := int32(1) + for { + req := &apig.ListConsumersRequest{} + req.SetGatewayType("AI"). + SetNameLike(consumerName). + SetPageNumber(page). 
+ SetPageSize(100) + + resp, err := a.client.ListConsumers(req) + if err != nil { + return "", "", fmt.Errorf("APIG ListConsumers: %w", err) + } + + if resp.Body == nil || resp.Body.Data == nil || resp.Body.Data.Items == nil { + break + } + + for _, c := range resp.Body.Data.Items { + if c.Name != nil && *c.Name == consumerName { + consumerID := "" + if c.ConsumerId != nil { + consumerID = *c.ConsumerId + } + apiKey, _ := a.getConsumerAPIKey(consumerID) + return consumerID, apiKey, nil + } + } + + if len(resp.Body.Data.Items) < 100 { + break + } + page++ + } + return "", "", nil +} + +func (a *APIGBackend) getConsumerAPIKey(consumerID string) (string, error) { + resp, err := a.client.GetConsumer(tea.String(consumerID)) + if err != nil { + return "", err + } + if resp.Body != nil && resp.Body.Data != nil && + resp.Body.Data.ApiKeyIdentityConfig != nil && + resp.Body.Data.ApiKeyIdentityConfig.Credentials != nil && + len(resp.Body.Data.ApiKeyIdentityConfig.Credentials) > 0 { + cred := resp.Body.Data.ApiKeyIdentityConfig.Credentials[0] + if cred.Apikey != nil { + return *cred.Apikey, nil + } + } + return "", nil +} diff --git a/orchestrator/backend/apig_test.go b/orchestrator/backend/apig_test.go new file mode 100644 index 00000000..35494c3b --- /dev/null +++ b/orchestrator/backend/apig_test.go @@ -0,0 +1,237 @@ +package backend + +import ( + "context" + "fmt" + "testing" + + apig "github.com/alibabacloud-go/apig-20240327/v6/client" + "github.com/alibabacloud-go/tea/tea" +) + +// mockAPIGClient implements APIGClient for testing. 
+type mockAPIGClient struct { + consumers map[string]*mockConsumer // consumerID -> consumer + rules map[string][]string // consumerID -> ruleIDs + nextID int +} + +type mockConsumer struct { + id string + name string + apiKey string +} + +func newMockAPIGClient() *mockAPIGClient { + return &mockAPIGClient{ + consumers: map[string]*mockConsumer{}, + rules: map[string][]string{}, + } +} + +func (m *mockAPIGClient) CreateConsumer(req *apig.CreateConsumerRequest) (*apig.CreateConsumerResponse, error) { + name := tea.StringValue(req.Name) + for _, c := range m.consumers { + if c.name == name { + return nil, fmt.Errorf("ConsumerNameDuplicate: %s", name) + } + } + m.nextID++ + id := fmt.Sprintf("cs-%d", m.nextID) + apiKey := fmt.Sprintf("key-%s", name) + m.consumers[id] = &mockConsumer{id: id, name: name, apiKey: apiKey} + return &apig.CreateConsumerResponse{ + Body: &apig.CreateConsumerResponseBody{ + Data: &apig.CreateConsumerResponseBodyData{ + ConsumerId: tea.String(id), + }, + }, + }, nil +} + +func (m *mockAPIGClient) GetConsumer(consumerId *string) (*apig.GetConsumerResponse, error) { + id := tea.StringValue(consumerId) + c, ok := m.consumers[id] + if !ok { + return nil, fmt.Errorf("consumer not found: %s", id) + } + return &apig.GetConsumerResponse{ + Body: &apig.GetConsumerResponseBody{ + Data: &apig.GetConsumerResponseBodyData{ + ConsumerId: tea.String(c.id), + ApiKeyIdentityConfig: &apig.ApiKeyIdentityConfig{ + Credentials: []*apig.ApiKeyIdentityConfigCredentials{ + {Apikey: tea.String(c.apiKey)}, + }, + }, + }, + }, + }, nil +} + +func (m *mockAPIGClient) DeleteConsumer(consumerId *string) (*apig.DeleteConsumerResponse, error) { + id := tea.StringValue(consumerId) + delete(m.consumers, id) + delete(m.rules, id) + return &apig.DeleteConsumerResponse{}, nil +} + +func (m *mockAPIGClient) ListConsumers(req *apig.ListConsumersRequest) (*apig.ListConsumersResponse, error) { + nameLike := tea.StringValue(req.NameLike) + var items 
[]*apig.ListConsumersResponseBodyDataItems + for _, c := range m.consumers { + if nameLike != "" && c.name != nameLike { + continue + } + items = append(items, &apig.ListConsumersResponseBodyDataItems{ + ConsumerId: tea.String(c.id), + Name: tea.String(c.name), + }) + } + return &apig.ListConsumersResponse{ + Body: &apig.ListConsumersResponseBody{ + Data: &apig.ListConsumersResponseBodyData{ + Items: items, + }, + }, + }, nil +} + +func (m *mockAPIGClient) CreateConsumerAuthorizationRules(req *apig.CreateConsumerAuthorizationRulesRequest) (*apig.CreateConsumerAuthorizationRulesResponse, error) { + var ruleIDs []*string + for _, rule := range req.AuthorizationRules { + cid := tea.StringValue(rule.ConsumerId) + m.nextID++ + ruleID := fmt.Sprintf("rule-%d", m.nextID) + m.rules[cid] = append(m.rules[cid], ruleID) + ruleIDs = append(ruleIDs, tea.String(ruleID)) + } + return &apig.CreateConsumerAuthorizationRulesResponse{ + Body: &apig.CreateConsumerAuthorizationRulesResponseBody{ + Data: &apig.CreateConsumerAuthorizationRulesResponseBodyData{ + ConsumerAuthorizationRuleIds: ruleIDs, + }, + }, + }, nil +} + +func (m *mockAPIGClient) QueryConsumerAuthorizationRules(req *apig.QueryConsumerAuthorizationRulesRequest) (*apig.QueryConsumerAuthorizationRulesResponse, error) { + cid := tea.StringValue(req.ConsumerId) + rules := m.rules[cid] + var items []*apig.QueryConsumerAuthorizationRulesResponseBodyDataItems + for _, rid := range rules { + items = append(items, &apig.QueryConsumerAuthorizationRulesResponseBodyDataItems{ + ConsumerAuthorizationRuleId: tea.String(rid), + }) + } + return &apig.QueryConsumerAuthorizationRulesResponse{ + Body: &apig.QueryConsumerAuthorizationRulesResponseBody{ + Data: &apig.QueryConsumerAuthorizationRulesResponseBodyData{ + Items: items, + }, + }, + }, nil +} + +func newTestAPIGBackend(client APIGClient) *APIGBackend { + return NewAPIGBackendWithClient(client, APIGConfig{ + Region: "cn-hangzhou", + GatewayID: "gw-test", + ModelAPIID: "api-test", 
+ EnvID: "env-test", + }) +} + +func TestAPIGCreateConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, err := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "alice"}) + if err != nil { + t.Fatalf("CreateConsumer failed: %v", err) + } + if result.Status != "created" { + t.Errorf("expected created, got %s", result.Status) + } + if result.ConsumerID == "" { + t.Error("expected non-empty consumer ID") + } + if result.APIKey == "" { + t.Error("expected non-empty API key") + } +} + +func TestAPIGCreateConsumerIdempotent(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + b.CreateConsumer(context.Background(), ConsumerRequest{Name: "bob"}) + result, err := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "bob"}) + if err != nil { + t.Fatalf("second CreateConsumer failed: %v", err) + } + if result.Status != "exists" { + t.Errorf("expected exists, got %s", result.Status) + } +} + +func TestAPIGBindConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, _ := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "carol"}) + + err := b.BindConsumer(context.Background(), BindRequest{ + ConsumerID: result.ConsumerID, + ModelAPIID: "api-test", + EnvID: "env-test", + }) + if err != nil { + t.Fatalf("BindConsumer failed: %v", err) + } + + // Second bind should be idempotent + err = b.BindConsumer(context.Background(), BindRequest{ + ConsumerID: result.ConsumerID, + ModelAPIID: "api-test", + EnvID: "env-test", + }) + if err != nil { + t.Fatalf("second BindConsumer failed: %v", err) + } +} + +func TestAPIGDeleteConsumer(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + result, _ := b.CreateConsumer(context.Background(), ConsumerRequest{Name: "dave"}) + + err := b.DeleteConsumer(context.Background(), result.ConsumerID) + if err != nil { + t.Fatalf("DeleteConsumer failed: %v", err) + } + + if 
len(mock.consumers) != 0 { + t.Errorf("expected 0 consumers after delete, got %d", len(mock.consumers)) + } +} + +func TestAPIGConsumerNamePrefix(t *testing.T) { + mock := newMockAPIGClient() + b := newTestAPIGBackend(mock) + + b.CreateConsumer(context.Background(), ConsumerRequest{Name: "eve"}) + + // Verify the consumer was created with gateway ID prefix + found := false + for _, c := range mock.consumers { + if c.name == "gw-test-eve" { + found = true + break + } + } + if !found { + t.Error("expected consumer name to be prefixed with gateway ID") + } +} diff --git a/orchestrator/backend/cloud_credentials.go b/orchestrator/backend/cloud_credentials.go new file mode 100644 index 00000000..5213712a --- /dev/null +++ b/orchestrator/backend/cloud_credentials.go @@ -0,0 +1,57 @@ +package backend + +import ( + "fmt" + "os" + + credential "github.com/aliyun/credentials-go/credentials" +) + +// CloudCredentialProvider abstracts Alibaba Cloud credential creation. +type CloudCredentialProvider interface { + GetCredential() (credential.Credential, error) +} + +// DefaultCloudCredentialProvider builds credentials from environment variables. +type DefaultCloudCredentialProvider struct{} + +// NewDefaultCloudCredentialProvider creates a provider that auto-detects OIDC or AK/SK. +func NewDefaultCloudCredentialProvider() *DefaultCloudCredentialProvider { + return &DefaultCloudCredentialProvider{} +} + +func (p *DefaultCloudCredentialProvider) GetCredential() (credential.Credential, error) { + oidcTokenFile := os.Getenv("ALIBABA_CLOUD_OIDC_TOKEN_FILE") + if oidcTokenFile != "" { + if _, err := os.Stat(oidcTokenFile); err == nil { + region := envOrDefault("HICLAW_REGION", "cn-hangzhou") + stsEndpoint := fmt.Sprintf("sts-vpc.%s.aliyuncs.com", region) + config := new(credential.Config). + SetType("oidc_role_arn"). + SetRoleArn(os.Getenv("ALIBABA_CLOUD_ROLE_ARN")). + SetOIDCProviderArn(os.Getenv("ALIBABA_CLOUD_OIDC_PROVIDER_ARN")). + SetOIDCTokenFilePath(oidcTokenFile). 
+ SetRoleSessionName("hiclaw-orchestrator"). + SetSTSEndpoint(stsEndpoint) + return credential.NewCredential(config) + } + } + + ak := os.Getenv("ALIBABA_CLOUD_ACCESS_KEY_ID") + if ak != "" { + config := new(credential.Config). + SetType("access_key"). + SetAccessKeyId(ak). + SetAccessKeySecret(os.Getenv("ALIBABA_CLOUD_ACCESS_KEY_SECRET")) + return credential.NewCredential(config) + } + + return nil, fmt.Errorf("no Alibaba Cloud credentials found: set ALIBABA_CLOUD_OIDC_TOKEN_FILE or ALIBABA_CLOUD_ACCESS_KEY_ID") +} + +func envOrDefault(key, defaultVal string) string { + if v := os.Getenv(key); v != "" { + return v + } + return defaultVal +} diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go new file mode 100644 index 00000000..e4bc1fd0 --- /dev/null +++ b/orchestrator/backend/sae.go @@ -0,0 +1,336 @@ +package backend + +import ( + "context" + "encoding/json" + "fmt" + "log" + "strings" + + openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client" + sae "github.com/alibabacloud-go/sae-20190506/v4/client" +) + +// SAEClient abstracts the SAE SDK client for testability. +type SAEClient interface { + CreateApplication(req *sae.CreateApplicationRequest) (*sae.CreateApplicationResponse, error) + DeleteApplication(req *sae.DeleteApplicationRequest) (*sae.DeleteApplicationResponse, error) + StartApplication(req *sae.StartApplicationRequest) (*sae.StartApplicationResponse, error) + StopApplication(req *sae.StopApplicationRequest) (*sae.StopApplicationResponse, error) + DescribeApplicationStatus(req *sae.DescribeApplicationStatusRequest) (*sae.DescribeApplicationStatusResponse, error) + ListApplications(req *sae.ListApplicationsRequest) (*sae.ListApplicationsResponse, error) +} + +// SAEConfig holds SAE backend configuration. 
+type SAEConfig struct { + Region string + NamespaceID string + WorkerImage string + CopawWorkerImage string + VPCID string + VSwitchID string + SecurityGroupID string + CPU int32 + Memory int32 +} + +// SAEBackend manages worker lifecycle via Alibaba Cloud SAE. +type SAEBackend struct { + client SAEClient + config SAEConfig + containerPrefix string +} + +// NewSAEBackend creates a SAEBackend with auto-configured SDK client. +func NewSAEBackend(creds CloudCredentialProvider, config SAEConfig, containerPrefix string) (*SAEBackend, error) { + if containerPrefix == "" { + containerPrefix = "hiclaw-worker-" + } + if config.CPU == 0 { + config.CPU = 1000 + } + if config.Memory == 0 { + config.Memory = 2048 + } + + cred, err := creds.GetCredential() + if err != nil { + return nil, fmt.Errorf("build SAE credentials: %w", err) + } + + endpoint := fmt.Sprintf("sae.%s.aliyuncs.com", config.Region) + apiConfig := &openapi.Config{} + apiConfig.SetCredential(cred). + SetRegionId(config.Region). + SetEndpoint(endpoint) + + client, err := sae.NewClient(apiConfig) + if err != nil { + return nil, fmt.Errorf("create SAE client: %w", err) + } + + return &SAEBackend{ + client: client, + config: config, + containerPrefix: containerPrefix, + }, nil +} + +// NewSAEBackendWithClient creates a SAEBackend with a custom client (for testing). 
+func NewSAEBackendWithClient(client SAEClient, config SAEConfig, containerPrefix string) *SAEBackend { + if containerPrefix == "" { + containerPrefix = "hiclaw-worker-" + } + if config.CPU == 0 { + config.CPU = 1000 + } + if config.Memory == 0 { + config.Memory = 2048 + } + return &SAEBackend{ + client: client, + config: config, + containerPrefix: containerPrefix, + } +} + +func (s *SAEBackend) Name() string { return "sae" } + +func (s *SAEBackend) Available(_ context.Context) bool { + return IsAliyunRuntime() && s.config.WorkerImage != "" +} + +func (s *SAEBackend) Create(_ context.Context, req CreateRequest) (*WorkerResult, error) { + appName := s.containerPrefix + req.Name + + // Check if already exists + existingID, err := s.findAppByName(appName) + if err != nil { + return nil, err + } + if existingID != "" { + return nil, fmt.Errorf("%w: SAE app %q", ErrConflict, appName) + } + + // Build env vars + image := req.Image + if image == "" { + if req.Runtime == "copaw" && s.config.CopawWorkerImage != "" { + image = s.config.CopawWorkerImage + } else { + image = s.config.WorkerImage + } + } + + envList := s.buildEnvList(req.Env) + + saeReq := &sae.CreateApplicationRequest{} + saeReq.SetAppName(appName). + SetNamespaceId(s.config.NamespaceID). + SetPackageType("Image"). + SetImageUrl(image). + SetCpu(s.config.CPU). + SetMemory(s.config.Memory). + SetReplicas(1). + SetVpcId(s.config.VPCID). + SetVSwitchId(s.config.VSwitchID). + SetSecurityGroupId(s.config.SecurityGroupID). + SetAppDescription(fmt.Sprintf("HiClaw Worker Agent: %s", req.Name)). + SetEnvs(envList). 
+ SetCustomImageNetworkType("internet") + + resp, err := s.client.CreateApplication(saeReq) + if err != nil { + return nil, fmt.Errorf("SAE CreateApplication: %w", err) + } + + appID := "" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.AppId != nil { + appID = *resp.Body.Data.AppId + } + + log.Printf("[SAE] Created application %s (%s)", appName, appID) + + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + Status: StatusStarting, + AppID: appID, + }, nil +} + +func (s *SAEBackend) Delete(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return nil // already gone + } + + req := &sae.DeleteApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.DeleteApplication(req) + if err != nil { + return fmt.Errorf("SAE DeleteApplication: %w", err) + } + + log.Printf("[SAE] Deleted application %s (%s)", appName, appID) + return nil +} + +func (s *SAEBackend) Start(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + + req := &sae.StartApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.StartApplication(req) + if err != nil { + return fmt.Errorf("SAE StartApplication: %w", err) + } + return nil +} + +func (s *SAEBackend) Stop(_ context.Context, name string) error { + appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return err + } + if appID == "" { + return fmt.Errorf("%w: worker %q", ErrNotFound, name) + } + + req := &sae.StopApplicationRequest{} + req.SetAppId(appID) + _, err = s.client.StopApplication(req) + if err != nil { + return fmt.Errorf("SAE StopApplication: %w", err) + } + return nil +} + +func (s *SAEBackend) Status(_ context.Context, name string) (*WorkerResult, error) { + 
appName := s.containerPrefix + name + appID, err := s.findAppByName(appName) + if err != nil { + return nil, err + } + if appID == "" { + return &WorkerResult{ + Name: name, + Backend: "sae", + Status: StatusNotFound, + }, nil + } + + req := &sae.DescribeApplicationStatusRequest{} + req.SetAppId(appID) + resp, err := s.client.DescribeApplicationStatus(req) + if err != nil { + return nil, fmt.Errorf("SAE DescribeApplicationStatus: %w", err) + } + + rawStatus := "unknown" + if resp.Body != nil && resp.Body.Data != nil && resp.Body.Data.CurrentStatus != nil { + rawStatus = *resp.Body.Data.CurrentStatus + } + + return &WorkerResult{ + Name: name, + Backend: "sae", + Status: normalizeSAEStatus(rawStatus), + AppID: appID, + RawStatus: rawStatus, + }, nil +} + +func (s *SAEBackend) List(_ context.Context) ([]WorkerResult, error) { + req := &sae.ListApplicationsRequest{} + req.SetNamespaceId(s.config.NamespaceID) + resp, err := s.client.ListApplications(req) + if err != nil { + return nil, fmt.Errorf("SAE ListApplications: %w", err) + } + + results := make([]WorkerResult, 0) + if resp.Body == nil || resp.Body.Data == nil { + return results, nil + } + + for _, app := range resp.Body.Data.Applications { + if app.AppName == nil || !strings.HasPrefix(*app.AppName, s.containerPrefix) { + continue + } + name := strings.TrimPrefix(*app.AppName, s.containerPrefix) + appID := "" + if app.AppId != nil { + appID = *app.AppId + } + results = append(results, WorkerResult{ + Name: name, + Backend: "sae", + AppID: appID, + }) + } + return results, nil +} + +// --- internal helpers --- + +func (s *SAEBackend) findAppByName(appName string) (string, error) { + req := &sae.ListApplicationsRequest{} + req.SetNamespaceId(s.config.NamespaceID). 
+ SetAppName(appName) + resp, err := s.client.ListApplications(req) + if err != nil { + return "", fmt.Errorf("SAE ListApplications: %w", err) + } + + if resp.Body == nil || resp.Body.Data == nil { + return "", nil + } + + for _, app := range resp.Body.Data.Applications { + if app.AppName != nil && *app.AppName == appName { + if app.AppId != nil { + return *app.AppId, nil + } + } + } + return "", nil +} + +func (s *SAEBackend) buildEnvList(env map[string]string) string { + type envEntry struct { + Name string `json:"name"` + Value string `json:"value"` + } + entries := make([]envEntry, 0, len(env)) + for k, v := range env { + entries = append(entries, envEntry{Name: k, Value: v}) + } + b, _ := json.Marshal(entries) + return string(b) +} + +func normalizeSAEStatus(status string) WorkerStatus { + switch strings.ToUpper(status) { + case "RUNNING": + return StatusRunning + case "STOPPED": + return StatusStopped + case "DEPLOYING": + return StatusStarting + default: + return StatusUnknown + } +} diff --git a/orchestrator/backend/sae_test.go b/orchestrator/backend/sae_test.go new file mode 100644 index 00000000..32950f67 --- /dev/null +++ b/orchestrator/backend/sae_test.go @@ -0,0 +1,260 @@ +package backend + +import ( + "context" + "fmt" + "testing" + + sae "github.com/alibabacloud-go/sae-20190506/v4/client" + "github.com/alibabacloud-go/tea/tea" +) + +// mockSAEClient implements SAEClient for testing. 
+type mockSAEClient struct { + apps map[string]*mockSAEApp // appName -> app +} + +type mockSAEApp struct { + appID string + status string + envs string // JSON array +} + +func newMockSAEClient() *mockSAEClient { + return &mockSAEClient{apps: map[string]*mockSAEApp{}} +} + +func (m *mockSAEClient) CreateApplication(req *sae.CreateApplicationRequest) (*sae.CreateApplicationResponse, error) { + name := *req.AppName + if _, exists := m.apps[name]; exists { + return nil, fmt.Errorf("app %s already exists", name) + } + appID := "app-" + name + m.apps[name] = &mockSAEApp{ + appID: appID, + status: "DEPLOYING", + envs: tea.StringValue(req.Envs), + } + return &sae.CreateApplicationResponse{ + Body: &sae.CreateApplicationResponseBody{ + Data: &sae.CreateApplicationResponseBodyData{ + AppId: tea.String(appID), + }, + }, + }, nil +} + +func (m *mockSAEClient) DeleteApplication(req *sae.DeleteApplicationRequest) (*sae.DeleteApplicationResponse, error) { + for name, app := range m.apps { + if app.appID == *req.AppId { + delete(m.apps, name) + return &sae.DeleteApplicationResponse{}, nil + } + } + return &sae.DeleteApplicationResponse{}, nil +} + +func (m *mockSAEClient) StartApplication(req *sae.StartApplicationRequest) (*sae.StartApplicationResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + app.status = "RUNNING" + return &sae.StartApplicationResponse{}, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) StopApplication(req *sae.StopApplicationRequest) (*sae.StopApplicationResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + app.status = "STOPPED" + return &sae.StopApplicationResponse{}, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) DescribeApplicationStatus(req *sae.DescribeApplicationStatusRequest) (*sae.DescribeApplicationStatusResponse, error) { + for _, app := range m.apps { + if app.appID == *req.AppId { + return 
&sae.DescribeApplicationStatusResponse{ + Body: &sae.DescribeApplicationStatusResponseBody{ + Data: &sae.DescribeApplicationStatusResponseBodyData{ + CurrentStatus: tea.String(app.status), + }, + }, + }, nil + } + } + return nil, fmt.Errorf("app not found") +} + +func (m *mockSAEClient) ListApplications(req *sae.ListApplicationsRequest) (*sae.ListApplicationsResponse, error) { + var apps []*sae.ListApplicationsResponseBodyDataApplications + for name, app := range m.apps { + // Filter by app_name if provided + if req.AppName != nil && *req.AppName != "" && *req.AppName != name { + continue + } + apps = append(apps, &sae.ListApplicationsResponseBodyDataApplications{ + AppId: tea.String(app.appID), + AppName: tea.String(name), + }) + } + return &sae.ListApplicationsResponse{ + Body: &sae.ListApplicationsResponseBody{ + Data: &sae.ListApplicationsResponseBodyData{ + Applications: apps, + }, + }, + }, nil +} + +func newTestSAEBackend(client SAEClient) *SAEBackend { + return NewSAEBackendWithClient(client, SAEConfig{ + Region: "cn-hangzhou", + NamespaceID: "test-ns", + WorkerImage: "hiclaw/worker:latest", + VPCID: "vpc-test", + VSwitchID: "vsw-test", + SecurityGroupID: "sg-test", + }, "hiclaw-worker-") +} + +func TestSAECreate(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + result, err := b.Create(context.Background(), CreateRequest{ + Name: "alice", + Image: "custom:v1", + Env: map[string]string{"KEY": "VAL"}, + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected alice, got %s", result.Name) + } + if result.Backend != "sae" { + t.Errorf("expected sae, got %s", result.Backend) + } + if result.AppID == "" { + t.Error("expected non-empty app ID") + } +} + +func TestSAECreateConflict(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:v1"}) + _, err := b.Create(context.Background(), 
CreateRequest{Name: "alice", Image: "img:v1"}) + if err == nil { + t.Error("expected conflict error") + } +} + +func TestSAEDelete(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "bob", Image: "img:v1"}) + if err := b.Delete(context.Background(), "bob"); err != nil { + t.Fatalf("Delete failed: %v", err) + } + + result, _ := b.Status(context.Background(), "bob") + if result.Status != StatusNotFound { + t.Errorf("expected not_found after delete, got %s", result.Status) + } +} + +func TestSAEDeleteNotFound(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + if err := b.Delete(context.Background(), "ghost"); err != nil { + t.Errorf("delete non-existent should not error, got: %v", err) + } +} + +func TestSAEStartStop(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + b.Create(context.Background(), CreateRequest{Name: "carol", Image: "img:v1"}) + + if err := b.Start(context.Background(), "carol"); err != nil { + t.Fatalf("Start failed: %v", err) + } + result, _ := b.Status(context.Background(), "carol") + if result.Status != StatusRunning { + t.Errorf("expected running, got %s", result.Status) + } + + if err := b.Stop(context.Background(), "carol"); err != nil { + t.Fatalf("Stop failed: %v", err) + } + result, _ = b.Status(context.Background(), "carol") + if result.Status != StatusStopped { + t.Errorf("expected stopped, got %s", result.Status) + } +} + +func TestSAEStartNotFound(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + err := b.Start(context.Background(), "ghost") + if err == nil { + t.Error("expected error for non-existent worker") + } +} + +func TestSAEStatus(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + result, _ := b.Status(context.Background(), "nonexistent") + if result.Status != StatusNotFound { + t.Errorf("expected not_found, got %s", result.Status) + } 
+} + +func TestSAEList(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + workers, _ := b.List(context.Background()) + if len(workers) != 0 { + t.Errorf("expected empty list, got %d", len(workers)) + } + + b.Create(context.Background(), CreateRequest{Name: "w1", Image: "img:v1"}) + b.Create(context.Background(), CreateRequest{Name: "w2", Image: "img:v1"}) + + workers, _ = b.List(context.Background()) + if len(workers) != 2 { + t.Errorf("expected 2 workers, got %d", len(workers)) + } +} + +func TestNormalizeSAEStatus(t *testing.T) { + cases := []struct { + input string + expected WorkerStatus + }{ + {"RUNNING", StatusRunning}, + {"STOPPED", StatusStopped}, + {"DEPLOYING", StatusStarting}, + {"UNKNOWN", StatusUnknown}, + {"", StatusUnknown}, + } + for _, tc := range cases { + got := normalizeSAEStatus(tc.input) + if got != tc.expected { + t.Errorf("normalizeSAEStatus(%q) = %s, want %s", tc.input, got, tc.expected) + } + } +} diff --git a/orchestrator/config.go b/orchestrator/config.go index 0349ab7d..3cc1541f 100644 --- a/orchestrator/config.go +++ b/orchestrator/config.go @@ -1,31 +1,117 @@ package main -import "os" +import ( + "os" + "strconv" + + "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/credentials" +) // Config holds all configuration for the orchestrator service. type Config struct { // ListenAddr is the address to listen on (default ":2375"). ListenAddr string - // SocketPath is the Docker socket path (default "/var/run/docker.sock"). SocketPath string - // ContainerPrefix is the required prefix for worker container names (default "hiclaw-worker-"). ContainerPrefix string - // Runtime is the deployment runtime ("aliyun" for cloud, empty for local). 
Runtime string + + // Auth + ManagerAPIKey string // HICLAW_ORCHESTRATOR_API_KEY + + // SAE Backend + Region string + SAENamespaceID string + SAEWorkerImage string + SAECopawWorkerImage string + SAEVPCID string + SAEVSwitchID string + SAESecurityGroupID string + SAEWorkerCPU int32 + SAEWorkerMemory int32 + + // APIG Gateway + GWGatewayID string + GWModelAPIID string + GWEnvID string + + // STS + OSSBucket string + STSRoleArn string + OIDCProviderArn string + OIDCTokenFile string + + // Orchestrator URL (advertised to SAE workers for STS refresh) + OrchestratorURL string } // LoadConfig reads configuration from environment variables. func LoadConfig() *Config { - c := &Config{ + return &Config{ ListenAddr: envOrDefault("HICLAW_PROXY_LISTEN", ":2375"), SocketPath: envOrDefault("HICLAW_PROXY_SOCKET", "/var/run/docker.sock"), ContainerPrefix: envOrDefault("HICLAW_PROXY_CONTAINER_PREFIX", "hiclaw-worker-"), Runtime: os.Getenv("HICLAW_RUNTIME"), + + ManagerAPIKey: os.Getenv("HICLAW_ORCHESTRATOR_API_KEY"), + + Region: envOrDefault("HICLAW_REGION", "cn-hangzhou"), + SAENamespaceID: os.Getenv("HICLAW_SAE_NAMESPACE_ID"), + SAEWorkerImage: os.Getenv("HICLAW_SAE_WORKER_IMAGE"), + SAECopawWorkerImage: os.Getenv("HICLAW_SAE_COPAW_WORKER_IMAGE"), + SAEVPCID: os.Getenv("HICLAW_SAE_VPC_ID"), + SAEVSwitchID: os.Getenv("HICLAW_SAE_VSWITCH_ID"), + SAESecurityGroupID: os.Getenv("HICLAW_SAE_SECURITY_GROUP_ID"), + SAEWorkerCPU: int32(envOrDefaultInt("HICLAW_SAE_WORKER_CPU", 1000)), + SAEWorkerMemory: int32(envOrDefaultInt("HICLAW_SAE_WORKER_MEMORY", 2048)), + + GWGatewayID: os.Getenv("HICLAW_GW_GATEWAY_ID"), + GWModelAPIID: os.Getenv("HICLAW_GW_MODEL_API_ID"), + GWEnvID: os.Getenv("HICLAW_GW_ENV_ID"), + + OSSBucket: os.Getenv("HICLAW_OSS_BUCKET"), + STSRoleArn: os.Getenv("ALIBABA_CLOUD_ROLE_ARN"), + OIDCProviderArn: os.Getenv("ALIBABA_CLOUD_OIDC_PROVIDER_ARN"), + OIDCTokenFile: os.Getenv("ALIBABA_CLOUD_OIDC_TOKEN_FILE"), + + OrchestratorURL: os.Getenv("HICLAW_ORCHESTRATOR_URL"), + } +} + 
+func (c *Config) SAEConfig() backend.SAEConfig { + return backend.SAEConfig{ + Region: c.Region, + NamespaceID: c.SAENamespaceID, + WorkerImage: c.SAEWorkerImage, + CopawWorkerImage: c.SAECopawWorkerImage, + VPCID: c.SAEVPCID, + VSwitchID: c.SAEVSwitchID, + SecurityGroupID: c.SAESecurityGroupID, + CPU: c.SAEWorkerCPU, + Memory: c.SAEWorkerMemory, + } +} + +func (c *Config) APIGConfig() backend.APIGConfig { + return backend.APIGConfig{ + Region: c.Region, + GatewayID: c.GWGatewayID, + ModelAPIID: c.GWModelAPIID, + EnvID: c.GWEnvID, + } +} + +func (c *Config) STSConfig() credentials.STSConfig { + return credentials.STSConfig{ + Region: c.Region, + RoleArn: c.STSRoleArn, + OIDCProviderArn: c.OIDCProviderArn, + OIDCTokenFile: c.OIDCTokenFile, + OSSBucket: c.OSSBucket, } - return c } func envOrDefault(key, defaultVal string) string { @@ -34,3 +120,12 @@ func envOrDefault(key, defaultVal string) string { } return defaultVal } + +func envOrDefaultInt(key string, defaultVal int) int { + if v := os.Getenv(key); v != "" { + if n, err := strconv.Atoi(v); err == nil { + return n + } + } + return defaultVal +} diff --git a/orchestrator/credentials/handler.go b/orchestrator/credentials/handler.go new file mode 100644 index 00000000..18dc6408 --- /dev/null +++ b/orchestrator/credentials/handler.go @@ -0,0 +1,46 @@ +package credentials + +import ( + "log" + "net/http" + + "github.com/alibaba/hiclaw/orchestrator/auth" + "github.com/alibaba/hiclaw/orchestrator/internal/httputil" +) + +// Handler handles /credentials/* HTTP requests. +type Handler struct { + stsService *STSService +} + +// NewHandler creates a credentials Handler. +func NewHandler(stsService *STSService) *Handler { + return &Handler{stsService: stsService} +} + +// RefreshToken handles POST /credentials/sts. 
+func (h *Handler) RefreshToken(w http.ResponseWriter, r *http.Request) { + if h.stsService == nil { + httputil.WriteError(w, http.StatusServiceUnavailable, "STS service not available (not in cloud mode)") + return + } + + caller := auth.CallerFromContext(r.Context()) + workerName := "" + if caller != nil { + workerName = caller.WorkerName + } + if workerName == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker identity not found in request context") + return + } + + token, err := h.stsService.IssueWorkerToken(r.Context(), workerName) + if err != nil { + log.Printf("[ERROR] issue STS token for worker %s: %v", workerName, err) + httputil.WriteError(w, http.StatusInternalServerError, "failed to issue STS token: "+err.Error()) + return + } + + httputil.WriteJSON(w, http.StatusOK, token) +} diff --git a/orchestrator/credentials/handler_test.go b/orchestrator/credentials/handler_test.go new file mode 100644 index 00000000..a44cca20 --- /dev/null +++ b/orchestrator/credentials/handler_test.go @@ -0,0 +1,101 @@ +package credentials + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/alibaba/hiclaw/orchestrator/auth" +) + +func TestHandlerRefreshToken(t *testing.T) { + // Mock STS endpoint + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + json.NewEncoder(w).Encode(map[string]interface{}{ + "Credentials": map[string]string{ + "AccessKeyId": "test-ak", + "AccessKeySecret": "test-sk", + "SecurityToken": "test-token", + "Expiration": "2026-03-26T12:00:00Z", + }, + }) + })) + defer mockSTS.Close() + + tmpFile, _ := os.CreateTemp("", "oidc-*") + tmpFile.WriteString("mock-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: "test-bucket", + }, 
mockSTS.Client()) + svc.endpointOverride = mockSTS.URL + + h := NewHandler(svc) + + // Build request with worker identity in context + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + ctx := context.WithValue(req.Context(), auth.CallerKeyForTest(), &auth.CallerIdentity{ + Role: "worker", + WorkerName: "alice", + }) + req = req.WithContext(ctx) + + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusOK { + t.Fatalf("expected 200, got %d: %s", w.Code, w.Body.String()) + } + + var token STSToken + json.NewDecoder(w.Body).Decode(&token) + if token.AccessKeyID != "test-ak" { + t.Errorf("expected test-ak, got %s", token.AccessKeyID) + } + if token.OSSBucket != "test-bucket" { + t.Errorf("expected test-bucket, got %s", token.OSSBucket) + } +} + +func TestHandlerNoSTSService(t *testing.T) { + h := NewHandler(nil) + + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusServiceUnavailable { + t.Errorf("expected 503, got %d", w.Code) + } +} + +func TestHandlerMissingWorkerIdentity(t *testing.T) { + tmpFile, _ := os.CreateTemp("", "oidc-*") + tmpFile.WriteString("mock-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSService(STSConfig{ + Region: "cn-hangzhou", + OIDCTokenFile: tmpFile.Name(), + }) + h := NewHandler(svc) + + // Request without caller identity in context + req := httptest.NewRequest(http.MethodPost, "/credentials/sts", nil) + w := httptest.NewRecorder() + h.RefreshToken(w, req) + + if w.Code != http.StatusBadRequest { + t.Errorf("expected 400, got %d", w.Code) + } +} diff --git a/orchestrator/credentials/sts.go b/orchestrator/credentials/sts.go new file mode 100644 index 00000000..c067ec17 --- /dev/null +++ b/orchestrator/credentials/sts.go @@ -0,0 +1,149 @@ +package credentials + +import ( + "context" + "encoding/json" + "fmt" + "io" + "net/http" + "net/url" + "os" + "strings" + "time" 
+) + +// STSConfig holds configuration for the STS token service. +type STSConfig struct { + Region string + RoleArn string + OIDCProviderArn string + OIDCTokenFile string + OSSBucket string +} + +func (c STSConfig) endpoint() string { + return fmt.Sprintf("https://sts-vpc.%s.aliyuncs.com", c.Region) +} + +// STSService issues scoped STS tokens to workers via AssumeRoleWithOIDC. +type STSService struct { + config STSConfig + httpClient *http.Client + endpointOverride string // for testing +} + +// NewSTSService creates an STS service. +func NewSTSService(config STSConfig) *STSService { + return &STSService{ + config: config, + httpClient: &http.Client{Timeout: 30 * time.Second}, + } +} + +// NewSTSServiceWithClient creates an STS service with a custom HTTP client (for testing). +func NewSTSServiceWithClient(config STSConfig, client *http.Client) *STSService { + return &STSService{ + config: config, + httpClient: client, + } +} + +// IssueWorkerToken calls AssumeRoleWithOIDC with an inline policy scoped to the worker. 
+func (s *STSService) IssueWorkerToken(ctx context.Context, workerName string) (*STSToken, error) { + oidcToken, err := os.ReadFile(s.config.OIDCTokenFile) + if err != nil { + return nil, fmt.Errorf("read OIDC token file: %w", err) + } + + policy := BuildWorkerPolicy(s.config.OSSBucket, workerName) + endpoint := s.config.endpoint() + if s.endpointOverride != "" { + endpoint = s.endpointOverride + } + + form := url.Values{ + "Action": {"AssumeRoleWithOIDC"}, + "Format": {"JSON"}, + "Version": {"2015-04-01"}, + "RoleArn": {s.config.RoleArn}, + "OIDCProviderArn": {s.config.OIDCProviderArn}, + "OIDCToken": {strings.TrimSpace(string(oidcToken))}, + "RoleSessionName": {fmt.Sprintf("hiclaw-worker-%s", workerName)}, + "DurationSeconds": {"3600"}, + "Policy": {policy}, + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(form.Encode())) + if err != nil { + return nil, fmt.Errorf("build STS request: %w", err) + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + + resp, err := s.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("STS request failed: %w", err) + } + defer resp.Body.Close() + + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("STS returned %d: %s", resp.StatusCode, string(body)) + } + + var stsResp struct { + Credentials struct { + AccessKeyId string `json:"AccessKeyId"` + AccessKeySecret string `json:"AccessKeySecret"` + SecurityToken string `json:"SecurityToken"` + Expiration string `json:"Expiration"` + } `json:"Credentials"` + } + if err := json.Unmarshal(body, &stsResp); err != nil { + return nil, fmt.Errorf("parse STS response: %w", err) + } + + ossEndpoint := fmt.Sprintf("oss-%s-internal.aliyuncs.com", s.config.Region) + + return &STSToken{ + AccessKeyID: stsResp.Credentials.AccessKeyId, + AccessKeySecret: stsResp.Credentials.AccessKeySecret, + SecurityToken: stsResp.Credentials.SecurityToken, + Expiration: 
stsResp.Credentials.Expiration, + ExpiresInSec: 3600, + OSSEndpoint: ossEndpoint, + OSSBucket: s.config.OSSBucket, + }, nil +} + +// BuildWorkerPolicy generates an OSS inline policy restricting access to +// agents/{workerName}/* and shared/*. +func BuildWorkerPolicy(bucket, workerName string) string { + policy := map[string]interface{}{ + "Version": "1", + "Statement": []map[string]interface{}{ + { + "Effect": "Allow", + "Action": []string{"oss:ListObjects"}, + "Resource": []string{fmt.Sprintf("acs:oss:*:*:%s", bucket)}, + "Condition": map[string]interface{}{ + "StringLike": map[string]interface{}{ + "oss:Prefix": []string{ + fmt.Sprintf("agents/%s/*", workerName), + "shared/*", + }, + }, + }, + }, + { + "Effect": "Allow", + "Action": []string{"oss:GetObject", "oss:PutObject", "oss:DeleteObject"}, + "Resource": []string{ + fmt.Sprintf("acs:oss:*:*:%s/agents/%s/*", bucket, workerName), + fmt.Sprintf("acs:oss:*:*:%s/shared/*", bucket), + }, + }, + }, + } + b, _ := json.Marshal(policy) + return string(b) +} diff --git a/orchestrator/credentials/sts_test.go b/orchestrator/credentials/sts_test.go new file mode 100644 index 00000000..4d5dbee8 --- /dev/null +++ b/orchestrator/credentials/sts_test.go @@ -0,0 +1,136 @@ +package credentials + +import ( + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "testing" +) + +func TestBuildWorkerPolicy(t *testing.T) { + policy := BuildWorkerPolicy("my-bucket", "alice") + + var parsed map[string]interface{} + if err := json.Unmarshal([]byte(policy), &parsed); err != nil { + t.Fatalf("policy is not valid JSON: %v", err) + } + + stmts, ok := parsed["Statement"].([]interface{}) + if !ok || len(stmts) != 2 { + t.Fatalf("expected 2 statements, got %v", parsed["Statement"]) + } + + // Check ListObjects statement has correct condition + stmt0 := stmts[0].(map[string]interface{}) + cond := stmt0["Condition"].(map[string]interface{}) + sl := cond["StringLike"].(map[string]interface{}) + prefixes := sl["oss:Prefix"].([]interface{}) 
+ if prefixes[0] != "agents/alice/*" { + t.Errorf("expected agents/alice/*, got %v", prefixes[0]) + } + if prefixes[1] != "shared/*" { + t.Errorf("expected shared/*, got %v", prefixes[1]) + } + + // Check read/write statement has correct resources + stmt1 := stmts[1].(map[string]interface{}) + resources := stmt1["Resource"].([]interface{}) + if resources[0] != "acs:oss:*:*:my-bucket/agents/alice/*" { + t.Errorf("unexpected resource: %v", resources[0]) + } + if resources[1] != "acs:oss:*:*:my-bucket/shared/*" { + t.Errorf("unexpected resource: %v", resources[1]) + } +} + +func TestIssueWorkerToken(t *testing.T) { + // Mock STS endpoint + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + t.Errorf("expected POST, got %s", r.Method) + } + r.ParseForm() + if r.FormValue("Action") != "AssumeRoleWithOIDC" { + t.Errorf("expected AssumeRoleWithOIDC action") + } + if r.FormValue("DurationSeconds") != "3600" { + t.Errorf("expected 3600 duration") + } + // Verify policy contains worker name + policy := r.FormValue("Policy") + if policy == "" { + t.Error("expected non-empty policy") + } + var parsed map[string]interface{} + json.Unmarshal([]byte(policy), &parsed) + + json.NewEncoder(w).Encode(map[string]interface{}{ + "Credentials": map[string]string{ + "AccessKeyId": "test-ak", + "AccessKeySecret": "test-sk", + "SecurityToken": "test-token", + "Expiration": "2026-03-26T12:00:00Z", + }, + }) + })) + defer mockSTS.Close() + + // Write temp OIDC token file + tmpFile, _ := os.CreateTemp("", "oidc-token-*") + tmpFile.WriteString("mock-oidc-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: "test-bucket", + }, mockSTS.Client()) + + // Override endpoint to use mock server + 
svc.endpointOverride = mockSTS.URL + + token, err := svc.IssueWorkerToken(t.Context(), "alice") + if err != nil { + t.Fatalf("IssueWorkerToken failed: %v", err) + } + if token.AccessKeyID != "test-ak" { + t.Errorf("expected test-ak, got %s", token.AccessKeyID) + } + if token.OSSBucket != "test-bucket" { + t.Errorf("expected test-bucket, got %s", token.OSSBucket) + } + if token.ExpiresInSec != 3600 { + t.Errorf("expected 3600, got %d", token.ExpiresInSec) + } +} + +func TestIssueWorkerTokenSTSError(t *testing.T) { + mockSTS := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusForbidden) + w.Write([]byte(`{"Code":"NoPermission","Message":"forbidden"}`)) + })) + defer mockSTS.Close() + + tmpFile, _ := os.CreateTemp("", "oidc-token-*") + tmpFile.WriteString("mock-oidc-token") + tmpFile.Close() + defer os.Remove(tmpFile.Name()) + + svc := NewSTSServiceWithClient(STSConfig{ + Region: "cn-hangzhou", + RoleArn: "acs:ram::123:role/test", + OIDCProviderArn: "acs:ram::123:oidc-provider/test", + OIDCTokenFile: tmpFile.Name(), + OSSBucket: "test-bucket", + }, mockSTS.Client()) + svc.endpointOverride = mockSTS.URL + + _, err := svc.IssueWorkerToken(t.Context(), "alice") + if err == nil { + t.Error("expected error for STS 403") + } +} diff --git a/orchestrator/credentials/types.go b/orchestrator/credentials/types.go new file mode 100644 index 00000000..3b1d26c8 --- /dev/null +++ b/orchestrator/credentials/types.go @@ -0,0 +1,12 @@ +package credentials + +// STSToken holds temporary credentials issued to a worker. 
+type STSToken struct {
+	AccessKeyID     string `json:"access_key_id"`
+	AccessKeySecret string `json:"access_key_secret"`
+	SecurityToken   string `json:"security_token"`
+	Expiration      string `json:"expiration"`
+	ExpiresInSec    int    `json:"expires_in_sec"`
+	OSSEndpoint     string `json:"oss_endpoint"`
+	OSSBucket       string `json:"oss_bucket"`
+}
diff --git a/orchestrator/go.mod b/orchestrator/go.mod
index e825d291..d769583a 100644
--- a/orchestrator/go.mod
+++ b/orchestrator/go.mod
@@ -1,3 +1,24 @@
 module github.com/alibaba/hiclaw/orchestrator
 
-go 1.23
+go 1.24
+
+require (
+	github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16
+	github.com/alibabacloud-go/sae-20190506/v4 v4.11.5
+	github.com/alibabacloud-go/tea v1.3.13
+	github.com/aliyun/credentials-go v1.4.12
+)
+
+require (
+	github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 // indirect
+	github.com/alibabacloud-go/apig-20240327/v6 v6.0.6 // indirect
+	github.com/alibabacloud-go/debug v1.0.1 // indirect
+	github.com/alibabacloud-go/tea-utils/v2 v2.0.7 // indirect
+	github.com/clbanning/mxj/v2 v2.7.0 // indirect
+	github.com/json-iterator/go v1.1.12 // indirect
+	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
+	github.com/modern-go/reflect2 v1.0.2 // indirect
+	github.com/tjfoc/gmsm v1.4.1 // indirect
+	golang.org/x/net v0.26.0 // indirect
+	gopkg.in/ini.v1 v1.67.0 // indirect
+)
diff --git a/orchestrator/go.sum b/orchestrator/go.sum
new file mode 100644
index 00000000..0fb0ac49
--- /dev/null
+++ b/orchestrator/go.sum
@@ -0,0 +1,251 @@
+cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
+github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
+github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6 h1:eIf+iGJxdU4U9ypaUfbtOWCsZSbTb8AUHvyPrxu6mAA=
+github.com/alibabacloud-go/alibabacloud-gateway-pop v0.0.6/go.mod h1:4EUIoxs/do24zMOGGqYVWgw0s9NtiylnJglOeEB5UJo=
+github.com/alibabacloud-go/alibabacloud-gateway-spi 
v0.0.4/go.mod h1:sCavSAvdzOjul4cEqeVtvlSaSScfNsTQ+46HwlTL1hc= +github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5 h1:zE8vH9C7JiZLNJJQ5OwjU9mSi4T9ef9u3BURT6LCLC8= +github.com/alibabacloud-go/alibabacloud-gateway-spi v0.0.5/go.mod h1:tWnyE9AjF8J8qqLk645oUmVUnFybApTQWklQmi5tY6g= +github.com/alibabacloud-go/apig-20240327/v6 v6.0.6 h1:5W4QYdzTfCQiwdhDmflbAp5NV07ps3IcGqpvPAR0ZbU= +github.com/alibabacloud-go/apig-20240327/v6 v6.0.6/go.mod h1:VCQaugCTmRp5E1HXWFnCdpJP+UVSFkaJBn787UpR6Qw= +github.com/alibabacloud-go/darabonba-array v0.1.0 h1:vR8s7b1fWAQIjEjWnuF0JiKsCvclSRTfDzZHTYqfufY= +github.com/alibabacloud-go/darabonba-array v0.1.0/go.mod h1:BLKxr0brnggqOJPqT09DFJ8g3fsDshapUD3C3aOEFaI= +github.com/alibabacloud-go/darabonba-encode-util v0.0.2 h1:1uJGrbsGEVqWcWxrS9MyC2NG0Ax+GpOM5gtupki31XE= +github.com/alibabacloud-go/darabonba-encode-util v0.0.2/go.mod h1:JiW9higWHYXm7F4PKuMgEUETNZasrDM6vqVr/Can7H8= +github.com/alibabacloud-go/darabonba-map v0.0.2 h1:qvPnGB4+dJbJIxOOfawxzF3hzMnIpjmafa0qOTp6udc= +github.com/alibabacloud-go/darabonba-map v0.0.2/go.mod h1:28AJaX8FOE/ym8OUFWga+MtEzBunJwQGceGQlvaPGPc= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.14/go.mod h1:lxFGfobinVsQ49ntjpgWghXmIF0/Sm4+wvBJ1h5RtaE= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16 h1:LHhjxZkNWAKWepxcWyzgFgo0X6TUVhL7sC7ANc60p8A= +github.com/alibabacloud-go/darabonba-openapi/v2 v2.1.16/go.mod h1:lxFGfobinVsQ49ntjpgWghXmIF0/Sm4+wvBJ1h5RtaE= +github.com/alibabacloud-go/darabonba-signature-util v0.0.7 h1:UzCnKvsjPFzApvODDNEYqBHMFt1w98wC7FOo0InLyxg= +github.com/alibabacloud-go/darabonba-signature-util v0.0.7/go.mod h1:oUzCYV2fcCH797xKdL6BDH8ADIHlzrtKVjeRtunBNTQ= +github.com/alibabacloud-go/darabonba-string v1.0.2 h1:E714wms5ibdzCqGeYJ9JCFywE5nDyvIXIIQbZVFkkqo= +github.com/alibabacloud-go/darabonba-string v1.0.2/go.mod h1:93cTfV3vuPhhEwGGpKKqhVW4jLe7tDpo3LUM0i0g6mA= +github.com/alibabacloud-go/debug v0.0.0-20190504072949-9472017b5c68/go.mod h1:6pb/Qy8c+lqua8cFpEy7g39NRRqOWc3rOwAy8m5Y2BY= 
+github.com/alibabacloud-go/debug v1.0.0/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= +github.com/alibabacloud-go/debug v1.0.1 h1:MsW9SmUtbb1Fnt3ieC6NNZi6aEwrXfDksD4QA6GSbPg= +github.com/alibabacloud-go/debug v1.0.1/go.mod h1:8gfgZCCAC3+SCzjWtY053FrOcd4/qlH6IHTI4QyICOc= +github.com/alibabacloud-go/endpoint-util v1.1.0 h1:r/4D3VSw888XGaeNpP994zDUaxdgTSHBbVfZlzf6b5Q= +github.com/alibabacloud-go/endpoint-util v1.1.0/go.mod h1:O5FuCALmCKs2Ff7JFJMudHs0I5EBgecXXxZRyswlEjE= +github.com/alibabacloud-go/openapi-util v0.1.0 h1:0z75cIULkDrdEhkLWgi9tnLe+KhAFE/r5Pb3312/eAY= +github.com/alibabacloud-go/openapi-util v0.1.0/go.mod h1:sQuElr4ywwFRlCCberQwKRFhRzIyG4QTP/P4y1CJ6Ws= +github.com/alibabacloud-go/sae-20190506/v4 v4.11.5 h1:UXZ7qgJW/E7LIYrLpmUl7riMt+gf0EQTnAz+B0P44FQ= +github.com/alibabacloud-go/sae-20190506/v4 v4.11.5/go.mod h1:6g/gfr1piYjVZWKmnX6OqnVOiQK21Dxi1ra11Y5xuRM= +github.com/alibabacloud-go/tea v1.1.0/go.mod h1:IkGyUSX4Ba1V+k4pCtJUc6jDpZLFph9QMy2VUPTwukg= +github.com/alibabacloud-go/tea v1.1.7/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.8/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.11/go.mod h1:/tmnEaQMyb4Ky1/5D+SE1BAsa5zj/KeGOFfwYm3N/p4= +github.com/alibabacloud-go/tea v1.1.17/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= +github.com/alibabacloud-go/tea v1.1.20/go.mod h1:nXxjm6CIFkBhwW4FQkNrolwbfon8Svy6cujmKFUq98A= +github.com/alibabacloud-go/tea v1.2.2/go.mod h1:CF3vOzEMAG+bR4WOql8gc2G9H3EkH3ZLAQdpmpXMgwk= +github.com/alibabacloud-go/tea v1.3.13 h1:WhGy6LIXaMbBM6VBYcsDCz6K/TPsT1Ri2hPmmZffZ94= +github.com/alibabacloud-go/tea v1.3.13/go.mod h1:A560v/JTQ1n5zklt2BEpurJzZTI8TUT+Psg2drWlxRg= +github.com/alibabacloud-go/tea-utils v1.3.1 h1:iWQeRzRheqCMuiF3+XkfybB3kTgUXkXX+JMrqfLeB2I= +github.com/alibabacloud-go/tea-utils v1.3.1/go.mod h1:EI/o33aBfj3hETm4RLiAxF/ThQdSngxrpF8rKUDJjPE= +github.com/alibabacloud-go/tea-utils/v2 v2.0.5/go.mod 
h1:dL6vbUT35E4F4bFTHL845eUloqaerYBYPsdWR2/jhe4= +github.com/alibabacloud-go/tea-utils/v2 v2.0.7 h1:WDx5qW3Xa5ZgJ1c8NfqJkF6w+AU5wB8835UdhPr6Ax0= +github.com/alibabacloud-go/tea-utils/v2 v2.0.7/go.mod h1:qxn986l+q33J5VkialKMqT/TTs3E+U9MJpd001iWQ9I= +github.com/aliyun/credentials-go v1.1.2/go.mod h1:ozcZaMR5kLM7pwtCMEpVmQ242suV6qTJya2bDq4X1Tw= +github.com/aliyun/credentials-go v1.3.1/go.mod h1:8jKYhQuDawt8x2+fusqa1Y6mPxemTsBEN04dgcAcYz0= +github.com/aliyun/credentials-go v1.3.6/go.mod h1:1LxUuX7L5YrZUWzBrRyk0SwSdH4OmPrib8NVePL3fxM= +github.com/aliyun/credentials-go v1.4.5/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U= +github.com/aliyun/credentials-go v1.4.12 h1:7D8eXGotNwthZuUEgAMgBoqxmIHwfaPVwW+/04LIJSQ= +github.com/aliyun/credentials-go v1.4.12/go.mod h1:Jm6d+xIgwJVLVWT561vy67ZRP4lPTQxMbEYRuT2Ti1U= +github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= +github.com/clbanning/mxj/v2 v2.7.0 h1:WA/La7UGCanFe5NpHF0Q3DNtnCsVoxbPKuyBNHWRyME= +github.com/clbanning/mxj/v2 v2.7.0/go.mod h1:hNiWqW14h+kc+MdF9C6/YoRfjEJoR3ou6tn/Qo+ve2s= +github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw= +github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= +github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= +github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= +github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= +github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= 
+github.com/golang/mock v1.1.1/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfbm0A= +github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U= +github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw= +github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8= +github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA= +github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs= +github.com/golang/protobuf v1.4.0-rc.4.0.20200313231945-b860323f09d0/go.mod h1:WU3c8KckQ9AFe+yFwt9sWVRKCVIyN9cPHBJSNnbL67w= +github.com/golang/protobuf v1.4.0/go.mod h1:jodUvKwWbYaEsadDk5Fwe5c77LiNKVO9IDvqG2KuDX0= +github.com/golang/protobuf v1.4.2/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw735rRwI= +github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= +github.com/google/go-cmp v0.3.0/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMywk6iLU= +github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI= +github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= +github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/gopherjs/gopherjs v0.0.0-20200217142428-fce0ec30dd00/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= +github.com/json-iterator/go v1.1.10/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= +github.com/json-iterator/go v1.1.12 
h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= +github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= +github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= +github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= +github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/modern-go/reflect2 v1.0.2 h1:xBagoLtFs94CBntxluKeaWgTMpvLxC4ur3nMaC9Gz0M= +github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/niemeyer/pretty v0.0.0-20200227124842-a10e7caefd8e/go.mod h1:zD1mROLANZcx1PVRCS0qkT7pwLkGfwJo4zjcN/Tysno= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= +github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= +github.com/smartystreets/assertions v1.1.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= +github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= 
+github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.5.1 h1:nOGnQDM7FYENwehXlg/kFVnos3rEvtKTjRvOWSzb6H4= +github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= +github.com/tjfoc/gmsm v1.3.2/go.mod h1:HaUcFuY0auTiaHB9MHFGCPx5IaLhTUd2atbCFBQXn9w= +github.com/tjfoc/gmsm v1.4.1 h1:aMe1GlZb+0bLjn+cKTPEvvn9oUEBlJitaZiiBwsbgho= +github.com/tjfoc/gmsm v1.4.1/go.mod h1:j4INPkHWMrhJb38G+J6W4Tw0AbuN8Thu3PbdVYhVcTE= +github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.1.30/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20191219195013-becbf705a915/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= +golang.org/x/crypto v0.13.0/go.mod h1:y6Z2r+Rw4iayiXXAIxJIDAJ1zMW4yaTpebo8fPOliYc= +golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/crypto v0.18.0/go.mod h1:R0j02AL6hcrfOiy9T4ZYp/rcWeMxM3L6QYxlOuEG1mg= +golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 
+golang.org/x/crypto v0.21.0/go.mod h1:0BP7YvVV9gBbVKyeTG0Gyn+gZm94bibOW5BjDEYAOMs= +golang.org/x/crypto v0.23.0/go.mod h1:CKFgDieR+mRhux2Lsu27y0fO304Db0wZe70UKqHu0v8= +golang.org/x/crypto v0.24.0/go.mod h1:Z1PMYSOR5nyMcyAVAIQSKCDwalqy85Aqn1x3Ws4L5DM= +golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= +golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= +golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= +golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= +golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= +golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.12.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.15.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190213061140-3a22650c66bd/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= +golang.org/x/net v0.0.0-20190311183353-d8887717615a/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net 
v0.0.0-20200506145744-7e3656a0809f/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= +golang.org/x/net v0.0.0-20201010224723-4f7140c49acb/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= +golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= +golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= +golang.org/x/net v0.15.0/go.mod h1:idbUs1IY1+zTqbi8yxTbhexhEEk5ur9LInksu6HrEpk= +golang.org/x/net v0.17.0/go.mod h1:NxSsAGuq816PNPmqtQdLE42eU2Fs7NoRIZrHJAlaCOE= +golang.org/x/net v0.20.0/go.mod h1:z8BVo6PvndSri0LbOE3hAn0apkU+1YvI6E70E9jsnvY= +golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= +golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg= +golang.org/x/net v0.25.0/go.mod h1:JkAGAh7GEvH74S6FOH42FLoXpXbE/aqXSrIQjXgsiwM= +golang.org/x/net v0.26.0 h1:soB7SVo0PWrY4vPW/+ay0jKDNScG2X9wFeYlXIvJsOQ= +golang.org/x/net v0.26.0/go.mod h1:5YKkiSynbBIh3p6iOc/vibscux0x38BZDkn8sCUPxHE= +golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= +golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20200317015054-43a5402ce75a/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync 
v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.3.0/go.mod h1:FU7BRWz2tNW+3quACPkgCx/L+uEAv1htQ0V83Z9Rj+Y= +golang.org/x/sync v0.6.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200509044756-6aff5f38e54f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.16.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys 
v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.20.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/sys v0.21.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/telemetry v0.0.0-20240228155512-f48c80bd79b2/go.mod h1:TeRTkGYfJXctD9OcfyVLyj2J3IxLnKwHJR8f4D8a3YE= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= +golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= +golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= +golang.org/x/term v0.12.0/go.mod h1:owVbMEjm3cBLCHdkQu9b1opXd4ETQWc3BhuQGKgXgvU= +golang.org/x/term v0.13.0/go.mod h1:LTmsnFJwVN6bCy1rVCoS+qHT1HhALEFxKncY3WNNh4U= +golang.org/x/term v0.16.0/go.mod h1:yn7UURbUtPyrVJPGPq404EukNFxcm/foM+bV/bfcDsY= +golang.org/x/term v0.17.0/go.mod h1:lLRBjIVuehSbZlaOtGMbcMncT+aqLLLmKrsjNrUguwk= +golang.org/x/term v0.18.0/go.mod h1:ILwASektA3OnRv7amZ1xhE/KTR+u50pbXfZ03+6Nx58= +golang.org/x/term v0.20.0/go.mod h1:8UkIAJTvZgivsXaD6/pH6U9ecQzZ45awqEOzuCvwpFY= +golang.org/x/term v0.21.0/go.mod h1:ooXLefLobQVslOqselCNF4SxFAaoS6KujMbsGzSDmX0= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= +golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.15.0/go.mod 
h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= +golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= +golang.org/x/tools v0.0.0-20190311212946-11955173bddd/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190328211700-ab21143f2384/go.mod h1:LCzVGOaR6xXOjkQ3onu1FJEFr0SW1gC7cKk1uF8kGRs= +golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBnxXauZ1Gv1EHHAz9KjViQ78xBX0Q= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= +golang.org/x/tools v0.0.0-20200509030707-2212a7e161a5/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= +golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= +golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= +golang.org/x/tools v0.13.0/go.mod h1:HvlwmtVNQAhOuCjW7xxvovg8wbNq7LwfXh/k7wXUl58= +golang.org/x/tools v0.21.1-0.20240508182429-e35e4ccd0d2d/go.mod h1:aiJjzUbINMkxbQROHiO6hDPo2LHcIPhhQsa9DLh0yGk= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM= +google.golang.org/appengine v1.4.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4= +google.golang.org/genproto v0.0.0-20180817151627-c66870c02cf8/go.mod 
h1:JiN7NxoALGmiZfu7CAH4rXhgtRTLTxftemlI0sWmxmc= +google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc= +google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= +google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg= +google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY= +google.golang.org/grpc v1.31.0/go.mod h1:N36X2cJ7JwdamYAgDz+s+rVMFjt3numwzf/HckM8pak= +google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= +google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= +google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM= +google.golang.org/protobuf v1.20.1-0.20200309200217-e05f789c0967/go.mod h1:A+miEFZTKqfCUM6K7xSMQL9OKL/b6hQv+e19PK+JZNE= +google.golang.org/protobuf v1.21.0/go.mod h1:47Nbq4nVaFHyn7ilMalzfO3qCViNmqZ2kzikPIcrTAo= +google.golang.org/protobuf v1.23.0/go.mod h1:EGpADcykh3NcUnDUJcl1+ZksZNG86OlYog2l/sGQquU= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/check.v1 v1.0.0-20200227125254-8fa46927fb4f/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/ini.v1 v1.56.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= +gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= +gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10= +gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= +honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= +honnef.co/go/tools 
v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= diff --git a/orchestrator/internal/httputil/response.go b/orchestrator/internal/httputil/response.go new file mode 100644 index 00000000..a1accc42 --- /dev/null +++ b/orchestrator/internal/httputil/response.go @@ -0,0 +1,26 @@ +package httputil + +import ( + "encoding/json" + "log" + "net/http" +) + +// ErrorResponse is the standard JSON error response. +type ErrorResponse struct { + Message string `json:"message"` +} + +// WriteJSON writes a JSON response with the given status code. +func WriteJSON(w http.ResponseWriter, status int, v interface{}) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(v); err != nil { + log.Printf("[WARN] failed to write JSON response: %v", err) + } +} + +// WriteError writes a JSON error response. +func WriteError(w http.ResponseWriter, status int, message string) { + WriteJSON(w, status, ErrorResponse{Message: message}) +} diff --git a/orchestrator/main.go b/orchestrator/main.go index 4d3db865..7d2d8ce1 100644 --- a/orchestrator/main.go +++ b/orchestrator/main.go @@ -1,60 +1,124 @@ package main import ( + "context" "log" "net/http" "github.com/alibaba/hiclaw/orchestrator/api" + authpkg "github.com/alibaba/hiclaw/orchestrator/auth" "github.com/alibaba/hiclaw/orchestrator/backend" + "github.com/alibaba/hiclaw/orchestrator/credentials" "github.com/alibaba/hiclaw/orchestrator/proxy" ) func main() { cfg := LoadConfig() + // --- Cloud credentials (shared by SAE, APIG, STS, OSS key persistence) --- + var cloudCreds backend.CloudCredentialProvider + if cfg.Runtime == "aliyun" { + cloudCreds = backend.NewDefaultCloudCredentialProvider() + } + + // --- Auth --- + var persister authpkg.KeyPersister + if cfg.Runtime == "aliyun" && cloudCreds != nil && cfg.OSSBucket != "" { + cred, err := cloudCreds.GetCredential() + if err != nil { + log.Printf("[WARN] Failed to get credentials for key 
persistence: %v", err) + } else { + persister = authpkg.NewOSSKeyPersister(cfg.Region, cfg.OSSBucket, cred) + } + } + + keyStore := authpkg.NewKeyStore(cfg.ManagerAPIKey, persister) + if err := keyStore.Recover(context.Background()); err != nil { + log.Printf("[WARN] Failed to recover worker keys: %v", err) + } + authMw := authpkg.NewMiddleware(keyStore) + // --- Security validator (for Docker API passthrough) --- validator := proxy.NewSecurityValidator() // --- Docker API passthrough handler --- proxyHandler := proxy.NewHandler(cfg.SocketPath, validator) - // --- Backend registry --- + // --- Worker backends --- var workerBackends []backend.WorkerBackend // Docker backend (always registered; Available() checks socket at runtime) dockerBackend := backend.NewDockerBackend(cfg.SocketPath, cfg.ContainerPrefix) workerBackends = append(workerBackends, dockerBackend) - // Future: SAE backend (Phase 2) - // if cfg.Runtime == "aliyun" { ... } + // SAE backend (cloud mode) + var saeBackend *backend.SAEBackend + if cfg.Runtime == "aliyun" && cloudCreds != nil { + var err error + saeBackend, err = backend.NewSAEBackend(cloudCreds, cfg.SAEConfig(), cfg.ContainerPrefix) + if err != nil { + log.Printf("[WARN] Failed to create SAE backend: %v", err) + } else { + workerBackends = append(workerBackends, saeBackend) + } + } + + // --- Gateway backends --- + var gatewayBackends []backend.GatewayBackend + if cfg.Runtime == "aliyun" && cloudCreds != nil { + apigBackend, err := backend.NewAPIGBackend(cloudCreds, cfg.APIGConfig()) + if err != nil { + log.Printf("[WARN] Failed to create APIG backend: %v", err) + } else { + gatewayBackends = append(gatewayBackends, apigBackend) + } + } + + registry := backend.NewRegistry(workerBackends, gatewayBackends) - registry := backend.NewRegistry(workerBackends, nil) + // --- STS service --- + var stsService *credentials.STSService + if cfg.Runtime == "aliyun" && cfg.OIDCTokenFile != "" { + stsService = credentials.NewSTSService(cfg.STSConfig()) + } 
// --- API handlers --- - workerHandler := api.NewWorkerHandler(registry) - gatewayHandler := api.NewGatewayHandler() + workerHandler := api.NewWorkerHandler(registry, keyStore, cfg.OrchestratorURL) + gatewayHandler := api.NewGatewayHandler(registry) + stsHandler := credentials.NewHandler(stsService) - // --- Route registration --- + // --- Route registration with auth --- mux := http.NewServeMux() - // Worker lifecycle API - mux.HandleFunc("POST /workers", workerHandler.Create) - mux.HandleFunc("GET /workers", workerHandler.List) - mux.HandleFunc("GET /workers/{name}", workerHandler.Status) - mux.HandleFunc("POST /workers/{name}/start", workerHandler.Start) - mux.HandleFunc("POST /workers/{name}/stop", workerHandler.Stop) - mux.HandleFunc("DELETE /workers/{name}", workerHandler.Delete) + // Worker lifecycle API — manager only + mux.Handle("POST /workers", authMw.RequireManager(http.HandlerFunc(workerHandler.Create))) + mux.Handle("GET /workers", authMw.RequireManager(http.HandlerFunc(workerHandler.List))) + mux.Handle("GET /workers/{name}", authMw.RequireManager(http.HandlerFunc(workerHandler.Status))) + mux.Handle("POST /workers/{name}/start", authMw.RequireManager(http.HandlerFunc(workerHandler.Start))) + mux.Handle("POST /workers/{name}/stop", authMw.RequireManager(http.HandlerFunc(workerHandler.Stop))) + mux.Handle("DELETE /workers/{name}", authMw.RequireManager(http.HandlerFunc(workerHandler.Delete))) - // Gateway API (Phase 1: 501 stubs) - mux.HandleFunc("POST /gateway/consumers", gatewayHandler.CreateConsumer) - mux.HandleFunc("POST /gateway/consumers/{id}/bind", gatewayHandler.BindConsumer) - mux.HandleFunc("DELETE /gateway/consumers/{id}", gatewayHandler.DeleteConsumer) + // Gateway API — manager only + mux.Handle("POST /gateway/consumers", authMw.RequireManager(http.HandlerFunc(gatewayHandler.CreateConsumer))) + mux.Handle("POST /gateway/consumers/{id}/bind", authMw.RequireManager(http.HandlerFunc(gatewayHandler.BindConsumer))) + mux.Handle("DELETE 
/gateway/consumers/{id}", authMw.RequireManager(http.HandlerFunc(gatewayHandler.DeleteConsumer))) - // Docker API passthrough (catch-all, existing behavior) - mux.Handle("/", proxyHandler) + // STS token refresh — workers only + mux.Handle("POST /credentials/sts", authMw.RequireWorker(http.HandlerFunc(stsHandler.RefreshToken))) + + // Docker API passthrough (catch-all) — manager only + mux.Handle("/", authMw.RequireManager(proxyHandler)) // --- Start server --- - log.Printf("hiclaw-orchestrator listening on %s, docker socket: %s", cfg.ListenAddr, cfg.SocketPath) + log.Printf("hiclaw-orchestrator listening on %s", cfg.ListenAddr) + if cfg.Runtime == "aliyun" { + log.Printf("Cloud mode: SAE=%v, APIG=%v, STS=%v", saeBackend != nil, len(gatewayBackends) > 0, stsService != nil) + } else { + log.Printf("Local mode: docker socket=%s", cfg.SocketPath) + } + if keyStore.AuthEnabled() { + log.Printf("Auth: enabled (manager key configured)") + } if len(validator.AllowedRegistries) > 0 { log.Printf("Allowed registries: %v", validator.AllowedRegistries) } diff --git a/shared/lib/oss-credentials.sh b/shared/lib/oss-credentials.sh index 5fd82084..8b29c29d 100644 --- a/shared/lib/oss-credentials.sh +++ b/shared/lib/oss-credentials.sh @@ -1,104 +1,73 @@ #!/bin/bash -# oss-credentials.sh - Shared STS credential management for mc (MinIO Client) +# oss-credentials.sh - STS credential management for mc (MinIO Client) # -# In cloud SAE mode, mc requires STS temporary credentials via MC_HOST_hiclaw. +# Workers obtain STS temporary credentials from the orchestrator service. +# The orchestrator holds OIDC credentials and issues per-worker scoped tokens. # STS tokens expire after 1 hour. This library provides lazy-refresh: credentials # are cached in a file and refreshed only when they are about to expire. # +# Required env vars (set by orchestrator at worker creation): +# HICLAW_ORCHESTRATOR_URL - orchestrator HTTP endpoint (e.g. 
http://hiclaw-orchestrator:2375) +# HICLAW_WORKER_API_KEY - per-worker API key for authentication +# # Usage: # source /opt/hiclaw/scripts/lib/oss-credentials.sh # ensure_mc_credentials # call before any mc command # mc mirror ... # -# In local mode (no OIDC env vars), ensure_mc_credentials is a no-op. +# In local mode (no HICLAW_ORCHESTRATOR_URL), ensure_mc_credentials is a no-op. _OSS_CRED_FILE="/tmp/mc-oss-credentials.env" _OSS_CRED_REFRESH_MARGIN=600 # refresh if less than 10 minutes remaining -# Internal: build an inline STS policy that restricts OSS access to the -# worker's own prefix (agents//*) and the shared prefix (shared/*). -# Called only when HICLAW_WORKER_NAME is set (i.e. worker context). -# Manager does not set HICLAW_WORKER_NAME, so it gets unrestricted access. -_oss_build_worker_policy() { - local worker="$1" - local bucket="${HICLAW_OSS_BUCKET:-hiclaw-cloud-storage}" - - cat <&2 - fi +# Internal: call orchestrator STS endpoint and write credentials to file +_oss_refresh_sts_via_orchestrator() { + local resp http_code + local sts_ak sts_sk sts_token oss_endpoint oss_bucket region - sts_resp=$(curl -s -w "\n%{http_code}" -X POST "https://sts-vpc.${region}.aliyuncs.com" \ - -d "Action=AssumeRoleWithOIDC" \ - -d "Format=JSON" \ - -d "Version=2015-04-01" \ - --data-urlencode "Timestamp=${timestamp}" \ - -d "SignatureNonce=${nonce}" \ - --data-urlencode "RoleArn=${ALIBABA_CLOUD_ROLE_ARN}" \ - --data-urlencode "OIDCProviderArn=${ALIBABA_CLOUD_OIDC_PROVIDER_ARN}" \ - --data-urlencode "OIDCToken=${oidc_token}" \ - -d "RoleSessionName=hiclaw-oss-session" \ - -d "DurationSeconds=3600" \ - "${policy_args[@]}" \ + resp=$(curl -s -w "\n%{http_code}" -X POST "${HICLAW_ORCHESTRATOR_URL}/credentials/sts" \ + -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" \ --connect-timeout 10 --max-time 30 2>&1) - http_code=$(echo "${sts_resp}" | tail -1) - sts_resp=$(echo "${sts_resp}" | sed '$d') + http_code=$(echo "${resp}" | tail -1) + resp=$(echo "${resp}" | sed '$d') 
if [ "${http_code}" != "200" ]; then - echo "[oss-credentials] ERROR: STS request failed (HTTP ${http_code})" >&2 - echo "[oss-credentials] Response: ${sts_resp}" >&2 + echo "[oss-credentials] ERROR: orchestrator STS request failed (HTTP ${http_code})" >&2 + echo "[oss-credentials] Response: ${resp}" >&2 return 1 fi - sts_ak=$(echo "${sts_resp}" | jq -r '.Credentials.AccessKeyId') - sts_sk=$(echo "${sts_resp}" | jq -r '.Credentials.AccessKeySecret') - sts_token=$(echo "${sts_resp}" | jq -r '.Credentials.SecurityToken') + sts_ak=$(echo "${resp}" | jq -r '.access_key_id') + sts_sk=$(echo "${resp}" | jq -r '.access_key_secret') + sts_token=$(echo "${resp}" | jq -r '.security_token') + oss_endpoint=$(echo "${resp}" | jq -r '.oss_endpoint') + oss_bucket=$(echo "${resp}" | jq -r '.oss_bucket') if [ -z "${sts_ak}" ] || [ "${sts_ak}" = "null" ]; then - echo "[oss-credentials] ERROR: Failed to parse STS credentials" >&2 - echo "[oss-credentials] Response: ${sts_resp}" >&2 + echo "[oss-credentials] ERROR: Failed to parse STS credentials from orchestrator" >&2 + echo "[oss-credentials] Response: ${resp}" >&2 return 1 fi # expires_at = now + 3600 seconds (STS token lifetime) + local expires_at expires_at=$(( $(date +%s) + 3600 )) cat > "${_OSS_CRED_FILE}" </dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 + echo "[oss-credentials] STS credentials refreshed via orchestrator (AK prefix: ${sts_ak:0:8}..., expires: $(date -d @${expires_at} '+%H:%M:%S' 2>/dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 } # Public: ensure MC_HOST_hiclaw is set with valid (non-expired) STS credentials. -# In local mode (no OIDC env vars), this is a no-op. +# In local mode (no HICLAW_ORCHESTRATOR_URL), this is a no-op. ensure_mc_credentials() { # Skip in local mode — mc alias is configured with static credentials - if [ -z "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-}" ] || [ ! 
-f "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-/nonexistent}" ]; then + if [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ]; then return 0 fi @@ -116,7 +85,7 @@ ensure_mc_credentials() { fi if [ "${needs_refresh}" = true ]; then - _oss_refresh_sts || return 1 + _oss_refresh_sts_via_orchestrator || return 1 . "${_OSS_CRED_FILE}" fi From bff6fc5dcf0a5eca23032d074e36e56fddbe118f Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 16:01:26 +0800 Subject: [PATCH 04/11] refactor(manager): simplify shell scripts to use orchestrator API Phase 3: replace direct Docker API calls and Python/Shell SAE wrappers with thin orchestrator REST API client. - Rewrite container-api.sh: worker_backend_* now call orchestrator /workers/* API - Simplify gateway-api.sh: cloud path calls orchestrator /gateway/* API - Simplify create-worker.sh Step 9: unified orchestrator call, no Docker/SAE split - Delete aliyun-sae.sh and aliyun-api.py (replaced by orchestrator Go backends) - Remove Python SDK dependencies from Dockerfile.aliyun Net deletion: ~1100 lines of shell/Python code. 
Co-Authored-By: Claude Opus 4.6 --- manager/Dockerfile.aliyun | 14 - .../scripts/create-worker.sh | 141 ++-- .../scripts/enable-worker-console.sh | 30 +- .../scripts/lifecycle-worker.sh | 41 +- manager/scripts/init/start-manager-agent.sh | 8 +- manager/scripts/lib/cloud/aliyun-api.py | 527 --------------- manager/scripts/lib/cloud/aliyun-sae.sh | 81 --- manager/scripts/lib/container-api.sh | 613 +++--------------- manager/scripts/lib/gateway-api.sh | 44 +- orchestrator/backend/apig.go | 21 +- 10 files changed, 213 insertions(+), 1307 deletions(-) delete mode 100644 manager/scripts/lib/cloud/aliyun-api.py delete mode 100644 manager/scripts/lib/cloud/aliyun-sae.sh diff --git a/manager/Dockerfile.aliyun b/manager/Dockerfile.aliyun index c199a9c1..5e86729d 100644 --- a/manager/Dockerfile.aliyun +++ b/manager/Dockerfile.aliyun @@ -23,19 +23,6 @@ FROM ${OPENCLAW_BASE_IMAGE} # mc (MinIO Client) — real binary; wrapper installed after shared libs are copied COPY --from=mc /usr/bin/mc /usr/local/bin/mc.bin -# Install Python packages for cloud Worker management (SAE API via OIDC/AK) -# python3 is already in openclaw-base; install pip and SDK -ARG PIP_INDEX_URL=https://mirrors.aliyun.com/pypi/simple/ -RUN apt-get update && apt-get install -y --no-install-recommends python3-pip && \ - rm -rf /var/lib/apt/lists/* && \ - pip3 install --no-cache-dir \ - --index-url "${PIP_INDEX_URL}" \ - --trusted-host "$(echo ${PIP_INDEX_URL} | sed 's|https\?://||;s|/.*||')" \ - alibabacloud-sae20190506 \ - alibabacloud-apig20240327 \ - alibabacloud-credentials \ - alibabacloud-tea-openapi - # ---- Built-in observability plugin (bundled unconditionally, enabled at runtime) ---- # Placed before agent/configs COPY so that code changes do not invalidate this layer. 
ARG OPENCLAW_CMS_PLUGIN_URL=https://arms-apm-cn-hangzhou-pre.oss-cn-hangzhou.aliyuncs.com/openclaw-cms-plugin/0.1.1/openclaw-cms-plugin.tar.gz @@ -61,7 +48,6 @@ COPY manager/agent/ /opt/hiclaw/agent/ COPY manager/configs/ /opt/hiclaw/configs/ # ---- Copy scripts: shared libs first, then manager scripts ---- -# manager/scripts/ includes lib/cloud/aliyun-api.py and lib/cloud/aliyun-sae.sh COPY shared/lib/ /opt/hiclaw/scripts/lib/ COPY manager/scripts/ /opt/hiclaw/scripts/ diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index ea368bd6..783ce4f5 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -751,122 +751,105 @@ _build_install_cmd() { echo "${cmd}" } -# Build extra environment variables JSON for container creation -_build_extra_env() { - local items=() - if [ -n "${SKILLS_API_URL}" ]; then - items+=("SKILLS_API_URL=${SKILLS_API_URL}") - fi - if [ -n "${CONSOLE_PORT}" ]; then - items+=("HICLAW_CONSOLE_PORT=${CONSOLE_PORT}") - fi - if [ ${#items[@]} -eq 0 ]; then - echo "[]" - else - printf '%s\n' "${items[@]}" | jq -R . | jq -s . - fi -} - if [ "${REMOTE_MODE}" = true ]; then log "Step 9: Remote mode requested" INSTALL_CMD=$(_build_install_cmd) -elif [ "${HICLAW_RUNTIME}" = "aliyun" ]; then - log "Step 9: Creating Worker via cloud backend (SAE, runtime=${WORKER_RUNTIME})..." - - # Select SAE image based on worker runtime - SAE_IMAGE="" - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - SAE_IMAGE="${HICLAW_SAE_COPAW_WORKER_IMAGE:-}" - if [ -z "${SAE_IMAGE}" ]; then - _fail "HICLAW_SAE_COPAW_WORKER_IMAGE not set (required for copaw runtime on cloud)" - fi - fi +elif container_api_available; then + log "Step 9: Creating Worker via orchestrator (runtime=${WORKER_RUNTIME})..." 
- # Build complete SAE environment variables (Worker needs these to connect) - SAE_ENVS=$(jq -cn \ + # Build environment variables for the worker + WORKER_ENV=$(jq -cn \ --arg worker_name "${WORKER_NAME}" \ --arg worker_key "${WORKER_KEY}" \ --arg matrix_url "${HICLAW_MATRIX_URL:-}" \ --arg matrix_domain "${MATRIX_DOMAIN}" \ --arg matrix_token "${WORKER_MATRIX_TOKEN}" \ --arg ai_gw_url "${HICLAW_AI_GATEWAY_URL:-}" \ - --arg oss_bucket "${HICLAW_OSS_BUCKET:-hiclaw-cloud-storage}" \ + --arg oss_bucket "${HICLAW_OSS_BUCKET:-}" \ --arg region "${HICLAW_REGION:-cn-hangzhou}" \ --arg runtime "${WORKER_RUNTIME}" \ --arg console_port "${CONSOLE_PORT:-}" \ + --arg skills_api_url "${SKILLS_API_URL:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg fs_access_key "${WORKER_NAME}" \ + --arg fs_secret_key "${WORKER_MINIO_PASSWORD}" \ '{ + "HICLAW_WORKER_NAME": $worker_name, "HICLAW_WORKER_GATEWAY_KEY": $worker_key, "HICLAW_MATRIX_URL": $matrix_url, "HICLAW_MATRIX_DOMAIN": $matrix_domain, "HICLAW_WORKER_MATRIX_TOKEN": $matrix_token, "HICLAW_AI_GATEWAY_URL": $ai_gw_url, - "HICLAW_OSS_BUCKET": $oss_bucket, - "HICLAW_REGION": $region + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fs_access_key, + "HICLAW_FS_SECRET_KEY": $fs_secret_key } + | if $oss_bucket != "" then . + { "HICLAW_OSS_BUCKET": $oss_bucket, "HICLAW_REGION": $region } else . end + | if $skills_api_url != "" then . + { "SKILLS_API_URL": $skills_api_url } else . end + | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end | if $runtime == "copaw" then . + { "HICLAW_RUNTIME": "aliyun" } - | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end else . 
+ { "OPENCLAW_DISABLE_BONJOUR": "1", - "OPENCLAW_MDNS_HOSTNAME": ("hiclaw-w-" + $worker_name) + "OPENCLAW_MDNS_HOSTNAME": ("hiclaw-w-" + $worker_name), + "HOME": ("/root/hiclaw-fs/agents/" + $worker_name) } end') - log " SAE_ENVS: ${SAE_ENVS:0:200}..." - - CREATE_OUTPUT=$(sae_create_worker "${WORKER_NAME}" "${SAE_ENVS}" "${SAE_IMAGE}" 2>/dev/null) || true - log " SAE create response: ${CREATE_OUTPUT:0:300}" - SAE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) - if [ "${SAE_STATUS}" = "created" ] || [ "${SAE_STATUS}" = "exists" ]; then - DEPLOY_MODE="cloud" - WORKER_STATUS="starting" - log " SAE application ready for ${WORKER_NAME}" - else - log " WARNING: SAE application creation returned: ${CREATE_OUTPUT}" - WORKER_STATUS="error" - fi -elif container_api_available; then - log "Step 9: Starting Worker container locally (runtime=${WORKER_RUNTIME})..." - EXTRA_ENV_JSON=$(_build_extra_env) - - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - CREATE_OUTPUT=$(container_create_copaw_worker "${WORKER_NAME}" "${WORKER_NAME}" "${WORKER_MINIO_PASSWORD}" "${EXTRA_ENV_JSON}" "${CUSTOM_IMAGE}" 2>&1) || true - else - CREATE_OUTPUT=$(container_create_worker "${WORKER_NAME}" "${WORKER_NAME}" "${WORKER_MINIO_PASSWORD}" "${EXTRA_ENV_JSON}" "${CUSTOM_IMAGE}" 2>&1) || true - fi - - CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | tail -1) - CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | grep -o 'CONSOLE_HOST_PORT=[0-9]*' | head -1 | cut -d= -f2) - if [ -n "${CONTAINER_ID}" ] && [ ${#CONTAINER_ID} -ge 12 ]; then - DEPLOY_MODE="local" - if [ -n "${CONSOLE_HOST_PORT}" ]; then - log " Console available at host port ${CONSOLE_HOST_PORT}" + # Build create request body + CREATE_BODY=$(jq -cn \ + --arg name "${WORKER_NAME}" \ + --arg image "${CUSTOM_IMAGE:-}" \ + --arg runtime "${WORKER_RUNTIME}" \ + --argjson env "${WORKER_ENV}" \ + '{name: $name, runtime: $runtime, env: $env} + | if $image != "" then . + {image: $image} else . 
end') + + CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true + log " Create response: ${CREATE_OUTPUT:0:300}" + + CREATE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) + CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) + + if [ "${CREATE_STATUS}" = "running" ] || [ "${CREATE_STATUS}" = "starting" ]; then + DEPLOY_MODE=$(echo "${CREATE_OUTPUT}" | jq -r '.backend // "local"' 2>/dev/null) + if [ "${DEPLOY_MODE}" = "docker" ]; then + DEPLOY_MODE="local" + elif [ "${DEPLOY_MODE}" = "sae" ]; then + DEPLOY_MODE="cloud" fi - log " Waiting for Worker agent to be ready..." - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " CoPaw Worker agent is ready!" + + # Wait for readiness (only for local Docker containers with exec access) + if [ "${DEPLOY_MODE}" = "local" ] && [ -n "${CONTAINER_ID}" ]; then + log " Waiting for Worker agent to be ready..." + if [ "${WORKER_RUNTIME}" = "copaw" ]; then + if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then + WORKER_STATUS="ready" + log " CoPaw Worker agent is ready!" + else + WORKER_STATUS="starting" + log " WARNING: CoPaw Worker agent not ready within timeout" + fi else - WORKER_STATUS="starting" - log " WARNING: CoPaw Worker agent not ready within timeout (container may still be initializing)" + if container_wait_worker_ready "${WORKER_NAME}" 120; then + WORKER_STATUS="ready" + log " Worker agent is ready!" + else + WORKER_STATUS="starting" + log " WARNING: Worker agent not ready within timeout" + fi fi else - if container_wait_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " Worker agent is ready!" 
- else - WORKER_STATUS="starting" - log " WARNING: Worker agent not ready within timeout (container may still be initializing)" - fi + WORKER_STATUS="starting" + log " Worker created on ${DEPLOY_MODE} backend" fi else - log " WARNING: Container creation failed, falling back to remote mode" + log " WARNING: Worker creation failed, falling back to remote mode" INSTALL_CMD=$(_build_install_cmd) fi else - log "Step 9: No container runtime socket available" + log "Step 9: No orchestrator available" INSTALL_CMD=$(_build_install_cmd) fi diff --git a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh index 1ec68dea..23b92258 100755 --- a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh +++ b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh @@ -99,20 +99,30 @@ else log "Disabling console" fi -# --- Recreate container --- -log "Stopping container ${CONTAINER_NAME}..." -_api POST "/containers/${CONTAINER_NAME}/stop?t=10" > /dev/null 2>&1 || true -sleep 1 -_api DELETE "/containers/${CONTAINER_NAME}?force=true" > /dev/null 2>&1 +# --- Recreate container via orchestrator --- +log "Deleting worker ${WORKER_NAME}..." +worker_backend_delete "${WORKER_NAME}" > /dev/null 2>&1 || true sleep 1 -log "Recreating container..." -CREATE_OUTPUT=$(container_create_copaw_worker "${WORKER_NAME}" "${FS_ACCESS_KEY}" "${FS_SECRET_KEY}" "${EXTRA_ENV}" 2>&1) || true +log "Recreating worker..." +# Build env map from the extra env array +ENV_MAP=$(echo "${EXTRA_ENV}" | jq '[.[] | split("=") | {(.[0]): (.[1:] | join("="))}] | add // {}') +ENV_MAP=$(echo "${ENV_MAP}" | jq \ + --arg name "${WORKER_NAME}" \ + --arg fak "${FS_ACCESS_KEY}" \ + --arg fsk "${FS_SECRET_KEY}" \ + '. 
+ {"HICLAW_WORKER_NAME": $name, "HICLAW_FS_ACCESS_KEY": $fak, "HICLAW_FS_SECRET_KEY": $fsk}') + +CREATE_BODY=$(jq -cn \ + --arg name "${WORKER_NAME}" \ + --arg image "${CONTAINER_IMAGE}" \ + --argjson env "${ENV_MAP}" \ + '{name: $name, image: $image, runtime: "copaw", env: $env}') -CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | tail -1) -CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | grep -o 'CONSOLE_HOST_PORT=[0-9]*' | head -1 | cut -d= -f2) +CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true +CREATE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) -if [ -z "${CONTAINER_ID}" ] || [ ${#CONTAINER_ID} -lt 12 ]; then +if [ "${CREATE_STATUS}" != "running" ] && [ "${CREATE_STATUS}" != "starting" ]; then log "ERROR: Failed to recreate container" echo "${CREATE_OUTPUT}" >&2 jq -n '{"error": "recreate_failed"}' diff --git a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh index 5fb6a41d..d54410c6 100755 --- a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh +++ b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh @@ -163,10 +163,7 @@ _worker_has_cron_jobs() { action_sync_status() { _init_lifecycle_file - local backend - backend=$(_detect_worker_backend) - - if [ "$backend" = "none" ]; then + if ! 
container_api_available 2>/dev/null; then _log "No worker backend available — marking all workers as remote" local workers workers=$(_get_all_workers) @@ -188,7 +185,7 @@ action_sync_status() { _ensure_worker_entry "$worker" local status status=$(worker_backend_status "$worker") - _log "Worker $worker: status=$status (backend=$backend)" + _log "Worker $worker: status=$status" local tmp tmp=$(mktemp) jq --arg w "$worker" --arg s "$status" --arg ts "$(_ts)" \ @@ -305,14 +302,12 @@ action_stop() { _init_lifecycle_file _ensure_worker_entry "$worker" - local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi - _log "Stopping worker $worker (backend=$backend)" + _log "Stopping worker $worker" if worker_backend_stop "$worker"; then local tmp tmp=$(mktemp) @@ -379,8 +374,7 @@ action_start() { fi local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! 
container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi @@ -390,24 +384,23 @@ action_start() { local ok=false if [ "$status" = "not_found" ]; then - _log "Worker $worker not found — recreating (backend=$backend)" + _log "Worker $worker not found — recreating" local creds_file="/data/worker-creds/${worker}.env" if [ -f "$creds_file" ]; then source "$creds_file" fi local runtime runtime=$(jq -r --arg w "$worker" '.workers[$w].runtime // "openclaw"' "$REGISTRY_FILE" 2>/dev/null) - if [ "$backend" = "docker" ]; then - if [ "$runtime" = "copaw" ]; then - container_create_copaw_worker "$worker" "$worker" "${WORKER_MINIO_PASSWORD:-}" 2>&1 && ok=true - else - container_create_worker "$worker" "$worker" "${WORKER_MINIO_PASSWORD:-}" 2>&1 && ok=true - fi - else - worker_backend_create "$worker" "" "" "[]" 2>&1 && ok=true - fi + + # Build create request for orchestrator + local create_body + create_body=$(jq -cn \ + --arg name "$worker" \ + --arg runtime "$runtime" \ + '{name: $name, runtime: $runtime}') + worker_backend_create "$create_body" > /dev/null 2>&1 && ok=true else - _log "Starting worker $worker (status: $status, backend=$backend)" + _log "Starting worker $worker (status: $status)" worker_backend_start "$worker" && ok=true fi @@ -451,8 +444,8 @@ action_ensure_ready() { fi local status - status=$(container_status_worker "$worker") - _log "Worker $worker container_status=$status" + status=$(worker_backend_status "$worker") + _log "Worker $worker status=$status" if [ "$status" = "running" ]; then echo "{\"worker\":\"$worker\",\"status\":\"ready\",\"container_status\":\"running\"}" diff --git a/manager/scripts/init/start-manager-agent.sh b/manager/scripts/init/start-manager-agent.sh index 88a22357..1c1943b4 100755 --- a/manager/scripts/init/start-manager-agent.sh +++ b/manager/scripts/init/start-manager-agent.sh @@ -794,11 +794,9 @@ if container_api_available; then _runtime=$(jq -r --arg w "${_worker_name}" '.workers[$w].runtime 
// "openclaw"' "${REGISTRY_FILE}" 2>/dev/null) _recreated=false for _attempt in 1 2 3; do - if [ "${_runtime}" = "copaw" ]; then - container_create_copaw_worker "${_worker_name}" "${_worker_name}" "${WORKER_MINIO_PASSWORD}" 2>&1 && _recreated=true && break - else - container_create_worker "${_worker_name}" "${_worker_name}" "${WORKER_MINIO_PASSWORD}" 2>&1 && _recreated=true && break - fi + local _create_body + _create_body=$(jq -cn --arg name "${_worker_name}" --arg runtime "${_runtime}" '{name: $name, runtime: $runtime}') + worker_backend_create "${_create_body}" > /dev/null 2>&1 && _recreated=true && break log " Attempt ${_attempt}/3 failed for ${_worker_name}, retrying in $((5 * _attempt))s..." sleep $((5 * _attempt)) done diff --git a/manager/scripts/lib/cloud/aliyun-api.py b/manager/scripts/lib/cloud/aliyun-api.py deleted file mode 100644 index 945411ff..00000000 --- a/manager/scripts/lib/cloud/aliyun-api.py +++ /dev/null @@ -1,527 +0,0 @@ -#!/usr/bin/env python3 -""" -aliyun-api.py — Alibaba Cloud Worker management for HiClaw Manager. - -Provides SAE application CRUD and AI Gateway consumer management, -callable from shell scripts (create-worker.sh, lifecycle-worker.sh). - -Authentication priority: - 1. OIDC (ALIBABA_CLOUD_OIDC_TOKEN_FILE present) — SAE RRSA - 2. AK/SK (ALIBABA_CLOUD_ACCESS_KEY_ID present) — local/debug - 3. Fail - -Usage: - aliyun-api.py sae-create --name [--image ] [--envs '{"K":"V"}'] - aliyun-api.py sae-delete --name - aliyun-api.py sae-stop --name - aliyun-api.py sae-start --name - aliyun-api.py sae-status --name - aliyun-api.py sae-list - aliyun-api.py gw-create-consumer --name - aliyun-api.py gw-bind-consumer --consumer-id --api-id --env-id - -Output: JSON to stdout. Logs to stderr. 
-""" - -import argparse -import json -import os -import sys - -# --------------------------------------------------------------------------- -# Logging -# --------------------------------------------------------------------------- - -def log(msg): - print(f"[aliyun-api] {msg}", file=sys.stderr) - -# --------------------------------------------------------------------------- -# Credential helpers -# --------------------------------------------------------------------------- - -def _build_credential(): - """Build alibabacloud Credential based on environment.""" - from alibabacloud_credentials.client import Client as CredClient - from alibabacloud_credentials.models import Config as CredConfig - - oidc_token_file = os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE", "") - ak = os.environ.get("ALIBABA_CLOUD_ACCESS_KEY_ID", "") - - if oidc_token_file and os.path.isfile(oidc_token_file): - log("Using OIDC RRSA credentials") - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - conf = CredConfig( - type="oidc_role_arn", - role_arn=os.environ["ALIBABA_CLOUD_ROLE_ARN"], - oidc_provider_arn=os.environ["ALIBABA_CLOUD_OIDC_PROVIDER_ARN"], - oidc_token_file_path=oidc_token_file, - role_session_name="hiclaw-manager-role", - sts_endpoint=f"sts-vpc.{region}.aliyuncs.com", - ) - return CredClient(conf) - - if ak: - log("Using AK/SK credentials") - conf = CredConfig( - type="access_key", - access_key_id=ak, - access_key_secret=os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"], - ) - return CredClient(conf) - - raise RuntimeError("No credentials found. 
Set ALIBABA_CLOUD_OIDC_TOKEN_FILE or ALIBABA_CLOUD_ACCESS_KEY_ID.") - - -def _get_sae_client(): - """Build SAE client with auto-detected credentials.""" - from alibabacloud_sae20190506.client import Client as SaeClient - from alibabacloud_tea_openapi.models import Config as ApiConfig - - cred = _build_credential() - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - - config = ApiConfig( - credential=cred, - region_id=region, - endpoint=f"sae.{region}.aliyuncs.com", - ) - return SaeClient(config) - - -def _get_apig_client(): - """Build AI Gateway (APIG) client with auto-detected credentials.""" - from alibabacloud_apig20240327.client import Client as ApigClient - from alibabacloud_tea_openapi.models import Config as ApiConfig - - cred = _build_credential() - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - - config = ApiConfig( - credential=cred, - region_id=region, - endpoint=f"apig.{region}.aliyuncs.com", - ) - return ApigClient(config) - - -# --------------------------------------------------------------------------- -# Helper: find SAE app by name -# --------------------------------------------------------------------------- - -def _find_worker_app(sae, worker_name): - """Find a SAE application by worker name. 
Returns (app_id, app_name) or (None, None).""" - from alibabacloud_sae20190506 import models as sae_models - - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - app_name = f"hiclaw-worker-{worker_name}" - - req = sae_models.ListApplicationsRequest( - namespace_id=namespace_id, - app_name=app_name, - ) - resp = sae.list_applications(req) - if resp.body and resp.body.data and resp.body.data.applications: - for app in resp.body.data.applications: - if app.app_name == app_name: - return app.app_id, app.app_name - return None, None - - -# --------------------------------------------------------------------------- -# SAE operations -# --------------------------------------------------------------------------- - -def sae_create(args): - """Create a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_name = f"hiclaw-worker-{args.name}" - - # Check if already exists - existing_id, _ = _find_worker_app(sae, args.name) - if existing_id: - log(f"Application already exists: {app_name} ({existing_id})") - print(json.dumps({"app_id": existing_id, "app_name": app_name, "status": "exists"})) - return - - # Parse extra envs (supports @/path/to/file or inline JSON) - envs = {} - if args.envs: - raw = args.envs - if raw.startswith("@"): - with open(raw[1:], "r") as f: - raw = f.read() - envs = json.loads(raw) - - # Read config from environment - region = os.environ.get("HICLAW_REGION", "cn-hangzhou") - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - image = args.image or os.environ.get("HICLAW_SAE_WORKER_IMAGE", "") - vpc_id = os.environ.get("HICLAW_SAE_VPC_ID", "") - vswitch_id = os.environ.get("HICLAW_SAE_VSWITCH_ID", "") - sg_id = os.environ.get("HICLAW_SAE_SECURITY_GROUP_ID", "") - oidc_role_name = os.environ.get("HICLAW_SAE_WORKER_OIDC_ROLE_NAME", "hiclaw-worker-role") - cpu = int(os.environ.get("HICLAW_SAE_WORKER_CPU", "1000")) - memory = 
int(os.environ.get("HICLAW_SAE_WORKER_MEMORY", "2048")) - - if not image: - print(json.dumps({"error": "No worker image. Set HICLAW_SAE_WORKER_IMAGE or --image."})) - sys.exit(1) - - # Base envs for worker (runtime-specific envs are passed via --envs by caller) - base_envs = { - "HICLAW_WORKER_NAME": args.name, - "HICLAW_REGION": region, - "TZ": "Asia/Shanghai", - } - base_envs.update(envs) - - # Build SAE envs JSON array format - env_list = [{"name": k, "value": v} for k, v in base_envs.items()] - - req = sae_models.CreateApplicationRequest( - app_name=app_name, - namespace_id=namespace_id, - package_type="Image", - image_url=image, - cpu=cpu, - memory=memory, - replicas=1, - vpc_id=vpc_id, - v_switch_id=vswitch_id, - security_group_id=sg_id, - app_description=f"HiClaw Worker Agent: {args.name}", - envs=json.dumps(env_list), - oidc_role_name=oidc_role_name, - custom_image_network_type="internet", - ) - - resp = sae.create_application(req) - app_id = resp.body.data.app_id - log(f"Application created: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "created"})) - - -def sae_delete(args): - """Delete a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.DeleteApplicationRequest(app_id=app_id) - sae.delete_application(req) - log(f"Application deleted: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "deleted"})) - - -def sae_stop(args): - """Stop a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) 
- return - - req = sae_models.StopApplicationRequest(app_id=app_id) - sae.stop_application(req) - log(f"Application stopped: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "stopped"})) - - -def sae_start(args): - """Start a SAE application for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.StartApplicationRequest(app_id=app_id) - sae.start_application(req) - log(f"Application started: {app_name} ({app_id})") - print(json.dumps({"app_id": app_id, "app_name": app_name, "status": "running"})) - - -def sae_status(args): - """Check SAE application status for a Worker.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - app_id, app_name = _find_worker_app(sae, args.name) - - if not app_id: - print(json.dumps({"app_name": f"hiclaw-worker-{args.name}", "status": "not_found"})) - return - - req = sae_models.DescribeApplicationStatusRequest(app_id=app_id) - resp = sae.describe_application_status(req) - current_status = resp.body.data.current_status if resp.body.data else "unknown" - - # Normalize SAE status to simpler values - status_map = { - "RUNNING": "running", - "STOPPED": "stopped", - "UNKNOWN": "unknown", - "DEPLOYING": "starting", - } - normalized = status_map.get(current_status, current_status.lower() if current_status else "unknown") - - print(json.dumps({ - "app_id": app_id, - "app_name": app_name, - "status": normalized, - "sae_status": current_status, - })) - - -def sae_list(args): - """List all hiclaw-worker SAE applications.""" - from alibabacloud_sae20190506 import models as sae_models - - sae = _get_sae_client() - namespace_id = os.environ.get("HICLAW_SAE_NAMESPACE_ID", "") - - req = 
sae_models.ListApplicationsRequest(namespace_id=namespace_id) - resp = sae.list_applications(req) - - workers = [] - prefix = "hiclaw-worker-" - if resp.body and resp.body.data and resp.body.data.applications: - for app in resp.body.data.applications: - if app.app_name and app.app_name.startswith(prefix): - name = app.app_name[len(prefix):] - workers.append({ - "name": name, - "app_name": app.app_name, - "app_id": app.app_id, - }) - - print(json.dumps({"workers": workers})) - - -# --------------------------------------------------------------------------- -# AI Gateway consumer operations -# --------------------------------------------------------------------------- - -def _find_existing_consumer(apig, consumer_name, retries=1, retry_delay=0): - """Search for an existing consumer by name with optional retry (for API eventual consistency). - - Returns (consumer_id, api_key) or (None, None). - """ - import time - from alibabacloud_apig20240327 import models as apig_models - - for attempt in range(retries): - if attempt > 0: - log(f"Retry {attempt}/{retries - 1} after {retry_delay}s ...") - time.sleep(retry_delay) - - page = 1 - while True: - req = apig_models.ListConsumersRequest( - gateway_type="AI", - name_like=consumer_name, - page_number=page, - page_size=100, - ) - resp = apig.list_consumers(req) - if not resp.body.data or not resp.body.data.items: - break - for c in resp.body.data.items: - if c.name == consumer_name: - detail = apig.get_consumer(c.consumer_id) - d = detail.body.data - key = None - if d.api_key_identity_config and d.api_key_identity_config.credentials: - key = d.api_key_identity_config.credentials[0].apikey - return c.consumer_id, key - if len(resp.body.data.items) < 100: - break - page += 1 - - return None, None - - -def gw_create_consumer(args): - """Create an AI Gateway consumer for a Worker. - - Consumer name is prefixed with a short gateway ID to avoid account-level - name collisions across gateways (Consumer is an account-level resource). 
- The gateway ID is read from HICLAW_GW_GATEWAY_ID env var. - """ - from alibabacloud_apig20240327 import models as apig_models - - apig = _get_apig_client() - raw_name = args.name - - # Prefix consumer name with gateway ID to avoid cross-gateway collisions - gateway_id = os.environ.get("HICLAW_GW_GATEWAY_ID", "") - if gateway_id: - consumer_name = f"{gateway_id}-{raw_name}" - else: - log("WARNING: HICLAW_GW_GATEWAY_ID not set, using raw consumer name") - consumer_name = raw_name - - existing_id, existing_key = _find_existing_consumer(apig, consumer_name) - if existing_id: - log(f"Consumer already exists: {existing_id}") - print(json.dumps({"consumer_id": existing_id, "api_key": existing_key, "status": "exists"})) - return - - try: - req = apig_models.CreateConsumerRequest( - name=consumer_name, - gateway_type="AI", - enable=True, - description=f"HiClaw Worker: {raw_name}", - apikey_identity_config=apig_models.ApiKeyIdentityConfig( - type="Apikey", - apikey_source=apig_models.ApiKeyIdentityConfigApikeySource( - source="Default", - value="Authorization", - ), - credentials=[ - apig_models.ApiKeyIdentityConfigCredentials(generate_mode="System") - ], - ), - ) - resp = apig.create_consumer(req) - consumer_id = resp.body.data.consumer_id - except Exception as e: - if "ConsumerNameDuplicate" in str(e) or "409" in str(e): - log(f"Consumer creation returned 409, re-querying with retries...") - existing_id, existing_key = _find_existing_consumer(apig, consumer_name, retries=3, retry_delay=2) - if existing_id: - log(f"Consumer found after 409: {existing_id}") - print(json.dumps({"consumer_id": existing_id, "api_key": existing_key, "status": "exists"})) - return - raise RuntimeError(f"Consumer 409 but not found on re-query: {e}") from e - raise - - detail = apig.get_consumer(consumer_id) - key = None - if detail.body.data.api_key_identity_config and detail.body.data.api_key_identity_config.credentials: - key = detail.body.data.api_key_identity_config.credentials[0].apikey - - 
log(f"Consumer created: {consumer_id}, key={key}") - print(json.dumps({"consumer_id": consumer_id, "api_key": key, "status": "created"})) - - -def gw_bind_consumer(args): - """Bind a consumer to an HTTP API (LLM type).""" - from alibabacloud_apig20240327 import models as apig_models - - apig = _get_apig_client() - - try: - req = apig_models.QueryConsumerAuthorizationRulesRequest( - consumer_id=args.consumer_id, - resource_id=args.api_id, - environment_id=args.env_id, - resource_type="LLM", - page_number=1, - page_size=100, - ) - resp = apig.query_consumer_authorization_rules(req) - if resp.body.data and resp.body.data.items and len(resp.body.data.items) > 0: - rule_ids = [r.consumer_authorization_rule_id for r in resp.body.data.items] - log(f"Consumer already bound: {len(rule_ids)} rules") - print(json.dumps({"rule_ids": rule_ids, "status": "exists"})) - return - except Exception: - pass - - req = apig_models.CreateConsumerAuthorizationRulesRequest( - authorization_rules=[ - apig_models.CreateConsumerAuthorizationRulesRequestAuthorizationRules( - consumer_id=args.consumer_id, - resource_type="LLM", - expire_mode="LongTerm", - resource_identifier=apig_models.CreateConsumerAuthorizationRulesRequestAuthorizationRulesResourceIdentifier( - resource_id=args.api_id, - environment_id=args.env_id, - ), - ) - ], - ) - resp = apig.create_consumer_authorization_rules(req) - rule_ids = resp.body.data.consumer_authorization_rule_ids or [] - log(f"Consumer bound: {len(rule_ids)} rules") - print(json.dumps({"rule_ids": rule_ids, "status": "created"})) - - -# --------------------------------------------------------------------------- -# CLI entry point -# --------------------------------------------------------------------------- - -def main(): - parser = argparse.ArgumentParser(description="HiClaw Cloud Worker API") - sub = parser.add_subparsers(dest="command") - - # SAE commands - p = sub.add_parser("sae-create") - p.add_argument("--name", required=True) - 
p.add_argument("--image") - p.add_argument("--envs", default="{}") - - p = sub.add_parser("sae-delete") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-stop") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-start") - p.add_argument("--name", required=True) - - p = sub.add_parser("sae-status") - p.add_argument("--name", required=True) - - sub.add_parser("sae-list") - - # Gateway commands - p = sub.add_parser("gw-create-consumer") - p.add_argument("--name", required=True) - - p = sub.add_parser("gw-bind-consumer") - p.add_argument("--consumer-id", required=True) - p.add_argument("--api-id", required=True) - p.add_argument("--env-id", required=True) - - args = parser.parse_args() - - commands = { - "sae-create": sae_create, - "sae-delete": sae_delete, - "sae-stop": sae_stop, - "sae-start": sae_start, - "sae-status": sae_status, - "sae-list": sae_list, - "gw-create-consumer": gw_create_consumer, - "gw-bind-consumer": gw_bind_consumer, - } - - if args.command not in commands: - parser.print_help() - sys.exit(1) - - try: - commands[args.command](args) - except Exception as e: - print(json.dumps({"error": str(e)})) - sys.exit(1) - - -if __name__ == "__main__": - main() diff --git a/manager/scripts/lib/cloud/aliyun-sae.sh b/manager/scripts/lib/cloud/aliyun-sae.sh deleted file mode 100644 index 3c3f2d23..00000000 --- a/manager/scripts/lib/cloud/aliyun-sae.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/bin/bash -# aliyun-sae.sh - Alibaba Cloud SAE provider for HiClaw worker management -# -# Sourced by container-api.sh when the file exists. -# All SAE operations are delegated to aliyun-api.py. 
-# -# Prerequisites: -# - HICLAW_SAE_WORKER_IMAGE env var set (signals cloud SAE mode) -# - /opt/hiclaw/scripts/lib/cloud/aliyun-api.py available -# - RRSA OIDC configured on the SAE application - -CLOUD_WORKER_API="/opt/hiclaw/scripts/lib/cloud/aliyun-api.py" - -cloud_sae_available() { - [ -n "${HICLAW_SAE_WORKER_IMAGE:-}" ] && [ -f "${CLOUD_WORKER_API}" ] -} - -# ── SAE Worker lifecycle ────────────────────────────────────────────────────── - -sae_create_worker() { - local worker_name="$1" - local extra_envs_json="$2" - local image_override="${3:-}" - extra_envs_json="${extra_envs_json:-"{}"}" - _log "Creating SAE application for worker: ${worker_name}" - local envs_file - envs_file=$(mktemp /tmp/sae-envs-XXXXXX.json) - printf '%s' "${extra_envs_json}" > "${envs_file}" - local image_arg="" - if [ -n "${image_override}" ]; then - image_arg="--image ${image_override}" - fi - python3 "${CLOUD_WORKER_API}" sae-create --name "${worker_name}" --envs "@${envs_file}" ${image_arg} - local rc=$? 
- rm -f "${envs_file}" - return ${rc} -} - -sae_delete_worker() { - local worker_name="$1" - _log "Deleting SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-delete --name "${worker_name}" -} - -sae_stop_worker() { - local worker_name="$1" - _log "Stopping SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-stop --name "${worker_name}" -} - -sae_start_worker() { - local worker_name="$1" - _log "Starting SAE application for worker: ${worker_name}" - python3 "${CLOUD_WORKER_API}" sae-start --name "${worker_name}" -} - -sae_status_worker() { - local worker_name="$1" - local result - result=$(python3 "${CLOUD_WORKER_API}" sae-status --name "${worker_name}" 2>/dev/null) - echo "${result}" | jq -r '.status // "unknown"' 2>/dev/null -} - -sae_list_workers() { - python3 "${CLOUD_WORKER_API}" sae-list -} - -# ── AI Gateway consumer operations ──────────────────────────────────────────── - -cloud_create_consumer() { - local consumer_name="$1" - python3 "${CLOUD_WORKER_API}" gw-create-consumer --name "${consumer_name}" -} - -cloud_bind_consumer() { - local consumer_id="$1" - local api_id="$2" - local env_id="$3" - python3 "${CLOUD_WORKER_API}" gw-bind-consumer \ - --consumer-id "${consumer_id}" --api-id "${api_id}" --env-id "${env_id}" -} diff --git a/manager/scripts/lib/container-api.sh b/manager/scripts/lib/container-api.sh index 53296d91..309668ba 100755 --- a/manager/scripts/lib/container-api.sh +++ b/manager/scripts/lib/container-api.sh @@ -1,293 +1,121 @@ #!/bin/bash -# container-api.sh - Container runtime API helper -# Provides functions to create/manage sibling containers via the host's -# container runtime socket (Docker or Podman compatible). +# container-api.sh - Worker lifecycle API client # -# Supports two modes: -# 1. HTTP proxy mode: set HICLAW_CONTAINER_API=http://hiclaw-orchestrator:2375 -# 2. 
Unix socket mode (legacy): mount docker.sock into the container +# Thin client for the hiclaw-orchestrator REST API. +# All worker CRUD operations go through the orchestrator's unified API. +# Docker exec/logs operations still use Docker API passthrough. +# +# Required: +# HICLAW_CONTAINER_API - orchestrator URL (e.g. http://hiclaw-orchestrator:2375) # # Usage: # source /opt/hiclaw/scripts/lib/container-api.sh -# container_api_available # returns 0 if socket is mounted -# container_create_worker "alice" # create and start a worker container -# container_stop_worker "alice" # stop a worker container -# container_remove_worker "alice" # remove a worker container -# container_logs_worker "alice" # get worker container logs - -CONTAINER_SOCKET="${HICLAW_CONTAINER_SOCKET:-/var/run/docker.sock}" -CONTAINER_API_BASE="${HICLAW_CONTAINER_API:-}" -if [ -z "${CONTAINER_API_BASE}" ]; then - CONTAINER_API_BASE="http://localhost" -fi -WORKER_IMAGE="${HICLAW_WORKER_IMAGE:-hiclaw/worker-agent:latest}" -COPAW_WORKER_IMAGE="${HICLAW_COPAW_WORKER_IMAGE:-hiclaw/copaw-worker:latest}" +# worker_backend_create '{"name":"alice","image":"hiclaw/worker-agent:latest"}' +# worker_backend_status "alice" +# worker_backend_delete "alice" + +CONTAINER_API_BASE="${HICLAW_CONTAINER_API:-http://localhost:2375}" WORKER_CONTAINER_PREFIX="hiclaw-worker-" _log() { echo "[hiclaw-container $(date '+%Y-%m-%d %H:%M:%S')] $1" } -_api() { - local method="$1" - local path="$2" - local data="${3:-}" - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode - if [ -n "${data}" ]; then - curl -s -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - else - # Unix socket mode (legacy) - if [ -n "${data}" ]; then - curl -s --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s 
--unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - fi -} - -_api_code() { - local method="$1" - local path="$2" - local data="${3:-}" - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode - if [ -n "${data}" ]; then - curl -s -o /dev/null -w '%{http_code}' -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -o /dev/null -w '%{http_code}' -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - else - # Unix socket mode (legacy) - if [ -n "${data}" ]; then - curl -s -o /dev/null -w '%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - -H 'Content-Type: application/json' \ - -d "${data}" \ - "${CONTAINER_API_BASE}${path}" - else - curl -s -o /dev/null -w '%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X "${method}" \ - "${CONTAINER_API_BASE}${path}" - fi - fi -} - -# Check if container runtime API is available -# Supports both HTTP proxy mode (HICLAW_CONTAINER_API) and unix socket mode. -# This function is designed to work correctly in both strict mode (set -euo pipefail) -# and non-strict mode. It uses a subshell for the API check to prevent exit on errors. -container_api_available() { - if [ -n "${HICLAW_CONTAINER_API}" ]; then - # HTTP proxy mode: check if proxy is reachable - local version - version=$(curl -s "${CONTAINER_API_BASE}/version" 2>/dev/null) || true - if echo "${version}" | grep -q '"ApiVersion"' 2>/dev/null; then - return 0 - fi - return 1 - fi - # Unix socket mode (legacy) - if [ ! 
-S "${CONTAINER_SOCKET}" ]; then - return 1 - fi - # Use a subshell to prevent strict mode (set -e) from exiting on curl failures - # The || true ensures the command substitution doesn't fail in strict mode - local version - version=$(_api GET /version 2>/dev/null) || true - if echo "${version}" | grep -q '"ApiVersion"' 2>/dev/null; then - return 0 - fi - return 1 -} - -# Get the Manager container's own IP (for Worker to connect back) -container_get_manager_ip() { - hostname -I 2>/dev/null | awk '{print $1}' -} +# ============================================================ +# Orchestrator API client +# ============================================================ -# Ensure a container image exists locally, pulling it if necessary. -# Usage: _ensure_image -# The Docker/Podman "create image" API streams JSON progress; we wait for -# completion and check the final status. -_ensure_image() { - local image="$1" - # Quick check: does the image already exist locally? - local inspect - inspect=$(_api GET "/images/${image}/json" 2>/dev/null) - if echo "${inspect}" | grep -q '"Id"' 2>/dev/null; then - return 0 +_orch_api() { + local method="$1" path="$2" body="${3:-}" + local url="${CONTAINER_API_BASE}${path}" + local auth_args=() + if [ -n "${HICLAW_ORCHESTRATOR_API_KEY:-}" ]; then + auth_args=(-H "Authorization: Bearer ${HICLAW_ORCHESTRATOR_API_KEY}") fi - - _log "Image not found locally, pulling: ${image}" - # POST /images/create?fromImage= streams progress JSON. - # curl will block until the pull finishes (or fails). 
- local pull_output - if [ -n "${HICLAW_CONTAINER_API}" ]; then - pull_output=$(curl -s -X POST "${CONTAINER_API_BASE}/images/create?fromImage=${image}" 2>&1) + if [ -n "$body" ]; then + curl -s -X "$method" "$url" "${auth_args[@]}" \ + -H "Content-Type: application/json" -d "$body" else - pull_output=$(curl -s --unix-socket "${CONTAINER_SOCKET}" \ - -X POST "${CONTAINER_API_BASE}/images/create?fromImage=${image}" 2>&1) + curl -s -X "$method" "$url" "${auth_args[@]}" fi - - # Verify the image is now available - inspect=$(_api GET "/images/${image}/json" 2>/dev/null) - if echo "${inspect}" | grep -q '"Id"' 2>/dev/null; then - _log "Image pulled successfully: ${image}" - return 0 - fi - - _log "ERROR: Failed to pull image: ${image}" - _log " Pull output (last 500 chars): ${pull_output: -500}" - return 1 } -# Create and start a Worker container -# Usage: container_create_worker [fs_access_key] [fs_secret_key] [extra_env_json] [custom_image] -# extra_env_json: optional JSON array of additional environment variables, e.g. '["SKILLS_API_URL=https://example.com"]' -# custom_image: optional custom Docker image to use instead of the default WORKER_IMAGE -# Returns: container ID on success, empty on failure -container_create_worker() { - local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - - # Build environment variables for the Worker - # Always use the fixed internal domain so workers on hiclaw-net can reach MinIO - # via the manager's network alias, regardless of user-configured FS domain. 
- local fs_endpoint="http://fs-local.hiclaw.io:8080" - local fs_access_key="${2:-${HICLAW_MINIO_USER:-${HICLAW_ADMIN_USER:-admin}}}" - local fs_secret_key="${3:-${HICLAW_MINIO_PASSWORD:-${HICLAW_ADMIN_PASSWORD:-admin}}}" - local extra_env="${4:-[]}" - local custom_image="${5:-}" - local image="${custom_image:-${WORKER_IMAGE}}" - - _log "Creating Worker container: ${container_name}" - _log " Image: ${image}" - _log " FS endpoint: ${fs_endpoint}" - - # Pull image if not available locally - if ! _ensure_image "${image}"; then - return 1 - fi - - # Remove existing container with same name (if any) - local existing - existing=$(_api GET "/containers/${container_name}/json" 2>/dev/null) - if echo "${existing}" | grep -q '"Id"' 2>/dev/null; then - _log "Removing existing container: ${container_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 +_orch_api_code() { + local method="$1" path="$2" body="${3:-}" + local url="${CONTAINER_API_BASE}${path}" + local auth_args=() + if [ -n "${HICLAW_ORCHESTRATOR_API_KEY:-}" ]; then + auth_args=(-H "Authorization: Bearer ${HICLAW_ORCHESTRATOR_API_KEY}") fi - - # Create the container - # Always use hiclaw-net; Docker DNS resolves *-local.hiclaw.io via manager's network aliases - local host_config="{\"NetworkMode\":\"hiclaw-net\"}" - - local worker_home="/root/hiclaw-fs/agents/${worker_name}" - - # Build base environment variables - local base_env='["HOME='"${worker_home}"'","HICLAW_WORKER_NAME='"${worker_name}"'","HICLAW_FS_ENDPOINT='"${fs_endpoint}"'","HICLAW_FS_ACCESS_KEY='"${fs_access_key}"'","HICLAW_FS_SECRET_KEY='"${fs_secret_key}"'"]' - - # Merge with extra environment variables if provided - local all_env - if [ "${extra_env}" != "[]" ] && [ -n "${extra_env}" ]; then - all_env=$(echo "${base_env} ${extra_env}" | jq -s 'add') + if [ -n "$body" ]; then + curl -s -o /dev/null -w '%{http_code}' -X "$method" "$url" "${auth_args[@]}" \ + -H "Content-Type: application/json" -d "$body" else - 
all_env="${base_env}" + curl -s -o /dev/null -w '%{http_code}' -X "$method" "$url" "${auth_args[@]}" fi - - local create_payload - create_payload=$(cat </dev/null) - - if [ -z "${container_id}" ]; then - _log "ERROR: Failed to create container. Response: ${create_resp}" - return 1 - fi - _log "Container created: ${container_id:0:12}" +# ============================================================ +# Worker Backend API (unified — orchestrator handles Docker/SAE dispatch) +# ============================================================ - # Start the container - local start_code - start_code=$(_api_code POST "/containers/${container_id}/start") - if [ "${start_code}" != "204" ] && [ "${start_code}" != "304" ]; then - _log "ERROR: Failed to start container (HTTP ${start_code})" - return 1 - fi +# Create a worker. Accepts JSON body with name, image, runtime, env, etc. +# Usage: worker_backend_create '{"name":"alice","image":"img:latest","env":{...}}' +worker_backend_create() { + local body="$1" + _orch_api POST /workers "$body" +} - _log "Worker container ${container_name} started successfully" - echo "${container_id}" - return 0 +# Delete a worker by name. +worker_backend_delete() { + local worker_name="$1" + _orch_api DELETE "/workers/${worker_name}" } -# Start an existing stopped Worker container -# Use this to wake up a container that was previously stopped (preserves container config). -# Different from container_create_worker which creates a new container from scratch. -container_start_worker() { +# Start a stopped worker. Returns 0 on success. 
+worker_backend_start() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" local code - code=$(_api_code POST "/containers/${container_name}/start") - if [ "${code}" = "204" ] || [ "${code}" = "304" ]; then - _log "Worker ${container_name} started" - return 0 - fi - _log "WARNING: Start returned HTTP ${code}" - return 1 + code=$(_orch_api_code POST "/workers/${worker_name}/start") + [ "${code}" -ge 200 ] && [ "${code}" -lt 300 ] } -# Stop a Worker container -container_stop_worker() { +# Stop a running worker. Returns 0 on success. +worker_backend_stop() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" local code - code=$(_api_code POST "/containers/${container_name}/stop?t=10") - if [ "${code}" = "204" ] || [ "${code}" = "304" ]; then - _log "Worker ${container_name} stopped" - return 0 - fi - _log "WARNING: Stop returned HTTP ${code}" - return 1 + code=$(_orch_api_code POST "/workers/${worker_name}/stop") + [ "${code}" -ge 200 ] && [ "${code}" -lt 300 ] } -# Remove a Worker container (force) -container_remove_worker() { +# Get worker status. Returns JSON with .status field. +worker_backend_status() { local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - _log "Worker ${container_name} removed" + _orch_api GET "/workers/${worker_name}" | jq -r '.status // "unknown"' 2>/dev/null +} + +# List all workers. Returns JSON with .workers array. +worker_backend_list() { + _orch_api GET /workers +} + +# Check if orchestrator API is reachable. 
+container_api_available() { + local code + code=$(_orch_api_code GET /workers 2>/dev/null) || true + [ "${code}" = "200" ] } -# Get Worker container logs +# ============================================================ +# Docker API passthrough (for exec, logs, inspect) +# ============================================================ +# These operations require raw Docker API access and go through +# the orchestrator's Docker API passthrough (catch-all route). +# Reuses _orch_api/_orch_api_code since they hit the same endpoint. + +_api() { _orch_api "$@"; } + +# Get Worker container logs (Docker API passthrough) container_logs_worker() { local worker_name="$1" local tail="${2:-50}" @@ -295,8 +123,7 @@ container_logs_worker() { _api GET "/containers/${container_name}/logs?stdout=true&stderr=true&tail=${tail}" } -# Get Worker container status -# Returns: "running", "exited", "created", or "not_found" +# Get Worker container status via Docker inspect (for readiness checks) container_status_worker() { local worker_name="$1" local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" @@ -310,18 +137,14 @@ container_status_worker() { } # Execute a command inside a Worker container via Docker exec API -# Usage: container_exec_worker [args...] 
-# Returns: command output (raw Docker stream; contains binary framing prefix per chunk) container_exec_worker() { local worker_name="$1" shift local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - # Build JSON array from args using jq for proper escaping local cmd_json cmd_json=$(jq -cn --args '$ARGS.positional' -- "$@") - # Create exec instance local exec_create exec_create=$(_api POST "/containers/${container_name}/exec" \ "{\"AttachStdout\":true,\"AttachStderr\":true,\"Tty\":false,\"Cmd\":${cmd_json}}") @@ -333,15 +156,11 @@ container_exec_worker() { return 1 fi - # Start exec and stream output (binary-framed; callers can grep the raw bytes) _api POST "/exec/${exec_id}/start" '{"Detach":false,"Tty":false}' return 0 } -# Wait for Worker agent (OpenClaw gateway) to become ready -# Mirrors the wait_manager_ready logic in hiclaw-install.sh -# Usage: container_wait_worker_ready [timeout_seconds] -# Returns: 0 if ready, 1 if timed out or container stopped unexpectedly +# Wait for OpenClaw Worker to become ready container_wait_worker_ready() { local worker_name="$1" local timeout="${2:-120}" @@ -350,7 +169,6 @@ container_wait_worker_ready() { _log "Waiting for Worker ${worker_name} to be ready (timeout: ${timeout}s)..." while [ "${elapsed}" -lt "${timeout}" ]; do - # Bail early if the container is no longer running local cstatus cstatus=$(container_status_worker "${worker_name}") if [ "${cstatus}" != "running" ]; then @@ -358,9 +176,6 @@ container_wait_worker_ready() { return 1 fi - # Check OpenClaw gateway health inside the worker container. - # The Docker exec API returns a binary-framed stream, but grep -q still - # finds the string inside the payload bytes. if container_exec_worker "${worker_name}" openclaw gateway health --json 2>/dev/null \ | grep -q '"ok"' 2>/dev/null; then _log "Worker ${worker_name} is ready!" 
@@ -376,157 +191,7 @@ container_wait_worker_ready() { return 1 } -# Create and start a CoPaw Worker container -# Uses the CoPaw worker image and sets appropriate working directory. -# Usage: container_create_copaw_worker [fs_access_key] [fs_secret_key] [extra_env_json] [custom_image] -container_create_copaw_worker() { - local worker_name="$1" - local container_name="${WORKER_CONTAINER_PREFIX}${worker_name}" - - # Always use the fixed internal domain so workers on hiclaw-net can reach MinIO - # via the manager's network alias, regardless of user-configured FS domain. - local fs_endpoint="http://fs-local.hiclaw.io:8080" - local fs_access_key="${2:-${HICLAW_MINIO_USER:-${HICLAW_ADMIN_USER:-admin}}}" - local fs_secret_key="${3:-${HICLAW_MINIO_PASSWORD:-${HICLAW_ADMIN_PASSWORD:-admin}}}" - local extra_env="${4:-[]}" - local custom_image="${5:-}" - local image="${custom_image:-${COPAW_WORKER_IMAGE}}" - - _log "Creating CoPaw Worker container: ${container_name}" - _log " Image: ${image}" - _log " FS endpoint: ${fs_endpoint}" - - # Pull image if not available locally - if ! 
_ensure_image "${image}"; then - return 1 - fi - - # Remove existing container with same name (if any) - local existing - existing=$(_api GET "/containers/${container_name}/json" 2>/dev/null) - if echo "${existing}" | grep -q '"Id"' 2>/dev/null; then - _log "Removing existing container: ${container_name}" - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 - fi - - # CoPaw uses /root/.copaw-worker as install dir (not /root/hiclaw-fs/agents/) - local base_env='["HICLAW_WORKER_NAME='"${worker_name}"'","HICLAW_FS_ENDPOINT='"${fs_endpoint}"'","HICLAW_FS_ACCESS_KEY='"${fs_access_key}"'","HICLAW_FS_SECRET_KEY='"${fs_secret_key}"'"]' - - local all_env - if [ "${extra_env}" != "[]" ] && [ -n "${extra_env}" ]; then - all_env=$(echo "${base_env} ${extra_env}" | jq -s 'add') - else - all_env="${base_env}" - fi - - # Detect HICLAW_CONSOLE_PORT in env to set up port binding - local console_port="" - console_port=$(echo "${all_env}" | jq -r '.[] | select(startswith("HICLAW_CONSOLE_PORT=")) | split("=")[1]' 2>/dev/null || true) - - if [ -n "${console_port}" ]; then - _log " Console port: ${console_port}" - fi - - # ExposedPorts tells Docker which ports the container listens on - local exposed_ports="{}" - if [ -n "${console_port}" ]; then - exposed_ports="{\"${console_port}/tcp\":{}}" - fi - - # Pick a random host port (10000-20000) to minimize conflicts across workers - local host_port="${console_port}" - if [ -n "${console_port}" ]; then - host_port=$(( (RANDOM % 10001) + 10000 )) - _log " Host port: ${host_port} (random)" - fi - local max_port_retries=10 - local port_attempt=0 - - while true; do - # Build HostConfig with NetworkMode (hiclaw-net) and optional PortBindings - # Docker DNS resolves *-local.hiclaw.io via manager's network aliases; no ExtraHosts needed - local host_config - if [ -n "${console_port}" ]; then - host_config="{\"NetworkMode\":\"hiclaw-net\",\"PortBindings\":{\"${console_port}/tcp\":[{\"HostPort\":\"${host_port}\"}]}}" - 
else - host_config="{\"NetworkMode\":\"hiclaw-net\"}" - fi - - local create_payload - create_payload=$(cat </dev/null) - - if [ -z "${container_id}" ]; then - _log "ERROR: Failed to create CoPaw container. Response: ${create_resp}" - return 1 - fi - - _log "CoPaw container created: ${container_id:0:12}" - - # Start the container — capture both HTTP status code and response body - local start_output - if [ -n "${HICLAW_CONTAINER_API}" ]; then - start_output=$(curl -s -w '\n%{http_code}' \ - -X POST "${CONTAINER_API_BASE}/containers/${container_id}/start") - else - start_output=$(curl -s -w '\n%{http_code}' --unix-socket "${CONTAINER_SOCKET}" \ - -X POST "${CONTAINER_API_BASE}/containers/${container_id}/start") - fi - local start_code - start_code=$(echo "${start_output}" | tail -1) - local start_body - start_body=$(echo "${start_output}" | sed '$d') - - if [ "${start_code}" = "204" ] || [ "${start_code}" = "304" ]; then - if [ -n "${console_port}" ]; then - _log "Console: container port ${console_port} -> host port ${host_port}" - _log "CONSOLE_HOST_PORT=${host_port}" - fi - _log "CoPaw Worker container ${container_name} started successfully" - echo "${container_id}" - return 0 - fi - - # Start failed — check if it's a port conflict we can retry - local err_msg - err_msg=$(echo "${start_body}" | jq -r '.message // empty' 2>/dev/null) - - if [ -n "${console_port}" ] && echo "${err_msg}" | grep -qi "already allocated\|address already in use\|port is already" 2>/dev/null; then - port_attempt=$((port_attempt + 1)) - if [ "${port_attempt}" -ge "${max_port_retries}" ]; then - _log "ERROR: Could not find available port after ${max_port_retries} attempts (tried ${console_port}-${host_port})" - return 1 - fi - _log "Host port ${host_port} is in use, trying $((host_port + 1))..." 
- host_port=$((host_port + 1)) - _api DELETE "/containers/${container_name}?force=true" > /dev/null 2>&1 - sleep 1 - continue - fi - - # Non-port-conflict error — fail immediately - _log "ERROR: Failed to start CoPaw container (HTTP ${start_code}): ${err_msg:-${start_body}}" - return 1 - done -} - # Wait for CoPaw Worker to become ready -# CoPaw writes config.json after bridge completes; we check for that file. -# Usage: container_wait_copaw_worker_ready [timeout_seconds] container_wait_copaw_worker_ready() { local worker_name="$1" local timeout="${2:-120}" @@ -543,7 +208,6 @@ container_wait_copaw_worker_ready() { return 1 fi - # Check if CoPaw bridge has completed (config.json with channels key exists) if container_exec_worker "${worker_name}" cat "${config_file}" 2>/dev/null \ | grep -q '"channels"' 2>/dev/null; then _log "CoPaw Worker ${worker_name} is ready!" @@ -559,112 +223,7 @@ container_wait_copaw_worker_ready() { return 1 } -# List all HiClaw Worker containers -container_list_workers() { - _api GET "/containers/json?all=true&filters=%7B%22name%22%3A%5B%22${WORKER_CONTAINER_PREFIX}%22%5D%7D" 2>/dev/null | \ - jq -r '.[] | "\(.Names[0] | ltrimstr("/") | ltrimstr("'"${WORKER_CONTAINER_PREFIX}"'"))\t\(.State)\t\(.Status)"' 2>/dev/null -} - - -# ============================================================ -# Cloud Provider Extensions -# ============================================================ -# Load cloud providers (additive — does not modify upstream functions above). -# Each provider file defines its own *_available() check and lifecycle functions. -for _provider_file in /opt/hiclaw/scripts/lib/cloud/*.sh; do - [ -f "${_provider_file}" ] && source "${_provider_file}" -done -unset _provider_file - -# ============================================================ -# Unified Worker Backend API -# ============================================================ -# Auto-detects Docker vs cloud vs none and dispatches to the right backend. 
-# All skill scripts should use these instead of calling Docker/SAE directly. - -_detect_worker_backend() { - if container_api_available 2>/dev/null; then - echo "docker" - elif [ "${HICLAW_RUNTIME:-}" = "aliyun" ]; then - echo "aliyun" - elif type cloud_sae_available &>/dev/null && cloud_sae_available; then - echo "aliyun" - else - echo "none" - fi -} - -worker_backend_create() { - local worker_name="$1" - local fs_access_key="${2:-}" - local fs_secret_key="${3:-}" - local extra_env_json="${4:-[]}" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) - container_create_worker "${worker_name}" "${fs_access_key}" "${fs_secret_key}" "${extra_env_json}" - ;; - aliyun) - local envs_obj="{}" - if [ "${extra_env_json}" != "[]" ] && [ -n "${extra_env_json}" ]; then - envs_obj=$(echo "${extra_env_json}" | jq '[.[] | split("=") | {(.[0]): (.[1:] | join("="))}] | add // {}') - fi - sae_create_worker "${worker_name}" "${envs_obj}" - ;; - none) - _log "No worker backend available (no Docker socket, no cloud config)" - echo '{"error": "no_backend"}' - return 1 - ;; - esac -} - -worker_backend_status() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_status_worker "${worker_name}" ;; - aliyun) sae_status_worker "${worker_name}" ;; - none) echo "unknown" ;; - esac -} - -worker_backend_stop() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_stop_worker "${worker_name}" ;; - aliyun) sae_stop_worker "${worker_name}" ;; - none) return 1 ;; - esac -} - -worker_backend_start() { - local worker_name="$1" - local backend - backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_start_worker "${worker_name}" ;; - aliyun) sae_start_worker "${worker_name}" ;; - none) return 1 ;; - esac -} - -worker_backend_delete() { - local worker_name="$1" - local backend - 
backend=$(_detect_worker_backend) - - case "${backend}" in - docker) container_remove_worker "${worker_name}" ;; - aliyun) sae_delete_worker "${worker_name}" ;; - none) return 1 ;; - esac +# Get the Manager container's own IP (for Worker to connect back) +container_get_manager_ip() { + hostname -I 2>/dev/null | awk '{print $1}' } diff --git a/manager/scripts/lib/gateway-api.sh b/manager/scripts/lib/gateway-api.sh index 208e34c0..2d8b78c9 100644 --- a/manager/scripts/lib/gateway-api.sh +++ b/manager/scripts/lib/gateway-api.sh @@ -1,8 +1,7 @@ #!/bin/bash # gateway-api.sh - Unified gateway consumer/route/MCP authorization abstraction # -# Dispatches to Higress Console REST API (local) or AI Gateway API (cloud). -# Follows the same pattern as worker_backend_* in container-api.sh. +# Dispatches to Higress Console REST API (local) or orchestrator API (cloud). # # Provides: # gateway_ensure_session() — ensure Higress cookie (local) / no-op (cloud) @@ -12,17 +11,7 @@ # # Prerequisites: # - source hiclaw-env.sh (for HICLAW_RUNTIME) -# - HICLAW_ADMIN_USER, HICLAW_ADMIN_PASSWORD (for Higress login) -# - HIGRESS_COOKIE_FILE (set by start-manager-agent.sh or gateway_ensure_session) -# -# Usage: -# source /opt/hiclaw/scripts/lib/gateway-api.sh - -# ── Load cloud providers (additive) ────────────────────────────────────────── -for _gw_provider_file in /opt/hiclaw/scripts/lib/cloud/*.sh; do - [ -f "${_gw_provider_file}" ] && source "${_gw_provider_file}" -done -unset _gw_provider_file +# - source container-api.sh (for _orch_api) # ── Backend detection ───────────────────────────────────────────────────────── @@ -36,15 +25,11 @@ _detect_gateway_backend() { # ── Session management ──────────────────────────────────────────────────────── -# Ensure a valid Higress Console session cookie exists. -# In cloud mode this is a no-op (no local Higress). -# Sets HIGRESS_COOKIE_FILE as a side effect. 
gateway_ensure_session() { local backend backend=$(_detect_gateway_backend) [ "${backend}" != "higress" ] && return 0 - # Already have a valid cookie if [ -n "${HIGRESS_COOKIE_FILE:-}" ] && [ -s "${HIGRESS_COOKIE_FILE:-}" ]; then return 0 fi @@ -87,7 +72,7 @@ _gateway_cloud_create_consumer() { local credential_key="$2" local resp - resp=$(cloud_create_consumer "${consumer_name}" 2>/dev/null) || true + resp=$(_orch_api POST /gateway/consumers "{\"name\":\"${consumer_name}\"}") || true local status status=$(echo "${resp}" | jq -r '.status // "error"' 2>/dev/null) @@ -128,9 +113,6 @@ _gateway_higress_create_consumer() { # ── Route authorization ─────────────────────────────────────────────────────── -# gateway_authorize_routes -# Cloud: binds consumer to model API via cloud_bind_consumer (if env vars set) -# Local: iterates all AI routes and adds consumer to allowedConsumers gateway_authorize_routes() { local consumer_name="$1" local backend @@ -148,12 +130,11 @@ gateway_authorize_routes() { _gateway_cloud_authorize_routes() { local consumer_name="$1" - - # consumer_id is passed via GATEWAY_CONSUMER_ID (set by caller after gateway_create_consumer) local consumer_id="${GATEWAY_CONSUMER_ID:-}" + if [ -n "${consumer_id}" ] && [ -n "${HICLAW_GW_MODEL_API_ID:-}" ] && [ -n "${HICLAW_GW_ENV_ID:-}" ]; then - local bind_result - bind_result=$(cloud_bind_consumer "${consumer_id}" "${HICLAW_GW_MODEL_API_ID}" "${HICLAW_GW_ENV_ID}" 2>/dev/null) || true + _orch_api POST "/gateway/consumers/${consumer_id}/bind" \ + "{\"model_api_id\":\"${HICLAW_GW_MODEL_API_ID}\",\"env_id\":\"${HICLAW_GW_ENV_ID}\"}" > /dev/null 2>&1 || true else local skip_reason="" [ -z "${consumer_id}" ] && skip_reason="consumer_id empty" @@ -220,10 +201,6 @@ _gateway_higress_authorize_routes() { # ── MCP server authorization ───────────────────────────────────────────────── -# gateway_authorize_mcp -# Cloud: no-op (MCP servers managed via AI Gateway console) -# Local: iterates MCP servers and adds consumer to 
allowedConsumers -# Sets TARGET_MCP_LIST as a side effect (resolved list of MCP server names) gateway_authorize_mcp() { local consumer_name="$1" local mcp_servers_csv="${2:-}" @@ -232,7 +209,6 @@ gateway_authorize_mcp() { case "${backend}" in aliyun) - # Cloud: MCP authorization is managed via AI Gateway console TARGET_MCP_LIST="${mcp_servers_csv}" ;; higress) @@ -250,7 +226,6 @@ _gateway_higress_authorize_mcp() { -b "${HIGRESS_COOKIE_FILE}" 2>/dev/null) || true all_mcp=$(echo "${all_mcp_raw}" | jq '.data // .' 2>/dev/null || echo "${all_mcp_raw}") - # Resolve target list: use provided CSV or default to all existing MCP servers if [ -n "${mcp_servers_csv}" ]; then TARGET_MCP_LIST="${mcp_servers_csv}" else @@ -262,7 +237,6 @@ _gateway_higress_authorize_mcp() { return 0 fi - # Build a set of existing MCP server names for quick lookup local existing_names existing_names=$(echo "${all_mcp}" | jq -r '.[].name // empty' 2>/dev/null || true) @@ -273,9 +247,8 @@ _gateway_higress_authorize_mcp() { mcp_name=$(echo "${mcp_name}" | tr -d ' ') [ -z "${mcp_name}" ] && continue - # Check if the MCP server actually exists before trying to authorize - if ! echo "${existing_names}" | grep -qx "${mcp_name}"; then - echo "[gateway-api] SKIPPED: MCP server '${mcp_name}' does not exist — create it first via mcp-server-management skill, then authorize this worker" >&2 + if ! 
echo "${existing_names}" | grep -Fqx "${mcp_name}"; then + echo "[gateway-api] SKIPPED: MCP server '${mcp_name}' does not exist" >&2 continue fi @@ -307,6 +280,5 @@ _gateway_higress_authorize_mcp() { resolved_list="${resolved_list:+${resolved_list},}${mcp_name}" done - # Update TARGET_MCP_LIST to only include servers that actually exist TARGET_MCP_LIST="${resolved_list}" } diff --git a/orchestrator/backend/apig.go b/orchestrator/backend/apig.go index d891b1d5..7cb0d436 100644 --- a/orchestrator/backend/apig.go +++ b/orchestrator/backend/apig.go @@ -149,11 +149,24 @@ func (a *APIGBackend) CreateConsumer(_ context.Context, req ConsumerRequest) (*C } func (a *APIGBackend) BindConsumer(_ context.Context, req BindRequest) error { + // Fallback to config if not provided in request + modelAPIID := req.ModelAPIID + if modelAPIID == "" { + modelAPIID = a.config.ModelAPIID + } + envID := req.EnvID + if envID == "" { + envID = a.config.EnvID + } + if modelAPIID == "" || envID == "" { + return fmt.Errorf("model_api_id and env_id are required (neither provided in request nor configured)") + } + // Check if already bound queryReq := &apig.QueryConsumerAuthorizationRulesRequest{} queryReq.SetConsumerId(req.ConsumerID). - SetResourceId(req.ModelAPIID). - SetEnvironmentId(req.EnvID). + SetResourceId(modelAPIID). + SetEnvironmentId(envID). SetResourceType("LLM"). SetPageNumber(1). 
SetPageSize(100) @@ -173,8 +186,8 @@ func (a *APIGBackend) BindConsumer(_ context.Context, req BindRequest) error { ResourceType: tea.String("LLM"), ExpireMode: tea.String("LongTerm"), ResourceIdentifier: &apig.CreateConsumerAuthorizationRulesRequestAuthorizationRulesResourceIdentifier{ - ResourceId: tea.String(req.ModelAPIID), - EnvironmentId: tea.String(req.EnvID), + ResourceId: tea.String(modelAPIID), + EnvironmentId: tea.String(envID), }, }, }) From 85128a5ae52b9d081ed353c25be23f16aa5dbad3 Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 19:26:26 +0800 Subject: [PATCH 05/11] fix(orchestrator): fix 5 bugs found in cloud testing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Unify HICLAW_CONTAINER_API → HICLAW_ORCHESTRATOR_URL (single env var) 2. Remove HICLAW_RUNTIME from create-worker.sh (orchestrator decides) 3. Make image optional in worker create API (backend provides default) 4. Add Timestamp to STS AssumeRoleWithOIDC call 5. SAEBackend.Create() auto-injects HICLAW_RUNTIME=aliyun into worker env 6. oss-credentials.sh: support dual path (RRSA direct + orchestrator mediated) Co-Authored-By: Claude Opus 4.6 --- .../scripts/create-worker.sh | 6 +- manager/scripts/lib/container-api.sh | 4 +- orchestrator/api/worker_handler.go | 4 - orchestrator/api/worker_handler_test.go | 5 +- orchestrator/backend/sae.go | 6 + orchestrator/credentials/sts.go | 19 +-- shared/lib/oss-credentials.sh | 137 +++++++++++++++--- 7 files changed, 139 insertions(+), 42 deletions(-) diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 783ce4f5..0b546082 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -787,15 +787,13 @@ elif container_api_available; then | if $oss_bucket != "" then . 
+ { "HICLAW_OSS_BUCKET": $oss_bucket, "HICLAW_REGION": $region } else . end | if $skills_api_url != "" then . + { "SKILLS_API_URL": $skills_api_url } else . end | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end - | if $runtime == "copaw" then - . + { "HICLAW_RUNTIME": "aliyun" } - else + | if $runtime != "copaw" then . + { "OPENCLAW_DISABLE_BONJOUR": "1", "OPENCLAW_MDNS_HOSTNAME": ("hiclaw-w-" + $worker_name), "HOME": ("/root/hiclaw-fs/agents/" + $worker_name) } - end') + else . end') # Build create request body CREATE_BODY=$(jq -cn \ diff --git a/manager/scripts/lib/container-api.sh b/manager/scripts/lib/container-api.sh index 309668ba..7b15c9b6 100755 --- a/manager/scripts/lib/container-api.sh +++ b/manager/scripts/lib/container-api.sh @@ -6,7 +6,7 @@ # Docker exec/logs operations still use Docker API passthrough. # # Required: -# HICLAW_CONTAINER_API - orchestrator URL (e.g. http://hiclaw-orchestrator:2375) +# HICLAW_ORCHESTRATOR_URL - orchestrator URL (e.g. 
http://hiclaw-orchestrator:2375) # # Usage: # source /opt/hiclaw/scripts/lib/container-api.sh @@ -14,7 +14,7 @@ # worker_backend_status "alice" # worker_backend_delete "alice" -CONTAINER_API_BASE="${HICLAW_CONTAINER_API:-http://localhost:2375}" +CONTAINER_API_BASE="${HICLAW_ORCHESTRATOR_URL:-http://localhost:2375}" WORKER_CONTAINER_PREFIX="hiclaw-worker-" _log() { diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index 2bb659e6..1c86343d 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -34,10 +34,6 @@ func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { httputil.WriteError(w, http.StatusBadRequest, "name is required") return } - if req.Image == "" { - httputil.WriteError(w, http.StatusBadRequest, "image is required") - return - } b, err := h.registry.GetWorkerBackend(r.Context(), req.Backend) if err != nil { diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go index f6cdafdd..672f160f 100644 --- a/orchestrator/api/worker_handler_test.go +++ b/orchestrator/api/worker_handler_test.go @@ -166,8 +166,9 @@ func TestCreateWorkerMissingImage(t *testing.T) { w := httptest.NewRecorder() mux.ServeHTTP(w, req) - if w.Code != http.StatusBadRequest { - t.Errorf("expected 400, got %d", w.Code) + // Image is optional — backend provides default + if w.Code != http.StatusCreated { + t.Errorf("expected 201, got %d", w.Code) } } diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go index e4bc1fd0..d543f0b8 100644 --- a/orchestrator/backend/sae.go +++ b/orchestrator/backend/sae.go @@ -122,6 +122,12 @@ func (s *SAEBackend) Create(_ context.Context, req CreateRequest) (*WorkerResult } } + // SAE backend auto-injects runtime identifier so workers know they're on cloud + if req.Env == nil { + req.Env = make(map[string]string) + } + req.Env["HICLAW_RUNTIME"] = "aliyun" + envList := s.buildEnvList(req.Env) saeReq := 
&sae.CreateApplicationRequest{} diff --git a/orchestrator/credentials/sts.go b/orchestrator/credentials/sts.go index c067ec17..644a654c 100644 --- a/orchestrator/credentials/sts.go +++ b/orchestrator/credentials/sts.go @@ -62,15 +62,16 @@ func (s *STSService) IssueWorkerToken(ctx context.Context, workerName string) (* } form := url.Values{ - "Action": {"AssumeRoleWithOIDC"}, - "Format": {"JSON"}, - "Version": {"2015-04-01"}, - "RoleArn": {s.config.RoleArn}, - "OIDCProviderArn": {s.config.OIDCProviderArn}, - "OIDCToken": {strings.TrimSpace(string(oidcToken))}, - "RoleSessionName": {fmt.Sprintf("hiclaw-worker-%s", workerName)}, - "DurationSeconds": {"3600"}, - "Policy": {policy}, + "Action": {"AssumeRoleWithOIDC"}, + "Format": {"JSON"}, + "Version": {"2015-04-01"}, + "Timestamp": {time.Now().UTC().Format("2006-01-02T15:04:05Z")}, + "RoleArn": {s.config.RoleArn}, + "OIDCProviderArn": {s.config.OIDCProviderArn}, + "OIDCToken": {strings.TrimSpace(string(oidcToken))}, + "RoleSessionName": {fmt.Sprintf("hiclaw-worker-%s", workerName)}, + "DurationSeconds": {"3600"}, + "Policy": {policy}, } req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, strings.NewReader(form.Encode())) diff --git a/shared/lib/oss-credentials.sh b/shared/lib/oss-credentials.sh index 8b29c29d..84cc0b55 100644 --- a/shared/lib/oss-credentials.sh +++ b/shared/lib/oss-credentials.sh @@ -1,29 +1,111 @@ #!/bin/bash # oss-credentials.sh - STS credential management for mc (MinIO Client) # -# Workers obtain STS temporary credentials from the orchestrator service. -# The orchestrator holds OIDC credentials and issues per-worker scoped tokens. -# STS tokens expire after 1 hour. This library provides lazy-refresh: credentials -# are cached in a file and refreshed only when they are about to expire. +# Two credential paths (checked in priority order): # -# Required env vars (set by orchestrator at worker creation): -# HICLAW_ORCHESTRATOR_URL - orchestrator HTTP endpoint (e.g. 
http://hiclaw-orchestrator:2375) -# HICLAW_WORKER_API_KEY - per-worker API key for authentication +# 1. RRSA OIDC (Manager, Orchestrator — any SAE app with oidc_role_name): +# ALIBABA_CLOUD_OIDC_TOKEN_FILE exists → call STS AssumeRoleWithOIDC directly. +# Worker inline policy applied when HICLAW_WORKER_NAME is set. +# +# 2. Orchestrator-mediated STS (Workers without RRSA): +# HICLAW_ORCHESTRATOR_URL + HICLAW_WORKER_API_KEY → call orchestrator /credentials/sts. +# +# 3. Neither → no-op (local mode, mc alias configured with static credentials). +# +# STS tokens expire after 1 hour. Credentials are cached and lazy-refreshed. # # Usage: # source /opt/hiclaw/scripts/lib/oss-credentials.sh # ensure_mc_credentials # call before any mc command -# mc mirror ... -# -# In local mode (no HICLAW_ORCHESTRATOR_URL), ensure_mc_credentials is a no-op. _OSS_CRED_FILE="/tmp/mc-oss-credentials.env" _OSS_CRED_REFRESH_MARGIN=600 # refresh if less than 10 minutes remaining -# Internal: call orchestrator STS endpoint and write credentials to file +# -------------------------------------------------------------------------- +# Path 1: Direct STS via RRSA OIDC +# -------------------------------------------------------------------------- + +# Build an inline STS policy restricting OSS access to the worker's own prefix. +# Only used when HICLAW_WORKER_NAME is set (worker context). 
+_oss_build_worker_policy() { + local worker="$1" + local bucket="${HICLAW_OSS_BUCKET:-hiclaw-cloud-storage}" + cat <&2 + fi + + sts_resp=$(curl -s -w "\n%{http_code}" -X POST "https://sts-vpc.${region}.aliyuncs.com" \ + -d "Action=AssumeRoleWithOIDC" \ + -d "Format=JSON" \ + -d "Version=2015-04-01" \ + --data-urlencode "Timestamp=${timestamp}" \ + -d "SignatureNonce=${nonce}" \ + --data-urlencode "RoleArn=${ALIBABA_CLOUD_ROLE_ARN}" \ + --data-urlencode "OIDCProviderArn=${ALIBABA_CLOUD_OIDC_PROVIDER_ARN}" \ + --data-urlencode "OIDCToken=${oidc_token}" \ + -d "RoleSessionName=hiclaw-oss-session" \ + -d "DurationSeconds=3600" \ + "${policy_args[@]}" \ + --connect-timeout 10 --max-time 30 2>&1) + + http_code=$(echo "${sts_resp}" | tail -1) + sts_resp=$(echo "${sts_resp}" | sed '$d') + + if [ "${http_code}" != "200" ]; then + echo "[oss-credentials] ERROR: STS request failed (HTTP ${http_code})" >&2 + echo "[oss-credentials] Response: ${sts_resp}" >&2 + return 1 + fi + + sts_ak=$(echo "${sts_resp}" | jq -r '.Credentials.AccessKeyId') + sts_sk=$(echo "${sts_resp}" | jq -r '.Credentials.AccessKeySecret') + sts_token=$(echo "${sts_resp}" | jq -r '.Credentials.SecurityToken') + + if [ -z "${sts_ak}" ] || [ "${sts_ak}" = "null" ]; then + echo "[oss-credentials] ERROR: Failed to parse STS credentials" >&2 + echo "[oss-credentials] Response: ${sts_resp}" >&2 + return 1 + fi + + expires_at=$(( $(date +%s) + 3600 )) + + cat > "${_OSS_CRED_FILE}" </dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 +} + +# -------------------------------------------------------------------------- +# Path 2: STS via Orchestrator (workers without RRSA) +# -------------------------------------------------------------------------- + _oss_refresh_sts_via_orchestrator() { local resp http_code - local sts_ak sts_sk sts_token oss_endpoint oss_bucket region + local sts_ak sts_sk sts_token oss_endpoint oss_bucket resp=$(curl -s -w "\n%{http_code}" -X POST 
"${HICLAW_ORCHESTRATOR_URL}/credentials/sts" \ -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" \ @@ -42,7 +124,6 @@ _oss_refresh_sts_via_orchestrator() { sts_sk=$(echo "${resp}" | jq -r '.access_key_secret') sts_token=$(echo "${resp}" | jq -r '.security_token') oss_endpoint=$(echo "${resp}" | jq -r '.oss_endpoint') - oss_bucket=$(echo "${resp}" | jq -r '.oss_bucket') if [ -z "${sts_ak}" ] || [ "${sts_ak}" = "null" ]; then echo "[oss-credentials] ERROR: Failed to parse STS credentials from orchestrator" >&2 @@ -50,7 +131,6 @@ _oss_refresh_sts_via_orchestrator() { return 1 fi - # expires_at = now + 3600 seconds (STS token lifetime) local expires_at expires_at=$(( $(date +%s) + 3600 )) @@ -63,19 +143,34 @@ EOF echo "[oss-credentials] STS credentials refreshed via orchestrator (AK prefix: ${sts_ak:0:8}..., expires: $(date -d @${expires_at} '+%H:%M:%S' 2>/dev/null || date -r ${expires_at} '+%H:%M:%S' 2>/dev/null || echo ${expires_at}))" >&2 } -# Public: ensure MC_HOST_hiclaw is set with valid (non-expired) STS credentials. -# In local mode (no HICLAW_ORCHESTRATOR_URL), this is a no-op. +# -------------------------------------------------------------------------- +# Public API +# -------------------------------------------------------------------------- + ensure_mc_credentials() { - # Skip in local mode — mc alias is configured with static credentials - if [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ]; then - return 0 + # Priority 1: RRSA OIDC token file exists → direct STS call + if [ -n "${ALIBABA_CLOUD_OIDC_TOKEN_FILE:-}" ] && [ -f "${ALIBABA_CLOUD_OIDC_TOKEN_FILE}" ]; then + _oss_ensure_refresh _oss_refresh_sts_direct + return $? fi + # Priority 2: Orchestrator URL + worker API key → orchestrator-mediated STS + if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ] && [ -n "${HICLAW_WORKER_API_KEY:-}" ]; then + _oss_ensure_refresh _oss_refresh_sts_via_orchestrator + return $? 
+ fi + + # Priority 3: local mode — mc alias configured with static credentials + return 0 +} + +# Shared lazy-refresh logic: call the given refresh function only if needed. +_oss_ensure_refresh() { + local refresh_fn="$1" local now needs_refresh=false now=$(date +%s) if [ -f "${_OSS_CRED_FILE}" ]; then - # Source to get _OSS_CRED_EXPIRES_AT . "${_OSS_CRED_FILE}" if [ -z "${_OSS_CRED_EXPIRES_AT:-}" ] || [ $(( _OSS_CRED_EXPIRES_AT - now )) -lt ${_OSS_CRED_REFRESH_MARGIN} ]; then needs_refresh=true @@ -85,7 +180,7 @@ ensure_mc_credentials() { fi if [ "${needs_refresh}" = true ]; then - _oss_refresh_sts_via_orchestrator || return 1 + ${refresh_fn} || return 1 . "${_OSS_CRED_FILE}" fi From 4455f932d511edfde91b2919f35996f618e31b93 Mon Sep 17 00:00:00 2001 From: jingze Date: Thu, 26 Mar 2026 19:48:10 +0800 Subject: [PATCH 06/11] feat(orchestrator): add worker readiness detection - SAEBackend.Create() polls DescribeApplicationStatus until RUNNING (max 120s) - New POST /workers/{name}/ready endpoint for worker self-reporting - GET /workers/{name} merges readiness: running + reported ready = "ready" - Worker entrypoints (openclaw + copaw) report ready to orchestrator in background - New worker_backend_wait_ready() in container-api.sh for unified readiness polling - create-worker.sh Step 9 uses unified wait instead of Docker exec-based polling Co-Authored-By: Claude Opus 4.6 --- copaw/scripts/copaw-worker-entrypoint.sh | 29 ++++ copaw/src/copaw_worker/sync.py | 5 +- .../scripts/create-worker.sh | 27 +--- .../scripts/enable-worker-console.sh | 2 +- manager/scripts/lib/container-api.sh | 74 +++------- orchestrator/api/worker_handler.go | 80 ++++++++++- orchestrator/api/worker_handler_test.go | 128 ++++++++++++++++++ orchestrator/backend/backend.go | 1 + orchestrator/backend/sae.go | 43 +++++- orchestrator/backend/sae_test.go | 2 +- orchestrator/main.go | 3 + worker/scripts/worker-entrypoint.sh | 26 ++++ 12 files changed, 335 insertions(+), 85 deletions(-) diff --git 
a/copaw/scripts/copaw-worker-entrypoint.sh b/copaw/scripts/copaw-worker-entrypoint.sh index d25bf737..ea7addcb 100755 --- a/copaw/scripts/copaw-worker-entrypoint.sh +++ b/copaw/scripts/copaw-worker-entrypoint.sh @@ -63,6 +63,31 @@ mkdir -p "${WORKER_SKILLS_DIR}" mkdir -p "${HOME}/.agents" ln -sfn "${WORKER_SKILLS_DIR}" "${HOME}/.agents/skills" +# Background readiness reporter — report ready to orchestrator when CoPaw bridge completes +_start_readiness_reporter() { + [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ] || [ -z "${HICLAW_WORKER_API_KEY:-}" ] && return 0 + ( + TIMEOUT=120; ELAPSED=0 + CONFIG_FILE="${INSTALL_DIR}/${WORKER_NAME}/.copaw/config.json" + while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do + if [ -f "${CONFIG_FILE}" ] && grep -q '"channels"' "${CONFIG_FILE}" 2>/dev/null; then + for _attempt in 1 2 3; do + if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" 2>/dev/null; then + log "Reported ready to orchestrator" + exit 0 + fi + sleep 2 + done + log "WARNING: POST to orchestrator failed, will retry health check loop" + fi + sleep 5; ELAPSED=$((ELAPSED + 5)) + done + log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + ) & + log "Background readiness reporter started (PID: $!)" +} + if [ -n "${CONSOLE_PORT}" ]; then # ---------- Standard mode: copaw-worker (PyPI CoPaw venv, with console) ---------- VENV="/opt/venv/standard" @@ -72,6 +97,8 @@ if [ -n "${CONSOLE_PORT}" ]; then log " Console port: ${CONSOLE_PORT}" log " CoPaw: standard (${VENV})" + _start_readiness_reporter + exec "${VENV}/bin/copaw-worker" \ --name "${WORKER_NAME}" \ --fs "${FS_ENDPOINT}" \ @@ -88,6 +115,8 @@ else log " Install dir: ${INSTALL_DIR}" log " CoPaw: lite (${VENV})" + _start_readiness_reporter + exec "${VENV}/bin/copaw-worker" \ --name "${WORKER_NAME}" \ --fs "${FS_ENDPOINT}" \ diff --git a/copaw/src/copaw_worker/sync.py b/copaw/src/copaw_worker/sync.py index 7471e70b..9b121ac4 100644 --- 
a/copaw/src/copaw_worker/sync.py +++ b/copaw/src/copaw_worker/sync.py @@ -73,10 +73,7 @@ def __init__( self.local_dir.mkdir(parents=True, exist_ok=True) self._prefix = f"agents/{worker_name}" self._alias_set = False - self._cloud_mode = bool( - os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE") - and Path(os.environ.get("ALIBABA_CLOUD_OIDC_TOKEN_FILE", "")).is_file() - ) + self._cloud_mode = os.environ.get("HICLAW_RUNTIME") == "aliyun" # ------------------------------------------------------------------ # mc alias management diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 0b546082..6f3d3a7b 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -818,29 +818,14 @@ elif container_api_available; then DEPLOY_MODE="cloud" fi - # Wait for readiness (only for local Docker containers with exec access) - if [ "${DEPLOY_MODE}" = "local" ] && [ -n "${CONTAINER_ID}" ]; then - log " Waiting for Worker agent to be ready..." - if [ "${WORKER_RUNTIME}" = "copaw" ]; then - if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " CoPaw Worker agent is ready!" - else - WORKER_STATUS="starting" - log " WARNING: CoPaw Worker agent not ready within timeout" - fi - else - if container_wait_worker_ready "${WORKER_NAME}" 120; then - WORKER_STATUS="ready" - log " Worker agent is ready!" - else - WORKER_STATUS="starting" - log " WARNING: Worker agent not ready within timeout" - fi - fi + # Wait for worker to report ready (unified — works for both Docker and SAE) + log " Waiting for Worker agent to be ready..." + if worker_backend_wait_ready "${WORKER_NAME}" 120; then + WORKER_STATUS="ready" + log " Worker agent is ready!" 
else WORKER_STATUS="starting" - log " Worker created on ${DEPLOY_MODE} backend" + log " WARNING: Worker agent not ready within timeout" fi else log " WARNING: Worker creation failed, falling back to remote mode" diff --git a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh index 23b92258..1be01e81 100755 --- a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh +++ b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh @@ -131,7 +131,7 @@ fi # --- Wait for ready --- log "Waiting for CoPaw worker to be ready..." -if container_wait_copaw_worker_ready "${WORKER_NAME}" 120; then +if worker_backend_wait_ready "${WORKER_NAME}" 120; then WORKER_STATUS="ready" log "CoPaw Worker is ready!" else diff --git a/manager/scripts/lib/container-api.sh b/manager/scripts/lib/container-api.sh index 7b15c9b6..3f5d3316 100755 --- a/manager/scripts/lib/container-api.sh +++ b/manager/scripts/lib/container-api.sh @@ -160,70 +160,38 @@ container_exec_worker() { return 0 } -# Wait for OpenClaw Worker to become ready -container_wait_worker_ready() { - local worker_name="$1" - local timeout="${2:-120}" - local elapsed=0 - - _log "Waiting for Worker ${worker_name} to be ready (timeout: ${timeout}s)..." - - while [ "${elapsed}" -lt "${timeout}" ]; do - local cstatus - cstatus=$(container_status_worker "${worker_name}") - if [ "${cstatus}" != "running" ]; then - _log "Worker container ${worker_name} stopped unexpectedly (status: ${cstatus})" - return 1 - fi - - if container_exec_worker "${worker_name}" openclaw gateway health --json 2>/dev/null \ - | grep -q '"ok"' 2>/dev/null; then - _log "Worker ${worker_name} is ready!" - return 0 - fi - - sleep 5 - elapsed=$((elapsed + 5)) - _log "Waiting for Worker ${worker_name}... 
(${elapsed}s/${timeout}s)" - done - - _log "Worker ${worker_name} did not become ready within ${timeout}s" - return 1 +# Get the Manager container's own IP (for Worker to connect back) +container_get_manager_ip() { + hostname -I 2>/dev/null | awk '{print $1}' } -# Wait for CoPaw Worker to become ready -container_wait_copaw_worker_ready() { +# Wait for a worker to report ready via orchestrator. +# Usage: worker_backend_wait_ready [timeout_seconds] +worker_backend_wait_ready() { local worker_name="$1" local timeout="${2:-120}" local elapsed=0 - local config_file="/root/.copaw-worker/${worker_name}/.copaw/config.json" - _log "Waiting for CoPaw Worker ${worker_name} to be ready (timeout: ${timeout}s)..." + _log "Waiting for Worker ${worker_name} to be ready (timeout: ${timeout}s)..." while [ "${elapsed}" -lt "${timeout}" ]; do - local cstatus - cstatus=$(container_status_worker "${worker_name}") - if [ "${cstatus}" != "running" ]; then - _log "CoPaw Worker container ${worker_name} stopped unexpectedly (status: ${cstatus})" - return 1 - fi - - if container_exec_worker "${worker_name}" cat "${config_file}" 2>/dev/null \ - | grep -q '"channels"' 2>/dev/null; then - _log "CoPaw Worker ${worker_name} is ready!" - return 0 - fi - + local status + status=$(worker_backend_status "${worker_name}") + case "${status}" in + ready) + _log "Worker ${worker_name} is ready!" + return 0 + ;; + not_found|stopped|unknown) + _log "Worker ${worker_name} status: ${status} — aborting wait" + return 1 + ;; + esac sleep 5 elapsed=$((elapsed + 5)) - _log "Waiting for CoPaw Worker ${worker_name}... (${elapsed}s/${timeout}s)" + _log "Waiting for Worker ${worker_name}... 
(${elapsed}s/${timeout}s, status=${status})" done - _log "CoPaw Worker ${worker_name} did not become ready within ${timeout}s" + _log "Worker ${worker_name} did not become ready within ${timeout}s" return 1 } - -# Get the Manager container's own IP (for Worker to connect back) -container_get_manager_ip() { - hostname -I 2>/dev/null | awk '{print $1}' -} diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index 1c86343d..86a0d988 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -5,6 +5,7 @@ import ( "errors" "log" "net/http" + "sync" "github.com/alibaba/hiclaw/orchestrator/auth" "github.com/alibaba/hiclaw/orchestrator/backend" @@ -16,11 +17,20 @@ type WorkerHandler struct { registry *backend.Registry keyStore *auth.KeyStore orchestratorURL string + + // Readiness tracking — workers report ready via POST /workers/{name}/ready + readyMu sync.RWMutex + ready map[string]bool } // NewWorkerHandler creates a WorkerHandler. func NewWorkerHandler(registry *backend.Registry, keyStore *auth.KeyStore, orchestratorURL string) *WorkerHandler { - return &WorkerHandler{registry: registry, keyStore: keyStore, orchestratorURL: orchestratorURL} + return &WorkerHandler{ + registry: registry, + keyStore: keyStore, + orchestratorURL: orchestratorURL, + ready: make(map[string]bool), + } } // Create handles POST /workers. 
@@ -54,6 +64,9 @@ func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { } } + // Clear any stale readiness state + h.setReady(req.Name, false) + result, err := b.Create(r.Context(), backend.CreateRequest{ Name: req.Name, Image: req.Image, @@ -94,7 +107,9 @@ func (h *WorkerHandler) List(w http.ResponseWriter, r *http.Request) { workers := make([]WorkerResponse, 0, len(results)) for _, r := range results { - workers = append(workers, toWorkerResponse(&r)) + resp := toWorkerResponse(&r) + resp.Status = h.mergeReadiness(r.Name, resp.Status) + workers = append(workers, resp) } httputil.WriteJSON(w, http.StatusOK, WorkerListResponse{Workers: workers}) } @@ -120,7 +135,31 @@ func (h *WorkerHandler) Status(w http.ResponseWriter, r *http.Request) { return } - httputil.WriteJSON(w, http.StatusOK, toWorkerResponse(result)) + resp := toWorkerResponse(result) + resp.Status = h.mergeReadiness(name, resp.Status) + httputil.WriteJSON(w, http.StatusOK, resp) +} + +// Ready handles POST /workers/{name}/ready — worker reports itself as ready. +func (h *WorkerHandler) Ready(w http.ResponseWriter, r *http.Request) { + name := r.PathValue("name") + if name == "" { + httputil.WriteError(w, http.StatusBadRequest, "worker name is required") + return + } + + // Verify the caller is the worker itself. + // When auth is disabled (local mode), caller is nil — allow any caller + // since the network is trusted (Docker bridge). + caller := auth.CallerFromContext(r.Context()) + if caller != nil && caller.WorkerName != name { + httputil.WriteError(w, http.StatusForbidden, "workers can only report their own readiness") + return + } + + h.setReady(name, true) + log.Printf("[READY] Worker %s reported ready", name) + w.WriteHeader(http.StatusNoContent) } // Start handles POST /workers/{name}/start. 
@@ -137,6 +176,9 @@ func (h *WorkerHandler) Start(w http.ResponseWriter, r *http.Request) { return } + // Clear readiness on restart + h.setReady(name, false) + if err := b.Start(r.Context(), name); err != nil { log.Printf("[ERROR] start worker %s: %v", name, err) writeBackendError(w, err) @@ -160,6 +202,9 @@ func (h *WorkerHandler) Stop(w http.ResponseWriter, r *http.Request) { return } + // Clear readiness on stop + h.setReady(name, false) + if err := b.Stop(r.Context(), name); err != nil { log.Printf("[ERROR] stop worker %s: %v", name, err) writeBackendError(w, err) @@ -192,10 +237,39 @@ func (h *WorkerHandler) Delete(w http.ResponseWriter, r *http.Request) { if h.keyStore != nil { h.keyStore.RemoveWorkerKey(name) } + h.setReady(name, false) w.WriteHeader(http.StatusNoContent) } +// --- readiness helpers --- + +func (h *WorkerHandler) setReady(name string, ready bool) { + h.readyMu.Lock() + defer h.readyMu.Unlock() + if ready { + h.ready[name] = true + } else { + delete(h.ready, name) + } +} + +func (h *WorkerHandler) isReady(name string) bool { + h.readyMu.RLock() + defer h.readyMu.RUnlock() + return h.ready[name] +} + +// mergeReadiness upgrades "running" to "ready" if the worker has reported ready. 
+func (h *WorkerHandler) mergeReadiness(name string, status backend.WorkerStatus) backend.WorkerStatus { + if status == backend.StatusRunning && h.isReady(name) { + return backend.StatusReady + } + return status +} + +// --- response helpers --- + func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { return WorkerResponse{ Name: r.Name, diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go index 672f160f..bc71243c 100644 --- a/orchestrator/api/worker_handler_test.go +++ b/orchestrator/api/worker_handler_test.go @@ -108,6 +108,7 @@ func setupHandler(mb *mockBackend) (*WorkerHandler, *http.ServeMux) { mux.HandleFunc("POST /workers", h.Create) mux.HandleFunc("GET /workers", h.List) mux.HandleFunc("GET /workers/{name}", h.Status) + mux.HandleFunc("POST /workers/{name}/ready", h.Ready) mux.HandleFunc("POST /workers/{name}/start", h.Start) mux.HandleFunc("POST /workers/{name}/stop", h.Stop) mux.HandleFunc("DELETE /workers/{name}", h.Delete) @@ -379,3 +380,130 @@ func TestGatewayNoBackend(t *testing.T) { } } } + +// --- Readiness tests --- + +func TestReadyEndpoint(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "alice", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + // Status should be "running" before ready report + req = httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running before ready, got %s", resp.Status) + } + + // Report ready + req = httptest.NewRequest(http.MethodPost, "/workers/alice/ready", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusNoContent { + t.Errorf("ready: expected 204, got %d", 
w.Code) + } + + // Status should now be "ready" + req = httptest.NewRequest(http.MethodGet, "/workers/alice", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusReady { + t.Errorf("expected ready after report, got %s", resp.Status) + } +} + +func TestReadyOnlyUpgradesRunning(t *testing.T) { + mb := newMockBackend() + mb.workers["bob"] = &backend.WorkerResult{ + Name: "bob", Backend: "mock", Status: backend.StatusStopped, + } + h, mux := setupHandler(mb) + h.setReady("bob", true) + + req := httptest.NewRequest(http.MethodGet, "/workers/bob", nil) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusStopped { + t.Errorf("expected stopped (ready should not upgrade non-running), got %s", resp.Status) + } +} + +func TestReadyClearedOnStop(t *testing.T) { + mb := newMockBackend() + _, mux := setupHandler(mb) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "carol", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/ready", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/stop", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodPost, "/workers/carol/start", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodGet, "/workers/carol", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running after stop+start (readiness cleared), got %s", resp.Status) + } +} + +func TestReadyClearedOnCreate(t *testing.T) { + mb := 
newMockBackend() + h, mux := setupHandler(mb) + h.setReady("dave", true) + + body, _ := json.Marshal(CreateWorkerRequest{Name: "dave", Image: "img:latest"}) + req := httptest.NewRequest(http.MethodPost, "/workers", bytes.NewReader(body)) + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + + req = httptest.NewRequest(http.MethodGet, "/workers/dave", nil) + w = httptest.NewRecorder() + mux.ServeHTTP(w, req) + var resp WorkerResponse + json.NewDecoder(w.Body).Decode(&resp) + if resp.Status != backend.StatusRunning { + t.Errorf("expected running (stale readiness cleared on create), got %s", resp.Status) + } +} + +func TestReadyForbiddenCrossWorker(t *testing.T) { + mb := newMockBackend() + h, _ := setupHandler(mb) + mux := http.NewServeMux() + mux.HandleFunc("POST /workers/{name}/ready", h.Ready) + + req := httptest.NewRequest(http.MethodPost, "/workers/bob/ready", nil) + ctx := context.WithValue(req.Context(), auth.CallerKeyForTest(), &auth.CallerIdentity{ + Role: auth.RoleWorker, WorkerName: "alice", + }) + req = req.WithContext(ctx) + + w := httptest.NewRecorder() + mux.ServeHTTP(w, req) + if w.Code != http.StatusForbidden { + t.Errorf("expected 403 for cross-worker ready report, got %d", w.Code) + } +} diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go index 4bc53562..8671b235 100644 --- a/orchestrator/backend/backend.go +++ b/orchestrator/backend/backend.go @@ -16,6 +16,7 @@ type WorkerStatus string const ( StatusRunning WorkerStatus = "running" + StatusReady WorkerStatus = "ready" StatusStopped WorkerStatus = "stopped" StatusStarting WorkerStatus = "starting" StatusNotFound WorkerStatus = "not_found" diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go index d543f0b8..02a4daaa 100644 --- a/orchestrator/backend/sae.go +++ b/orchestrator/backend/sae.go @@ -6,6 +6,7 @@ import ( "fmt" "log" "strings" + "time" openapi "github.com/alibabacloud-go/darabonba-openapi/v2/client" sae 
"github.com/alibabacloud-go/sae-20190506/v4/client" @@ -100,7 +101,7 @@ func (s *SAEBackend) Available(_ context.Context) bool { return IsAliyunRuntime() && s.config.WorkerImage != "" } -func (s *SAEBackend) Create(_ context.Context, req CreateRequest) (*WorkerResult, error) { +func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { appName := s.containerPrefix + req.Name // Check if already exists @@ -155,8 +156,46 @@ func (s *SAEBackend) Create(_ context.Context, req CreateRequest) (*WorkerResult appID = *resp.Body.Data.AppId } - log.Printf("[SAE] Created application %s (%s)", appName, appID) + log.Printf("[SAE] Created application %s (%s), waiting for RUNNING...", appName, appID) + + // Poll DescribeApplicationStatus until RUNNING (max 120s) + for elapsed := 0; elapsed < 120; elapsed += 5 { + statusReq := &sae.DescribeApplicationStatusRequest{} + statusReq.SetAppId(appID) + statusResp, err := s.client.DescribeApplicationStatus(statusReq) + if err == nil && statusResp.Body != nil && statusResp.Body.Data != nil && + statusResp.Body.Data.CurrentStatus != nil { + current := *statusResp.Body.Data.CurrentStatus + if current == "RUNNING" { + log.Printf("[SAE] Application %s is RUNNING", appName) + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + Status: StatusRunning, + AppID: appID, + RawStatus: "RUNNING", + }, nil + } + if strings.Contains(current, "FAILED") { + return nil, fmt.Errorf("SAE application %s entered failed state: %s", appName, current) + } + log.Printf("[SAE] Application %s status: %s (%ds)", appName, current, elapsed) + } else if err != nil { + log.Printf("[SAE] DescribeApplicationStatus error for %s: %v", appName, err) + } + select { + case <-ctx.Done(): + return &WorkerResult{ + Name: req.Name, + Backend: "sae", + Status: StatusStarting, + AppID: appID, + }, nil + case <-time.After(5 * time.Second): + } + } + log.Printf("[SAE] Application %s did not reach RUNNING within 120s", appName) return 
&WorkerResult{ Name: req.Name, Backend: "sae", diff --git a/orchestrator/backend/sae_test.go b/orchestrator/backend/sae_test.go index 32950f67..d8d5d093 100644 --- a/orchestrator/backend/sae_test.go +++ b/orchestrator/backend/sae_test.go @@ -32,7 +32,7 @@ func (m *mockSAEClient) CreateApplication(req *sae.CreateApplicationRequest) (*s appID := "app-" + name m.apps[name] = &mockSAEApp{ appID: appID, - status: "DEPLOYING", + status: "RUNNING", envs: tea.StringValue(req.Envs), } return &sae.CreateApplicationResponse{ diff --git a/orchestrator/main.go b/orchestrator/main.go index 7d2d8ce1..40bae092 100644 --- a/orchestrator/main.go +++ b/orchestrator/main.go @@ -98,6 +98,9 @@ func main() { mux.Handle("POST /workers/{name}/stop", authMw.RequireManager(http.HandlerFunc(workerHandler.Stop))) mux.Handle("DELETE /workers/{name}", authMw.RequireManager(http.HandlerFunc(workerHandler.Delete))) + // Worker readiness — workers report themselves as ready + mux.Handle("POST /workers/{name}/ready", authMw.RequireWorker(http.HandlerFunc(workerHandler.Ready))) + // Gateway API — manager only mux.Handle("POST /gateway/consumers", authMw.RequireManager(http.HandlerFunc(gatewayHandler.CreateConsumer))) mux.Handle("POST /gateway/consumers/{id}/bind", authMw.RequireManager(http.HandlerFunc(gatewayHandler.BindConsumer))) diff --git a/worker/scripts/worker-entrypoint.sh b/worker/scripts/worker-entrypoint.sh index 5254e376..25c5a557 100755 --- a/worker/scripts/worker-entrypoint.sh +++ b/worker/scripts/worker-entrypoint.sh @@ -271,4 +271,30 @@ else log "No Matrix password found in MinIO, skipping re-login (E2EE may not work after restart)" fi +# ============================================================ +# Step 5c: Background readiness reporter +# ============================================================ +# Poll local gateway health and report ready to orchestrator when healthy. 
+if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ] && [ -n "${HICLAW_WORKER_API_KEY:-}" ]; then + ( + TIMEOUT=120; ELAPSED=0 + while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do + if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then + for _attempt in 1 2 3; do + if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" 2>/dev/null; then + log "Reported ready to orchestrator" + exit 0 + fi + sleep 2 + done + log "WARNING: POST to orchestrator failed, will retry health check loop" + fi + sleep 5; ELAPSED=$((ELAPSED + 5)) + done + log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + ) & + log "Background readiness reporter started (PID: $!)" +fi + exec openclaw gateway run --verbose --force From 821cbf0f4fb4f79132bb45e4e23cfd238fb55bf4 Mon Sep 17 00:00:00 2001 From: jingze Date: Fri, 27 Mar 2026 10:00:47 +0800 Subject: [PATCH 07/11] refactor(orchestrator): abstract backend provider layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminate all backend-specific logic from handler and main layers: - Add NeedsCredentialInjection() to WorkerBackend interface - Move credential injection (API key, orchestrator URL, HICLAW_RUNTIME) into SAEBackend.Create() — handler no longer checks b.Name() - Replace cfg.Runtime == "aliyun" checks with config-driven backend registration (buildBackends function) - Delete IsAliyunRuntime() global function - Delete Config.Runtime field - Backend Available() now checks own config, not global env var "aliyun" string now only exists inside sae.go (backend internal). Handler and main layers are fully backend-agnostic. 
Co-Authored-By: Claude Opus 4.6 --- Makefile | 5 +- copaw/scripts/copaw-worker-entrypoint.sh | 9 +- .../scripts/create-worker.sh | 21 +++- orchestrator/api/worker_handler.go | 33 +++--- orchestrator/api/worker_handler_test.go | 1 + orchestrator/backend/apig.go | 2 +- orchestrator/backend/backend.go | 20 ++++ orchestrator/backend/docker.go | 50 +++++++-- orchestrator/backend/docker_test.go | 14 +-- orchestrator/backend/registry.go | 22 +--- orchestrator/backend/registry_test.go | 1 + orchestrator/backend/sae.go | 33 +++--- orchestrator/backend/sae_test.go | 32 ++++++ orchestrator/config.go | 14 ++- orchestrator/main.go | 102 +++++++++--------- worker/scripts/worker-entrypoint.sh | 8 +- 16 files changed, 241 insertions(+), 126 deletions(-) diff --git a/Makefile b/Makefile index cb1dbeb4..95024243 100644 --- a/Makefile +++ b/Makefile @@ -95,7 +95,7 @@ LINES ?= 50 # ---------- Phony targets ---------- -.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-worker build-copaw-worker build-orchestrator \ +.PHONY: all build build-openclaw-base build-hiclaw-controller build-manager build-manager-aliyun build-worker build-copaw-worker build-orchestrator build-docker-proxy \ tag push push-openclaw-base push-hiclaw-controller push-manager push-manager-aliyun push-worker push-copaw-worker push-orchestrator \ push-native push-native-manager push-native-worker push-native-copaw-worker \ buildx-setup \ @@ -164,6 +164,8 @@ build-orchestrator: ## Build Orchestrator image -t $(LOCAL_ORCHESTRATOR) \ ./orchestrator/ +build-docker-proxy: build-orchestrator ## Backward-compatible alias + # ---------- Tag ---------- tag: build ## Tag images for registry push @@ -492,6 +494,7 @@ endif uninstall: ## Stop and remove Manager + all Worker containers @echo "==> Uninstalling HiClaw..." 
-docker stop hiclaw-manager 2>/dev/null && docker rm hiclaw-manager 2>/dev/null || true + -docker stop hiclaw-orchestrator 2>/dev/null && docker rm hiclaw-orchestrator 2>/dev/null || true @for c in $$(docker ps -a --filter "name=hiclaw-worker-" --format '{{.Names}}' 2>/dev/null); do \ echo " Removing Worker: $$c"; \ docker rm -f "$$c" 2>/dev/null || true; \ diff --git a/copaw/scripts/copaw-worker-entrypoint.sh b/copaw/scripts/copaw-worker-entrypoint.sh index ea7addcb..535c113d 100755 --- a/copaw/scripts/copaw-worker-entrypoint.sh +++ b/copaw/scripts/copaw-worker-entrypoint.sh @@ -65,7 +65,12 @@ ln -sfn "${WORKER_SKILLS_DIR}" "${HOME}/.agents/skills" # Background readiness reporter — report ready to orchestrator when CoPaw bridge completes _start_readiness_reporter() { - [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ] || [ -z "${HICLAW_WORKER_API_KEY:-}" ] && return 0 + [ -z "${HICLAW_ORCHESTRATOR_URL:-}" ] && return 0 + + # Build auth header if API key is available (cloud mode) + local auth_header="" + [ -n "${HICLAW_WORKER_API_KEY:-}" ] && auth_header="Authorization: Bearer ${HICLAW_WORKER_API_KEY}" + ( TIMEOUT=120; ELAPSED=0 CONFIG_FILE="${INSTALL_DIR}/${WORKER_NAME}/.copaw/config.json" @@ -73,7 +78,7 @@ _start_readiness_reporter() { if [ -f "${CONFIG_FILE}" ] && grep -q '"channels"' "${CONFIG_FILE}" 2>/dev/null; then for _attempt in 1 2 3; do if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ - -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" 2>/dev/null; then + ${auth_header:+-H "${auth_header}"} 2>/dev/null; then log "Reported ready to orchestrator" exit 0 fi diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 6f3d3a7b..27c3c1eb 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -773,6 +773,7 @@ elif container_api_available; then --arg fs_domain 
"${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ --arg fs_access_key "${WORKER_NAME}" \ --arg fs_secret_key "${WORKER_MINIO_PASSWORD}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ '{ "HICLAW_WORKER_NAME": $worker_name, "HICLAW_WORKER_GATEWAY_KEY": $worker_key, @@ -784,6 +785,7 @@ elif container_api_available; then "HICLAW_FS_ACCESS_KEY": $fs_access_key, "HICLAW_FS_SECRET_KEY": $fs_secret_key } + | if $orchestrator_url != "" then . + { "HICLAW_ORCHESTRATOR_URL": $orchestrator_url } else . end | if $oss_bucket != "" then . + { "HICLAW_OSS_BUCKET": $oss_bucket, "HICLAW_REGION": $region } else . end | if $skills_api_url != "" then . + { "SKILLS_API_URL": $skills_api_url } else . end | if $console_port != "" then . + { "HICLAW_CONSOLE_PORT": $console_port } else . end @@ -795,14 +797,31 @@ elif container_api_available; then } else . end') + # Build extra_hosts for local domains (map *-local.hiclaw.io to Manager IP) + MANAGER_IP=$(container_get_manager_ip) + EXTRA_HOSTS="[]" + if [ -z "${MANAGER_IP}" ]; then + log " WARNING: Could not detect Manager IP — worker may fail to resolve *-local.hiclaw.io domains" + fi + if [ -n "${MANAGER_IP}" ]; then + EXTRA_HOSTS=$(jq -cn --arg ip "${MANAGER_IP}" \ + --arg matrix "${HICLAW_MATRIX_DOMAIN%%:*}" \ + --arg matrix_client "${HICLAW_MATRIX_CLIENT_DOMAIN:-matrix-client-local.hiclaw.io}" \ + --arg aigw "${HICLAW_AI_GATEWAY_DOMAIN:-aigw-local.hiclaw.io}" \ + --arg fs "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + '[$matrix, $matrix_client, $aigw, $fs] | map(select(endswith("-local.hiclaw.io"))) | map(. + ":" + $ip)') + fi + # Build create request body CREATE_BODY=$(jq -cn \ --arg name "${WORKER_NAME}" \ --arg image "${CUSTOM_IMAGE:-}" \ --arg runtime "${WORKER_RUNTIME}" \ --argjson env "${WORKER_ENV}" \ + --argjson extra_hosts "${EXTRA_HOSTS}" \ '{name: $name, runtime: $runtime, env: $env} - | if $image != "" then . + {image: $image} else . end') + | if $image != "" then . + {image: $image} else . 
end + | if ($extra_hosts | length) > 0 then . + {extra_hosts: $extra_hosts} else . end') CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true log " Create response: ${CREATE_OUTPUT:0:300}" diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index 86a0d988..95c56a58 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -3,6 +3,7 @@ package api import ( "encoding/json" "errors" + "fmt" "log" "net/http" "sync" @@ -44,6 +45,11 @@ func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { httputil.WriteError(w, http.StatusBadRequest, "name is required") return } + if !backend.ValidRuntime(req.Runtime) { + httputil.WriteError(w, http.StatusBadRequest, + fmt.Sprintf("invalid runtime %q, supported: openclaw, copaw", req.Runtime)) + return + } b, err := h.registry.GetWorkerBackend(r.Context(), req.Backend) if err != nil { @@ -51,30 +57,25 @@ func (h *WorkerHandler) Create(w http.ResponseWriter, r *http.Request) { return } - // For SAE backend: generate per-worker API key and inject into env + // Generate API key for backends that need orchestrator-mediated credentials var apiKey string - if b.Name() == "sae" && h.keyStore != nil && h.keyStore.AuthEnabled() { + if b.NeedsCredentialInjection() && h.keyStore != nil && h.keyStore.AuthEnabled() { apiKey = h.keyStore.GenerateWorkerKey(req.Name) - if req.Env == nil { - req.Env = make(map[string]string) - } - req.Env["HICLAW_WORKER_API_KEY"] = apiKey - if h.orchestratorURL != "" { - req.Env["HICLAW_ORCHESTRATOR_URL"] = h.orchestratorURL - } } // Clear any stale readiness state h.setReady(req.Name, false) result, err := b.Create(r.Context(), backend.CreateRequest{ - Name: req.Name, - Image: req.Image, - Runtime: req.Runtime, - Env: req.Env, - Network: req.Network, - ExtraHosts: req.ExtraHosts, - WorkingDir: req.WorkingDir, + Name: req.Name, + Image: req.Image, + Runtime: req.Runtime, + Env: req.Env, + Network: req.Network, + 
ExtraHosts: req.ExtraHosts, + WorkingDir: req.WorkingDir, + OrchestratorURL: h.orchestratorURL, + WorkerAPIKey: apiKey, }) if err != nil { log.Printf("[ERROR] create worker %s: %v", req.Name, err) diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go index bc71243c..f9c3ef6a 100644 --- a/orchestrator/api/worker_handler_test.go +++ b/orchestrator/api/worker_handler_test.go @@ -35,6 +35,7 @@ func newMockBackend() *mockBackend { func (m *mockBackend) Name() string { return m.name } func (m *mockBackend) Available(_ context.Context) bool { return m.available } +func (m *mockBackend) NeedsCredentialInjection() bool { return false } func (m *mockBackend) Create(_ context.Context, req backend.CreateRequest) (*backend.WorkerResult, error) { if m.createErr != nil { diff --git a/orchestrator/backend/apig.go b/orchestrator/backend/apig.go index 7cb0d436..6f2a79b6 100644 --- a/orchestrator/backend/apig.go +++ b/orchestrator/backend/apig.go @@ -64,7 +64,7 @@ func NewAPIGBackendWithClient(client APIGClient, config APIGConfig) *APIGBackend func (a *APIGBackend) Name() string { return "apig" } func (a *APIGBackend) Available(_ context.Context) bool { - return IsAliyunRuntime() && a.config.GatewayID != "" + return a.config.GatewayID != "" } func (a *APIGBackend) CreateConsumer(_ context.Context, req ConsumerRequest) (*ConsumerResult, error) { diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go index 8671b235..ae2e822d 100644 --- a/orchestrator/backend/backend.go +++ b/orchestrator/backend/backend.go @@ -23,6 +23,18 @@ const ( StatusUnknown WorkerStatus = "unknown" ) +// Supported worker runtimes. +const ( + RuntimeOpenClaw = "openclaw" + RuntimeCopaw = "copaw" +) + +// ValidRuntime reports whether r is a recognized runtime value. +// An empty string is valid — backends resolve it to the default image. 
+func ValidRuntime(r string) bool { + return r == "" || r == RuntimeOpenClaw || r == RuntimeCopaw +} + // CreateRequest holds parameters for creating a worker container/instance. type CreateRequest struct { Name string `json:"name"` @@ -32,6 +44,10 @@ type CreateRequest struct { Network string `json:"network,omitempty"` ExtraHosts []string `json:"extra_hosts,omitempty"` WorkingDir string `json:"working_dir,omitempty"` + + // Credential injection — set by handler, backends that need it will inject into env. + OrchestratorURL string `json:"-"` + WorkerAPIKey string `json:"-"` } // WorkerResult holds the result of a worker operation. @@ -53,6 +69,10 @@ type WorkerBackend interface { // Available reports whether this backend is usable in the current environment. Available(ctx context.Context) bool + // NeedsCredentialInjection reports whether this backend requires + // orchestrator-mediated credentials (API key + URL) injected into worker env. + NeedsCredentialInjection() bool + // Create creates and starts a new worker. Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) diff --git a/orchestrator/backend/docker.go b/orchestrator/backend/docker.go index 7fd6b272..69f74a30 100644 --- a/orchestrator/backend/docker.go +++ b/orchestrator/backend/docker.go @@ -8,42 +8,53 @@ import ( "net" "net/http" "net/url" + "os" "sort" "strings" "time" ) +// DockerConfig holds Docker backend configuration. +type DockerConfig struct { + SocketPath string + WorkerImage string // default worker image (HICLAW_WORKER_IMAGE) + CopawWorkerImage string // default copaw worker image (HICLAW_COPAW_WORKER_IMAGE) + DefaultNetwork string // default Docker network (default "hiclaw-net") +} + // DockerBackend manages worker containers via the Docker Engine API over a Unix socket. type DockerBackend struct { - socketPath string + config DockerConfig client *http.Client containerPrefix string } // NewDockerBackend creates a DockerBackend that talks to the given Docker socket. 
-func NewDockerBackend(socketPath string, containerPrefix string) *DockerBackend { +func NewDockerBackend(config DockerConfig, containerPrefix string) *DockerBackend { if containerPrefix == "" { - containerPrefix = "hiclaw-worker-" + containerPrefix = DefaultContainerPrefix } transport := &http.Transport{ DialContext: func(_ context.Context, _, _ string) (net.Conn, error) { - return net.Dial("unix", socketPath) + return net.Dial("unix", config.SocketPath) }, } return &DockerBackend{ - socketPath: socketPath, + config: config, client: &http.Client{Transport: transport}, containerPrefix: containerPrefix, } } -func (d *DockerBackend) Name() string { return "docker" } +func (d *DockerBackend) Name() string { return "docker" } +func (d *DockerBackend) NeedsCredentialInjection() bool { return false } func (d *DockerBackend) Available(ctx context.Context) bool { - if !DockerSocketAvailable(d.socketPath) { + // Check socket file exists + if _, err := os.Stat(d.config.SocketPath); err != nil { return false } - // Ping the Docker daemon to verify it's actually responding. 
+ // Ping the Docker daemon pingCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() req, err := http.NewRequestWithContext(pingCtx, http.MethodGet, "http://localhost/_ping", nil) @@ -61,6 +72,29 @@ func (d *DockerBackend) Available(ctx context.Context) bool { func (d *DockerBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { containerName := d.containerPrefix + req.Name + // Default image fallback + image := req.Image + if image == "" { + if req.Runtime == RuntimeCopaw && d.config.CopawWorkerImage != "" { + image = d.config.CopawWorkerImage + } else { + image = d.config.WorkerImage + } + } + req.Image = image + + // Default network fallback + if req.Network == "" && d.config.DefaultNetwork != "" { + req.Network = d.config.DefaultNetwork + } + + // Infer WorkingDir from HOME env if not set + if req.WorkingDir == "" { + if home, ok := req.Env["HOME"]; ok { + req.WorkingDir = home + } + } + payload := d.buildCreatePayload(req) body, err := json.Marshal(payload) if err != nil { diff --git a/orchestrator/backend/docker_test.go b/orchestrator/backend/docker_test.go index 7b00665f..abc4c55e 100644 --- a/orchestrator/backend/docker_test.go +++ b/orchestrator/backend/docker_test.go @@ -119,14 +119,16 @@ func mockDockerAPI(t *testing.T) *httptest.Server { func newTestDockerBackend(t *testing.T, serverURL string) *DockerBackend { t.Helper() - client := &http.Client{} b := &DockerBackend{ - client: client, + config: DockerConfig{ + WorkerImage: "hiclaw/worker-agent:latest", + CopawWorkerImage: "hiclaw/copaw-worker:latest", + DefaultNetwork: "hiclaw-net", + }, containerPrefix: "hiclaw-worker-", - } - // Patch all requests to go to the test server instead of Unix socket - b.client = &http.Client{ - Transport: &testTransport{serverURL: serverURL}, + client: &http.Client{ + Transport: &testTransport{serverURL: serverURL}, + }, } return b } diff --git a/orchestrator/backend/registry.go b/orchestrator/backend/registry.go index 
7265981a..f7ae4542 100644 --- a/orchestrator/backend/registry.go +++ b/orchestrator/backend/registry.go @@ -3,10 +3,11 @@ package backend import ( "context" "fmt" - "log" - "os" ) +// DefaultContainerPrefix is the default prefix for worker container/app names. +const DefaultContainerPrefix = "hiclaw-worker-" + // Registry holds all available backends and provides auto-detection. type Registry struct { workerBackends []WorkerBackend @@ -22,14 +23,13 @@ func NewRegistry(workers []WorkerBackend, gateways []GatewayBackend) *Registry { } // DetectWorkerBackend returns the first available worker backend. -// Priority matches _detect_worker_backend() in container-api.sh: +// Priority is determined by registration order (set in main.go buildBackends): // 1. Docker backend (socket available) -// 2. SAE backend (HICLAW_RUNTIME=aliyun) +// 2. SAE backend (SAE worker image configured) // 3. nil func (r *Registry) DetectWorkerBackend(ctx context.Context) WorkerBackend { for _, b := range r.workerBackends { if b.Available(ctx) { - log.Printf("Auto-detected worker backend: %s", b.Name()) return b } } @@ -57,7 +57,6 @@ func (r *Registry) GetWorkerBackend(ctx context.Context, name string) (WorkerBac func (r *Registry) DetectGatewayBackend(ctx context.Context) GatewayBackend { for _, b := range r.gatewayBackends { if b.Available(ctx) { - log.Printf("Auto-detected gateway backend: %s", b.Name()) return b } } @@ -80,14 +79,3 @@ func (r *Registry) GetGatewayBackend(ctx context.Context, name string) (GatewayB } return nil, fmt.Errorf("unknown gateway backend: %q", name) } - -// DockerSocketAvailable checks if the Docker socket is accessible. -func DockerSocketAvailable(socketPath string) bool { - _, err := os.Stat(socketPath) - return err == nil -} - -// IsAliyunRuntime checks if HICLAW_RUNTIME is set to "aliyun". 
-func IsAliyunRuntime() bool { - return os.Getenv("HICLAW_RUNTIME") == "aliyun" -} diff --git a/orchestrator/backend/registry_test.go b/orchestrator/backend/registry_test.go index 5c7fde22..e7c3f87c 100644 --- a/orchestrator/backend/registry_test.go +++ b/orchestrator/backend/registry_test.go @@ -13,6 +13,7 @@ type mockWorkerBackend struct { func (m *mockWorkerBackend) Name() string { return m.name } func (m *mockWorkerBackend) Available(_ context.Context) bool { return m.available } +func (m *mockWorkerBackend) NeedsCredentialInjection() bool { return false } func (m *mockWorkerBackend) Create(_ context.Context, _ CreateRequest) (*WorkerResult, error) { return nil, nil } func (m *mockWorkerBackend) Delete(_ context.Context, _ string) error { return nil } func (m *mockWorkerBackend) Start(_ context.Context, _ string) error { return nil } diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go index 02a4daaa..339263c6 100644 --- a/orchestrator/backend/sae.go +++ b/orchestrator/backend/sae.go @@ -44,16 +44,6 @@ type SAEBackend struct { // NewSAEBackend creates a SAEBackend with auto-configured SDK client. func NewSAEBackend(creds CloudCredentialProvider, config SAEConfig, containerPrefix string) (*SAEBackend, error) { - if containerPrefix == "" { - containerPrefix = "hiclaw-worker-" - } - if config.CPU == 0 { - config.CPU = 1000 - } - if config.Memory == 0 { - config.Memory = 2048 - } - cred, err := creds.GetCredential() if err != nil { return nil, fmt.Errorf("build SAE credentials: %w", err) @@ -70,17 +60,13 @@ func NewSAEBackend(creds CloudCredentialProvider, config SAEConfig, containerPre return nil, fmt.Errorf("create SAE client: %w", err) } - return &SAEBackend{ - client: client, - config: config, - containerPrefix: containerPrefix, - }, nil + return NewSAEBackendWithClient(client, config, containerPrefix), nil } // NewSAEBackendWithClient creates a SAEBackend with a custom client (for testing). 
func NewSAEBackendWithClient(client SAEClient, config SAEConfig, containerPrefix string) *SAEBackend { if containerPrefix == "" { - containerPrefix = "hiclaw-worker-" + containerPrefix = DefaultContainerPrefix } if config.CPU == 0 { config.CPU = 1000 @@ -95,10 +81,11 @@ func NewSAEBackendWithClient(client SAEClient, config SAEConfig, containerPrefix } } -func (s *SAEBackend) Name() string { return "sae" } +func (s *SAEBackend) Name() string { return "sae" } +func (s *SAEBackend) NeedsCredentialInjection() bool { return true } func (s *SAEBackend) Available(_ context.Context) bool { - return IsAliyunRuntime() && s.config.WorkerImage != "" + return s.config.WorkerImage != "" && s.config.NamespaceID != "" } func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResult, error) { @@ -116,18 +103,24 @@ func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResu // Build env vars image := req.Image if image == "" { - if req.Runtime == "copaw" && s.config.CopawWorkerImage != "" { + if req.Runtime == RuntimeCopaw && s.config.CopawWorkerImage != "" { image = s.config.CopawWorkerImage } else { image = s.config.WorkerImage } } - // SAE backend auto-injects runtime identifier so workers know they're on cloud + // SAE backend auto-injects runtime and credentials into worker env if req.Env == nil { req.Env = make(map[string]string) } req.Env["HICLAW_RUNTIME"] = "aliyun" + if req.WorkerAPIKey != "" { + req.Env["HICLAW_WORKER_API_KEY"] = req.WorkerAPIKey + } + if req.OrchestratorURL != "" { + req.Env["HICLAW_ORCHESTRATOR_URL"] = req.OrchestratorURL + } envList := s.buildEnvList(req.Env) diff --git a/orchestrator/backend/sae_test.go b/orchestrator/backend/sae_test.go index d8d5d093..f6a4562f 100644 --- a/orchestrator/backend/sae_test.go +++ b/orchestrator/backend/sae_test.go @@ -3,6 +3,7 @@ package backend import ( "context" "fmt" + "strings" "testing" sae "github.com/alibabacloud-go/sae-20190506/v4/client" @@ -144,6 +145,37 @@ func 
TestSAECreate(t *testing.T) { } } +func TestSAECreateInjectsCredentials(t *testing.T) { + mock := newMockSAEClient() + b := newTestSAEBackend(mock) + + _, err := b.Create(context.Background(), CreateRequest{ + Name: "cred-test", + Image: "custom:v1", + Env: map[string]string{"KEY": "VAL"}, + WorkerAPIKey: "test-key-123", + OrchestratorURL: "http://orchestrator:2375", + }) + if err != nil { + t.Fatalf("Create failed: %v", err) + } + + app := mock.apps["hiclaw-worker-cred-test"] + if app == nil { + t.Fatal("expected app to exist") + } + envs := app.envs + if !strings.Contains(envs, "HICLAW_RUNTIME") { + t.Error("expected HICLAW_RUNTIME in env") + } + if !strings.Contains(envs, "test-key-123") { + t.Error("expected HICLAW_WORKER_API_KEY value in env") + } + if !strings.Contains(envs, "http://orchestrator:2375") { + t.Error("expected HICLAW_ORCHESTRATOR_URL value in env") + } +} + func TestSAECreateConflict(t *testing.T) { mock := newMockSAEClient() b := newTestSAEBackend(mock) diff --git a/orchestrator/config.go b/orchestrator/config.go index 3cc1541f..20e20ac6 100644 --- a/orchestrator/config.go +++ b/orchestrator/config.go @@ -16,8 +16,6 @@ type Config struct { SocketPath string // ContainerPrefix is the required prefix for worker container names (default "hiclaw-worker-"). ContainerPrefix string - // Runtime is the deployment runtime ("aliyun" for cloud, empty for local). 
- Runtime string // Auth ManagerAPIKey string // HICLAW_ORCHESTRATOR_API_KEY @@ -44,7 +42,7 @@ type Config struct { OIDCProviderArn string OIDCTokenFile string - // Orchestrator URL (advertised to SAE workers for STS refresh) + // Orchestrator URL (advertised to workers for STS refresh) OrchestratorURL string } @@ -54,7 +52,6 @@ func LoadConfig() *Config { ListenAddr: envOrDefault("HICLAW_PROXY_LISTEN", ":2375"), SocketPath: envOrDefault("HICLAW_PROXY_SOCKET", "/var/run/docker.sock"), ContainerPrefix: envOrDefault("HICLAW_PROXY_CONTAINER_PREFIX", "hiclaw-worker-"), - Runtime: os.Getenv("HICLAW_RUNTIME"), ManagerAPIKey: os.Getenv("HICLAW_ORCHESTRATOR_API_KEY"), @@ -81,6 +78,15 @@ func LoadConfig() *Config { } } +func (c *Config) DockerConfig() backend.DockerConfig { + return backend.DockerConfig{ + SocketPath: c.SocketPath, + WorkerImage: envOrDefault("HICLAW_WORKER_IMAGE", "hiclaw/worker-agent:latest"), + CopawWorkerImage: envOrDefault("HICLAW_COPAW_WORKER_IMAGE", "hiclaw/copaw-worker:latest"), + DefaultNetwork: envOrDefault("HICLAW_DOCKER_NETWORK", "hiclaw-net"), + } +} + func (c *Config) SAEConfig() backend.SAEConfig { return backend.SAEConfig{ Region: c.Region, diff --git a/orchestrator/main.go b/orchestrator/main.go index 40bae092..607efa8f 100644 --- a/orchestrator/main.go +++ b/orchestrator/main.go @@ -15,15 +15,13 @@ import ( func main() { cfg := LoadConfig() - // --- Cloud credentials (shared by SAE, APIG, STS, OSS key persistence) --- - var cloudCreds backend.CloudCredentialProvider - if cfg.Runtime == "aliyun" { - cloudCreds = backend.NewDefaultCloudCredentialProvider() - } + // --- Cloud credentials (shared by SAE, APIG, STS, OSS) --- + // Created once if any cloud config is present; nil otherwise. 
+ cloudCreds := buildCloudCredentials(cfg) // --- Auth --- var persister authpkg.KeyPersister - if cfg.Runtime == "aliyun" && cloudCreds != nil && cfg.OSSBucket != "" { + if cloudCreds != nil && cfg.OSSBucket != "" { cred, err := cloudCreds.GetCredential() if err != nil { log.Printf("[WARN] Failed to get credentials for key persistence: %v", err) @@ -40,45 +38,15 @@ func main() { // --- Security validator (for Docker API passthrough) --- validator := proxy.NewSecurityValidator() - - // --- Docker API passthrough handler --- proxyHandler := proxy.NewHandler(cfg.SocketPath, validator) - // --- Worker backends --- - var workerBackends []backend.WorkerBackend - - // Docker backend (always registered; Available() checks socket at runtime) - dockerBackend := backend.NewDockerBackend(cfg.SocketPath, cfg.ContainerPrefix) - workerBackends = append(workerBackends, dockerBackend) - - // SAE backend (cloud mode) - var saeBackend *backend.SAEBackend - if cfg.Runtime == "aliyun" && cloudCreds != nil { - var err error - saeBackend, err = backend.NewSAEBackend(cloudCreds, cfg.SAEConfig(), cfg.ContainerPrefix) - if err != nil { - log.Printf("[WARN] Failed to create SAE backend: %v", err) - } else { - workerBackends = append(workerBackends, saeBackend) - } - } - - // --- Gateway backends --- - var gatewayBackends []backend.GatewayBackend - if cfg.Runtime == "aliyun" && cloudCreds != nil { - apigBackend, err := backend.NewAPIGBackend(cloudCreds, cfg.APIGConfig()) - if err != nil { - log.Printf("[WARN] Failed to create APIG backend: %v", err) - } else { - gatewayBackends = append(gatewayBackends, apigBackend) - } - } - + // --- Backends (config-driven, no runtime string checks) --- + workerBackends, gatewayBackends := buildBackends(cfg, cloudCreds) registry := backend.NewRegistry(workerBackends, gatewayBackends) - // --- STS service --- + // --- STS service (enabled if OIDC token file is configured) --- var stsService *credentials.STSService - if cfg.Runtime == "aliyun" && 
cfg.OIDCTokenFile != "" { + if cfg.OIDCTokenFile != "" { stsService = credentials.NewSTSService(cfg.STSConfig()) } @@ -114,14 +82,8 @@ func main() { // --- Start server --- log.Printf("hiclaw-orchestrator listening on %s", cfg.ListenAddr) - if cfg.Runtime == "aliyun" { - log.Printf("Cloud mode: SAE=%v, APIG=%v, STS=%v", saeBackend != nil, len(gatewayBackends) > 0, stsService != nil) - } else { - log.Printf("Local mode: docker socket=%s", cfg.SocketPath) - } - if keyStore.AuthEnabled() { - log.Printf("Auth: enabled (manager key configured)") - } + log.Printf("Backends: workers=%d, gateways=%d, STS=%v, auth=%v", + len(workerBackends), len(gatewayBackends), stsService != nil, keyStore.AuthEnabled()) if len(validator.AllowedRegistries) > 0 { log.Printf("Allowed registries: %v", validator.AllowedRegistries) } @@ -129,3 +91,47 @@ func main() { log.Fatalf("Failed to start server: %v", err) } } + +// buildCloudCredentials creates a cloud credential provider if any cloud-related +// config is present (SAE image, APIG gateway, OIDC token, OSS bucket). +func buildCloudCredentials(cfg *Config) backend.CloudCredentialProvider { + if cfg.SAEWorkerImage != "" || cfg.GWGatewayID != "" || cfg.OIDCTokenFile != "" || cfg.OSSBucket != "" { + return backend.NewDefaultCloudCredentialProvider() + } + return nil +} + +// buildBackends creates all worker and gateway backends based on config. +// Each backend is registered if its required config is present. 
+func buildBackends(cfg *Config, cloudCreds backend.CloudCredentialProvider) ([]backend.WorkerBackend, []backend.GatewayBackend) { + var workers []backend.WorkerBackend + var gateways []backend.GatewayBackend + + // Docker backend — always registered; Available() checks socket at runtime + workers = append(workers, backend.NewDockerBackend(cfg.DockerConfig(), cfg.ContainerPrefix)) + + // SAE backend — registered if worker image is configured + if cfg.SAEWorkerImage != "" && cloudCreds != nil { + sae, err := backend.NewSAEBackend(cloudCreds, cfg.SAEConfig(), cfg.ContainerPrefix) + if err != nil { + log.Printf("[WARN] Failed to create SAE backend: %v", err) + } else { + workers = append(workers, sae) + } + } + + // APIG gateway backend — registered if gateway ID is configured + if cfg.GWGatewayID != "" && cloudCreds != nil { + apig, err := backend.NewAPIGBackend(cloudCreds, cfg.APIGConfig()) + if err != nil { + log.Printf("[WARN] Failed to create APIG backend: %v", err) + } else { + gateways = append(gateways, apig) + } + } + + // Future: K8s backend + // if cfg.K8sKubeconfig != "" { workers = append(workers, backend.NewK8sBackend(...)) } + + return workers, gateways +} diff --git a/worker/scripts/worker-entrypoint.sh b/worker/scripts/worker-entrypoint.sh index 25c5a557..8e876eaa 100755 --- a/worker/scripts/worker-entrypoint.sh +++ b/worker/scripts/worker-entrypoint.sh @@ -275,14 +275,18 @@ fi # Step 5c: Background readiness reporter # ============================================================ # Poll local gateway health and report ready to orchestrator when healthy. 
-if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ] && [ -n "${HICLAW_WORKER_API_KEY:-}" ]; then +if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ]; then ( + # Build auth header if API key is available (cloud mode) + AUTH_HEADER="" + [ -n "${HICLAW_WORKER_API_KEY:-}" ] && AUTH_HEADER="Authorization: Bearer ${HICLAW_WORKER_API_KEY}" + TIMEOUT=120; ELAPSED=0 while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then for _attempt in 1 2 3; do if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ - -H "Authorization: Bearer ${HICLAW_WORKER_API_KEY}" 2>/dev/null; then + ${AUTH_HEADER:+-H "${AUTH_HEADER}"} 2>/dev/null; then log "Reported ready to orchestrator" exit 0 fi From 1e5b19411119969fc5f67d26aa354c4fd515679f Mon Sep 17 00:00:00 2001 From: jingze Date: Fri, 27 Mar 2026 16:08:51 +0800 Subject: [PATCH 08/11] refactor(orchestrator): add deployment_mode to API response MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each backend now declares its deployment mode ("local" or "cloud") via the DeploymentMode() interface method. The API response includes a new deployment_mode field, eliminating the backend-name-to-mode translation in create-worker.sh (5 lines → 1 line). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../scripts/create-worker.sh | 7 +-- orchestrator/api/types.go | 15 ++--- orchestrator/api/worker_handler.go | 13 +++-- orchestrator/api/worker_handler_test.go | 15 +++-- orchestrator/backend/backend.go | 22 ++++++-- orchestrator/backend/docker.go | 41 ++++++++------ orchestrator/backend/docker_test.go | 3 + orchestrator/backend/registry_test.go | 1 + orchestrator/backend/sae.go | 55 +++++++++++-------- 9 files changed, 100 insertions(+), 72 deletions(-) diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 27c3c1eb..833b14b8 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -830,12 +830,7 @@ elif container_api_available; then CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) if [ "${CREATE_STATUS}" = "running" ] || [ "${CREATE_STATUS}" = "starting" ]; then - DEPLOY_MODE=$(echo "${CREATE_OUTPUT}" | jq -r '.backend // "local"' 2>/dev/null) - if [ "${DEPLOY_MODE}" = "docker" ]; then - DEPLOY_MODE="local" - elif [ "${DEPLOY_MODE}" = "sae" ]; then - DEPLOY_MODE="cloud" - fi + DEPLOY_MODE=$(echo "${CREATE_OUTPUT}" | jq -r '.deployment_mode // "local"' 2>/dev/null) # Wait for worker to report ready (unified — works for both Docker and SAE) log " Waiting for Worker agent to be ready..." diff --git a/orchestrator/api/types.go b/orchestrator/api/types.go index c6a4a44e..9b1ef06f 100644 --- a/orchestrator/api/types.go +++ b/orchestrator/api/types.go @@ -18,13 +18,14 @@ type CreateWorkerRequest struct { // WorkerResponse is the JSON response for worker operations. 
type WorkerResponse struct { - Name string `json:"name"` - Backend string `json:"backend"` - Status backend.WorkerStatus `json:"status"` - ContainerID string `json:"container_id,omitempty"` - AppID string `json:"app_id,omitempty"` - RawStatus string `json:"raw_status,omitempty"` - APIKey string `json:"api_key,omitempty"` + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status backend.WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` + APIKey string `json:"api_key,omitempty"` } // WorkerListResponse is the JSON response for GET /workers. diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index 95c56a58..3298c73f 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -273,12 +273,13 @@ func (h *WorkerHandler) mergeReadiness(name string, status backend.WorkerStatus) func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { return WorkerResponse{ - Name: r.Name, - Backend: r.Backend, - Status: r.Status, - ContainerID: r.ContainerID, - AppID: r.AppID, - RawStatus: r.RawStatus, + Name: r.Name, + Backend: r.Backend, + DeploymentMode: r.DeploymentMode, + Status: r.Status, + ContainerID: r.ContainerID, + AppID: r.AppID, + RawStatus: r.RawStatus, } } diff --git a/orchestrator/api/worker_handler_test.go b/orchestrator/api/worker_handler_test.go index f9c3ef6a..a797d0e9 100644 --- a/orchestrator/api/worker_handler_test.go +++ b/orchestrator/api/worker_handler_test.go @@ -34,6 +34,7 @@ func newMockBackend() *mockBackend { } func (m *mockBackend) Name() string { return m.name } +func (m *mockBackend) DeploymentMode() string { return backend.DeployLocal } func (m *mockBackend) Available(_ context.Context) bool { return m.available } func (m *mockBackend) NeedsCredentialInjection() bool { return false } @@ -42,11 +43,12 @@ 
func (m *mockBackend) Create(_ context.Context, req backend.CreateRequest) (*bac return nil, m.createErr } r := &backend.WorkerResult{ - Name: req.Name, - Backend: "mock", - Status: backend.StatusRunning, - ContainerID: "mock-" + req.Name, - RawStatus: "running", + Name: req.Name, + Backend: "mock", + DeploymentMode: backend.DeployLocal, + Status: backend.StatusRunning, + ContainerID: "mock-" + req.Name, + RawStatus: "running", } m.workers[req.Name] = r return r, nil @@ -143,6 +145,9 @@ func TestCreateWorker(t *testing.T) { if resp.Backend != "mock" { t.Errorf("expected backend mock, got %s", resp.Backend) } + if resp.DeploymentMode != backend.DeployLocal { + t.Errorf("expected deployment_mode local, got %s", resp.DeploymentMode) + } } func TestCreateWorkerMissingName(t *testing.T) { diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go index ae2e822d..3853c2f3 100644 --- a/orchestrator/backend/backend.go +++ b/orchestrator/backend/backend.go @@ -50,14 +50,21 @@ type CreateRequest struct { WorkerAPIKey string `json:"-"` } +// Deployment modes returned by backends. +const ( + DeployLocal = "local" + DeployCloud = "cloud" +) + // WorkerResult holds the result of a worker operation. type WorkerResult struct { - Name string `json:"name"` - Backend string `json:"backend"` - Status WorkerStatus `json:"status"` - ContainerID string `json:"container_id,omitempty"` - AppID string `json:"app_id,omitempty"` - RawStatus string `json:"raw_status,omitempty"` + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` } // WorkerBackend defines the interface for worker lifecycle operations. @@ -66,6 +73,9 @@ type WorkerBackend interface { // Name returns the backend identifier (e.g. "docker", "sae"). 
Name() string + // DeploymentMode returns the user-facing deployment mode ("local" or "cloud"). + DeploymentMode() string + // Available reports whether this backend is usable in the current environment. Available(ctx context.Context) bool diff --git a/orchestrator/backend/docker.go b/orchestrator/backend/docker.go index 69f74a30..b5c09fa4 100644 --- a/orchestrator/backend/docker.go +++ b/orchestrator/backend/docker.go @@ -47,6 +47,7 @@ func NewDockerBackend(config DockerConfig, containerPrefix string) *DockerBacken } func (d *DockerBackend) Name() string { return "docker" } +func (d *DockerBackend) DeploymentMode() string { return DeployLocal } func (d *DockerBackend) NeedsCredentialInjection() bool { return false } func (d *DockerBackend) Available(ctx context.Context) bool { @@ -135,11 +136,12 @@ func (d *DockerBackend) Create(ctx context.Context, req CreateRequest) (*WorkerR } return &WorkerResult{ - Name: req.Name, - Backend: "docker", - Status: StatusRunning, - ContainerID: createResp.ID, - RawStatus: "running", + Name: req.Name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: StatusRunning, + ContainerID: createResp.ID, + RawStatus: "running", }, nil } @@ -218,9 +220,10 @@ func (d *DockerBackend) Status(ctx context.Context, name string) (*WorkerResult, if resp.StatusCode == http.StatusNotFound { return &WorkerResult{ - Name: name, - Backend: "docker", - Status: StatusNotFound, + Name: name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: StatusNotFound, }, nil } @@ -240,11 +243,12 @@ func (d *DockerBackend) Status(ctx context.Context, name string) (*WorkerResult, } return &WorkerResult{ - Name: name, - Backend: "docker", - Status: normalizeDockerStatus(inspectResp.State.Status), - ContainerID: inspectResp.ID, - RawStatus: inspectResp.State.Status, + Name: name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: normalizeDockerStatus(inspectResp.State.Status), + ContainerID: inspectResp.ID, + RawStatus: 
inspectResp.State.Status, }, nil } @@ -291,11 +295,12 @@ func (d *DockerBackend) List(ctx context.Context) ([]WorkerResult, error) { continue } results = append(results, WorkerResult{ - Name: name, - Backend: "docker", - Status: normalizeDockerStatus(c.State), - ContainerID: c.ID, - RawStatus: c.State, + Name: name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: normalizeDockerStatus(c.State), + ContainerID: c.ID, + RawStatus: c.State, }) } return results, nil diff --git a/orchestrator/backend/docker_test.go b/orchestrator/backend/docker_test.go index abc4c55e..689a1560 100644 --- a/orchestrator/backend/docker_test.go +++ b/orchestrator/backend/docker_test.go @@ -164,6 +164,9 @@ func TestDockerCreate(t *testing.T) { if result.Backend != "docker" { t.Errorf("expected backend docker, got %s", result.Backend) } + if result.DeploymentMode != DeployLocal { + t.Errorf("expected deployment_mode local, got %s", result.DeploymentMode) + } if result.Status != StatusRunning { t.Errorf("expected status running, got %s", result.Status) } diff --git a/orchestrator/backend/registry_test.go b/orchestrator/backend/registry_test.go index e7c3f87c..447279e8 100644 --- a/orchestrator/backend/registry_test.go +++ b/orchestrator/backend/registry_test.go @@ -12,6 +12,7 @@ type mockWorkerBackend struct { } func (m *mockWorkerBackend) Name() string { return m.name } +func (m *mockWorkerBackend) DeploymentMode() string { return DeployLocal } func (m *mockWorkerBackend) Available(_ context.Context) bool { return m.available } func (m *mockWorkerBackend) NeedsCredentialInjection() bool { return false } func (m *mockWorkerBackend) Create(_ context.Context, _ CreateRequest) (*WorkerResult, error) { return nil, nil } diff --git a/orchestrator/backend/sae.go b/orchestrator/backend/sae.go index 339263c6..96e8b702 100644 --- a/orchestrator/backend/sae.go +++ b/orchestrator/backend/sae.go @@ -82,6 +82,7 @@ func NewSAEBackendWithClient(client SAEClient, config SAEConfig, 
containerPrefix } func (s *SAEBackend) Name() string { return "sae" } +func (s *SAEBackend) DeploymentMode() string { return DeployCloud } func (s *SAEBackend) NeedsCredentialInjection() bool { return true } func (s *SAEBackend) Available(_ context.Context) bool { @@ -162,11 +163,12 @@ func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResu if current == "RUNNING" { log.Printf("[SAE] Application %s is RUNNING", appName) return &WorkerResult{ - Name: req.Name, - Backend: "sae", - Status: StatusRunning, - AppID: appID, - RawStatus: "RUNNING", + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusRunning, + AppID: appID, + RawStatus: "RUNNING", }, nil } if strings.Contains(current, "FAILED") { @@ -179,10 +181,11 @@ func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResu select { case <-ctx.Done(): return &WorkerResult{ - Name: req.Name, - Backend: "sae", - Status: StatusStarting, - AppID: appID, + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusStarting, + AppID: appID, }, nil case <-time.After(5 * time.Second): } @@ -190,10 +193,11 @@ func (s *SAEBackend) Create(ctx context.Context, req CreateRequest) (*WorkerResu log.Printf("[SAE] Application %s did not reach RUNNING within 120s", appName) return &WorkerResult{ - Name: req.Name, - Backend: "sae", - Status: StatusStarting, - AppID: appID, + Name: req.Name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusStarting, + AppID: appID, }, nil } @@ -264,9 +268,10 @@ func (s *SAEBackend) Status(_ context.Context, name string) (*WorkerResult, erro } if appID == "" { return &WorkerResult{ - Name: name, - Backend: "sae", - Status: StatusNotFound, + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: StatusNotFound, }, nil } @@ -283,11 +288,12 @@ func (s *SAEBackend) Status(_ context.Context, name string) (*WorkerResult, erro } return &WorkerResult{ - Name: name, - Backend: "sae", - 
Status: normalizeSAEStatus(rawStatus), - AppID: appID, - RawStatus: rawStatus, + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + Status: normalizeSAEStatus(rawStatus), + AppID: appID, + RawStatus: rawStatus, }, nil } @@ -314,9 +320,10 @@ func (s *SAEBackend) List(_ context.Context) ([]WorkerResult, error) { appID = *app.AppId } results = append(results, WorkerResult{ - Name: name, - Backend: "sae", - AppID: appID, + Name: name, + Backend: "sae", + DeploymentMode: DeployCloud, + AppID: appID, }) } return results, nil From d0b9981e38bf98ea80bc9134544e2cb27b72d66e Mon Sep 17 00:00:00 2001 From: jingze Date: Fri, 27 Mar 2026 16:43:10 +0800 Subject: [PATCH 09/11] fix(network): remove redundant ExtraHosts and duplicate network config after rebase Upstream refactor(network) replaced ExtraHosts with Docker network aliases on the manager container. Remove leftover ExtraHosts injection in create-worker.sh and duplicate hiclaw-net setup in install scripts. Co-Authored-By: Claude Opus 4.6 (1M context) --- install/hiclaw-install.ps1 | 4 ---- install/hiclaw-install.sh | 4 +--- .../scripts/create-worker.sh | 19 +------------------ 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/install/hiclaw-install.ps1 b/install/hiclaw-install.ps1 index 7c84c79a..dd9fa80c 100644 --- a/install/hiclaw-install.ps1 +++ b/install/hiclaw-install.ps1 @@ -2006,9 +2006,6 @@ function Install-Manager { # Start Docker API proxy if enabled if ($config.DOCKER_PROXY -eq "1") { $proxyImage = $script:ORCHESTRATOR_IMAGE - # Ensure Docker network exists (reuse if already present) - docker network inspect hiclaw-net *>$null - if ($LASTEXITCODE -ne 0) { docker network create hiclaw-net *>$null } Write-Log "Starting Docker API proxy..." 
docker rm -f hiclaw-orchestrator *>$null docker run -d --name hiclaw-orchestrator ` @@ -2019,7 +2016,6 @@ function Install-Manager { --restart unless-stopped ` $proxyImage $dockerArgs += @("-e", "HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375") - $dockerArgs += @("--network", "hiclaw-net") Write-Log (Get-Msg "docker_proxy.selected_enabled") } else { $dockerArgs += @("-v", "//var/run/docker.sock:/var/run/docker.sock") diff --git a/install/hiclaw-install.sh b/install/hiclaw-install.sh index da503805..39843237 100644 --- a/install/hiclaw-install.sh +++ b/install/hiclaw-install.sh @@ -2296,8 +2296,6 @@ EOF PROXY_ARGS="" if [ "${HICLAW_DOCKER_PROXY:-0}" = "1" ] && [ -n "${CONTAINER_SOCK:-}" ]; then local _proxy_image="${ORCHESTRATOR_IMAGE}" - # Ensure Docker network exists (reuse if already present) - ${DOCKER_CMD} network inspect hiclaw-net >/dev/null 2>&1 || ${DOCKER_CMD} network create hiclaw-net log "Starting Docker API proxy..." ${DOCKER_CMD} run -d \ --name hiclaw-orchestrator \ @@ -2307,7 +2305,7 @@ EOF ${HICLAW_PROXY_ALLOWED_REGISTRIES:+-e HICLAW_PROXY_ALLOWED_REGISTRIES="${HICLAW_PROXY_ALLOWED_REGISTRIES}"} \ --restart unless-stopped \ "${_proxy_image}" - PROXY_ARGS="-e HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375 --network hiclaw-net" + PROXY_ARGS="-e HICLAW_ORCHESTRATOR_URL=http://hiclaw-orchestrator:2375" SOCKET_MOUNT_ARGS="" # Manager no longer needs direct socket access fi diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 833b14b8..853c70bc 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -797,31 +797,14 @@ elif container_api_available; then } else . 
end') - # Build extra_hosts for local domains (map *-local.hiclaw.io to Manager IP) - MANAGER_IP=$(container_get_manager_ip) - EXTRA_HOSTS="[]" - if [ -z "${MANAGER_IP}" ]; then - log " WARNING: Could not detect Manager IP — worker may fail to resolve *-local.hiclaw.io domains" - fi - if [ -n "${MANAGER_IP}" ]; then - EXTRA_HOSTS=$(jq -cn --arg ip "${MANAGER_IP}" \ - --arg matrix "${HICLAW_MATRIX_DOMAIN%%:*}" \ - --arg matrix_client "${HICLAW_MATRIX_CLIENT_DOMAIN:-matrix-client-local.hiclaw.io}" \ - --arg aigw "${HICLAW_AI_GATEWAY_DOMAIN:-aigw-local.hiclaw.io}" \ - --arg fs "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ - '[$matrix, $matrix_client, $aigw, $fs] | map(select(endswith("-local.hiclaw.io"))) | map(. + ":" + $ip)') - fi - # Build create request body CREATE_BODY=$(jq -cn \ --arg name "${WORKER_NAME}" \ --arg image "${CUSTOM_IMAGE:-}" \ --arg runtime "${WORKER_RUNTIME}" \ --argjson env "${WORKER_ENV}" \ - --argjson extra_hosts "${EXTRA_HOSTS}" \ '{name: $name, runtime: $runtime, env: $env} - | if $image != "" then . + {image: $image} else . end - | if ($extra_hosts | length) > 0 then . + {extra_hosts: $extra_hosts} else . end') + | if $image != "" then . + {image: $image} else . 
end') CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true log " Create response: ${CREATE_OUTPUT:0:300}" From 256dbe3363a62ba7e3f5b2d557be8680b454c9e2 Mon Sep 17 00:00:00 2001 From: jingze Date: Fri, 27 Mar 2026 17:28:36 +0800 Subject: [PATCH 10/11] fix(orchestrator): restore Docker backend features lost in refactor - Add ensureImage() to auto-pull missing images before container create - Handle 409 Conflict by deleting existing container and retrying once - Add ExposedPorts/PortBindings support for CoPaw console port mapping with port conflict retry (up to 10 attempts) - Pass complete env vars (FS credentials, orchestrator URL) when recreating workers in lifecycle-worker.sh and start-manager-agent.sh - Pass HICLAW_WORKER_IMAGE and HICLAW_COPAW_WORKER_IMAGE to orchestrator container in install scripts so it knows which images to use - Extract console_host_port from orchestrator response in create-worker.sh and enable-worker-console.sh Co-Authored-By: Claude Opus 4.6 (1M context) --- install/hiclaw-install.ps1 | 2 + install/hiclaw-install.sh | 2 + .../scripts/create-worker.sh | 1 + .../scripts/enable-worker-console.sh | 2 + .../scripts/lifecycle-worker.sh | 20 +- manager/scripts/init/start-manager-agent.sh | 17 +- orchestrator/api/types.go | 17 +- orchestrator/api/worker_handler.go | 15 +- orchestrator/backend/backend.go | 15 +- orchestrator/backend/docker.go | 231 ++++++++++++++---- orchestrator/backend/docker_test.go | 57 ++++- 11 files changed, 303 insertions(+), 76 deletions(-) diff --git a/install/hiclaw-install.ps1 b/install/hiclaw-install.ps1 index dd9fa80c..c616fcc8 100644 --- a/install/hiclaw-install.ps1 +++ b/install/hiclaw-install.ps1 @@ -2012,6 +2012,8 @@ function Install-Manager { --network hiclaw-net ` -v "//var/run/docker.sock:/var/run/docker.sock" ` --security-opt label=disable ` + -e "HICLAW_WORKER_IMAGE=$($script:WORKER_IMAGE)" ` + -e "HICLAW_COPAW_WORKER_IMAGE=$($script:COPAW_WORKER_IMAGE)" ` $(if 
($config.PROXY_ALLOWED_REGISTRIES) { @("-e", "HICLAW_PROXY_ALLOWED_REGISTRIES=$($config.PROXY_ALLOWED_REGISTRIES)") }) ` --restart unless-stopped ` $proxyImage diff --git a/install/hiclaw-install.sh b/install/hiclaw-install.sh index 39843237..c03dfcd6 100644 --- a/install/hiclaw-install.sh +++ b/install/hiclaw-install.sh @@ -2302,6 +2302,8 @@ EOF --network hiclaw-net \ -v "${CONTAINER_SOCK}:/var/run/docker.sock" \ --security-opt label=disable \ + -e HICLAW_WORKER_IMAGE="${WORKER_IMAGE}" \ + -e HICLAW_COPAW_WORKER_IMAGE="${COPAW_WORKER_IMAGE}" \ ${HICLAW_PROXY_ALLOWED_REGISTRIES:+-e HICLAW_PROXY_ALLOWED_REGISTRIES="${HICLAW_PROXY_ALLOWED_REGISTRIES}"} \ --restart unless-stopped \ "${_proxy_image}" diff --git a/manager/agent/skills/worker-management/scripts/create-worker.sh b/manager/agent/skills/worker-management/scripts/create-worker.sh index 853c70bc..268ff6b1 100644 --- a/manager/agent/skills/worker-management/scripts/create-worker.sh +++ b/manager/agent/skills/worker-management/scripts/create-worker.sh @@ -811,6 +811,7 @@ elif container_api_available; then CREATE_STATUS=$(echo "${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) + CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | jq -r '.console_host_port // empty' 2>/dev/null) if [ "${CREATE_STATUS}" = "running" ] || [ "${CREATE_STATUS}" = "starting" ]; then DEPLOY_MODE=$(echo "${CREATE_OUTPUT}" | jq -r '.deployment_mode // "local"' 2>/dev/null) diff --git a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh index 1be01e81..8f95d839 100755 --- a/manager/agent/skills/worker-management/scripts/enable-worker-console.sh +++ b/manager/agent/skills/worker-management/scripts/enable-worker-console.sh @@ -121,6 +121,8 @@ CREATE_BODY=$(jq -cn \ CREATE_OUTPUT=$(worker_backend_create "${CREATE_BODY}" 2>/dev/null) || true CREATE_STATUS=$(echo 
"${CREATE_OUTPUT}" | jq -r '.status // "error"' 2>/dev/null) +CONTAINER_ID=$(echo "${CREATE_OUTPUT}" | jq -r '.container_id // empty' 2>/dev/null) +CONSOLE_HOST_PORT=$(echo "${CREATE_OUTPUT}" | jq -r '.console_host_port // empty' 2>/dev/null) if [ "${CREATE_STATUS}" != "running" ] && [ "${CREATE_STATUS}" != "starting" ]; then log "ERROR: Failed to recreate container" diff --git a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh index d54410c6..958674de 100755 --- a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh +++ b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh @@ -392,12 +392,28 @@ action_start() { local runtime runtime=$(jq -r --arg w "$worker" '.workers[$w].runtime // "openclaw"' "$REGISTRY_FILE" 2>/dev/null) - # Build create request for orchestrator + # Build create request for orchestrator (include env vars for worker to function) + local env_map + env_map=$(jq -cn \ + --arg name "$worker" \ + --arg fak "$worker" \ + --arg fsk "${WORKER_MINIO_PASSWORD:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ + '{ + "HICLAW_WORKER_NAME": $name, + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fak, + "HICLAW_FS_SECRET_KEY": $fsk + } + | if $orchestrator_url != "" then . + {"HICLAW_ORCHESTRATOR_URL": $orchestrator_url} else . 
end') + local create_body create_body=$(jq -cn \ --arg name "$worker" \ --arg runtime "$runtime" \ - '{name: $name, runtime: $runtime}') + --argjson env "$env_map" \ + '{name: $name, runtime: $runtime, env: $env}') worker_backend_create "$create_body" > /dev/null 2>&1 && ok=true else _log "Starting worker $worker (status: $status)" diff --git a/manager/scripts/init/start-manager-agent.sh b/manager/scripts/init/start-manager-agent.sh index 1c1943b4..cb100c2d 100755 --- a/manager/scripts/init/start-manager-agent.sh +++ b/manager/scripts/init/start-manager-agent.sh @@ -794,8 +794,21 @@ if container_api_available; then _runtime=$(jq -r --arg w "${_worker_name}" '.workers[$w].runtime // "openclaw"' "${REGISTRY_FILE}" 2>/dev/null) _recreated=false for _attempt in 1 2 3; do - local _create_body - _create_body=$(jq -cn --arg name "${_worker_name}" --arg runtime "${_runtime}" '{name: $name, runtime: $runtime}') + local _env_map _create_body + _env_map=$(jq -cn \ + --arg name "${_worker_name}" \ + --arg fak "${_worker_name}" \ + --arg fsk "${WORKER_MINIO_PASSWORD:-}" \ + --arg fs_domain "${HICLAW_FS_DOMAIN:-fs-local.hiclaw.io}" \ + --arg orchestrator_url "${HICLAW_ORCHESTRATOR_URL:-}" \ + '{ + "HICLAW_WORKER_NAME": $name, + "HICLAW_FS_ENDPOINT": ("http://" + ($fs_domain | split(":")[0]) + ":8080"), + "HICLAW_FS_ACCESS_KEY": $fak, + "HICLAW_FS_SECRET_KEY": $fsk + } + | if $orchestrator_url != "" then . + {"HICLAW_ORCHESTRATOR_URL": $orchestrator_url} else . end') + _create_body=$(jq -cn --arg name "${_worker_name}" --arg runtime "${_runtime}" --argjson env "${_env_map}" '{name: $name, runtime: $runtime, env: $env}') worker_backend_create "${_create_body}" > /dev/null 2>&1 && _recreated=true && break log " Attempt ${_attempt}/3 failed for ${_worker_name}, retrying in $((5 * _attempt))s..." 
sleep $((5 * _attempt)) diff --git a/orchestrator/api/types.go b/orchestrator/api/types.go index 9b1ef06f..18b55d01 100644 --- a/orchestrator/api/types.go +++ b/orchestrator/api/types.go @@ -18,14 +18,15 @@ type CreateWorkerRequest struct { // WorkerResponse is the JSON response for worker operations. type WorkerResponse struct { - Name string `json:"name"` - Backend string `json:"backend"` - DeploymentMode string `json:"deployment_mode"` - Status backend.WorkerStatus `json:"status"` - ContainerID string `json:"container_id,omitempty"` - AppID string `json:"app_id,omitempty"` - RawStatus string `json:"raw_status,omitempty"` - APIKey string `json:"api_key,omitempty"` + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status backend.WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` + APIKey string `json:"api_key,omitempty"` + ConsoleHostPort string `json:"console_host_port,omitempty"` } // WorkerListResponse is the JSON response for GET /workers. 
diff --git a/orchestrator/api/worker_handler.go b/orchestrator/api/worker_handler.go index 3298c73f..5bc0e96e 100644 --- a/orchestrator/api/worker_handler.go +++ b/orchestrator/api/worker_handler.go @@ -273,13 +273,14 @@ func (h *WorkerHandler) mergeReadiness(name string, status backend.WorkerStatus) func toWorkerResponse(r *backend.WorkerResult) WorkerResponse { return WorkerResponse{ - Name: r.Name, - Backend: r.Backend, - DeploymentMode: r.DeploymentMode, - Status: r.Status, - ContainerID: r.ContainerID, - AppID: r.AppID, - RawStatus: r.RawStatus, + Name: r.Name, + Backend: r.Backend, + DeploymentMode: r.DeploymentMode, + Status: r.Status, + ContainerID: r.ContainerID, + AppID: r.AppID, + RawStatus: r.RawStatus, + ConsoleHostPort: r.ConsoleHostPort, } } diff --git a/orchestrator/backend/backend.go b/orchestrator/backend/backend.go index 3853c2f3..572c7af9 100644 --- a/orchestrator/backend/backend.go +++ b/orchestrator/backend/backend.go @@ -58,13 +58,14 @@ const ( // WorkerResult holds the result of a worker operation. type WorkerResult struct { - Name string `json:"name"` - Backend string `json:"backend"` - DeploymentMode string `json:"deployment_mode"` - Status WorkerStatus `json:"status"` - ContainerID string `json:"container_id,omitempty"` - AppID string `json:"app_id,omitempty"` - RawStatus string `json:"raw_status,omitempty"` + Name string `json:"name"` + Backend string `json:"backend"` + DeploymentMode string `json:"deployment_mode"` + Status WorkerStatus `json:"status"` + ContainerID string `json:"container_id,omitempty"` + AppID string `json:"app_id,omitempty"` + RawStatus string `json:"raw_status,omitempty"` + ConsoleHostPort string `json:"console_host_port,omitempty"` } // WorkerBackend defines the interface for worker lifecycle operations. 
diff --git a/orchestrator/backend/docker.go b/orchestrator/backend/docker.go index b5c09fa4..5af5e7aa 100644 --- a/orchestrator/backend/docker.go +++ b/orchestrator/backend/docker.go @@ -5,11 +5,14 @@ import ( "encoding/json" "fmt" "io" + "log" + "math/rand" "net" "net/http" "net/url" "os" "sort" + "strconv" "strings" "time" ) @@ -96,53 +99,117 @@ func (d *DockerBackend) Create(ctx context.Context, req CreateRequest) (*WorkerR } } - payload := d.buildCreatePayload(req) - body, err := json.Marshal(payload) - if err != nil { - return nil, fmt.Errorf("marshal create payload: %w", err) + // Ensure image is available locally, pull if needed + if err := d.ensureImage(ctx, req.Image); err != nil { + return nil, err } - u := fmt.Sprintf("http://localhost/containers/create?name=%s", url.QueryEscape(containerName)) - httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, strings.NewReader(string(body))) - if err != nil { - return nil, fmt.Errorf("build create request: %w", err) + // Detect console port from env (for CoPaw workers) + consolePort := "" + if req.Env != nil { + consolePort = req.Env["HICLAW_CONSOLE_PORT"] } - httpReq.Header.Set("Content-Type", "application/json") - resp, err := d.client.Do(httpReq) - if err != nil { - return nil, fmt.Errorf("docker create: %w", err) + // Pick a random host port for console binding + hostPort := 0 + if consolePort != "" { + hostPort = 10000 + rand.Intn(10001) } - defer resp.Body.Close() - respBody, _ := io.ReadAll(resp.Body) + const maxPortRetries = 10 + for attempt := 0; ; attempt++ { + payload := d.buildCreatePayload(req, consolePort, hostPort) + body, err := json.Marshal(payload) + if err != nil { + return nil, fmt.Errorf("marshal create payload: %w", err) + } - if resp.StatusCode == http.StatusConflict { - return nil, fmt.Errorf("%w: container %q", ErrConflict, containerName) - } - if resp.StatusCode != http.StatusCreated { - return nil, fmt.Errorf("docker create failed (status %d): %s", resp.StatusCode, 
string(respBody)) - } + containerID, err := d.doCreate(ctx, containerName, body) + if err != nil { + return nil, err + } - var createResp struct { - ID string `json:"Id"` - } - if err := json.Unmarshal(respBody, &createResp); err != nil { - return nil, fmt.Errorf("parse create response: %w", err) - } + // Start the container + startErr := d.startContainer(ctx, containerID) + if startErr == nil { + result := &WorkerResult{ + Name: req.Name, + Backend: "docker", + DeploymentMode: DeployLocal, + Status: StatusRunning, + ContainerID: containerID, + RawStatus: "running", + } + if consolePort != "" && hostPort > 0 { + result.ConsoleHostPort = strconv.Itoa(hostPort) + log.Printf("[Docker] Console: container port %s -> host port %d", consolePort, hostPort) + } + return result, nil + } + + // Check if start failed due to port conflict — retry with different port + errMsg := startErr.Error() + if consolePort != "" && attempt < maxPortRetries && + (strings.Contains(errMsg, "already allocated") || + strings.Contains(errMsg, "address already in use") || + strings.Contains(errMsg, "port is already")) { + log.Printf("[Docker] Host port %d in use, retrying with %d...", hostPort, hostPort+1) + hostPort++ + // Clean up the container we just created + d.Delete(ctx, req.Name) + time.Sleep(500 * time.Millisecond) + continue + } - if err := d.startContainer(ctx, createResp.ID); err != nil { - return nil, fmt.Errorf("start after create: %w", err) + return nil, fmt.Errorf("start after create: %w", startErr) } +} - return &WorkerResult{ - Name: req.Name, - Backend: "docker", - DeploymentMode: DeployLocal, - Status: StatusRunning, - ContainerID: createResp.ID, - RawStatus: "running", - }, nil +// doCreate sends the container create request to Docker, handling conflict by +// deleting the existing container and retrying once. 
+func (d *DockerBackend) doCreate(ctx context.Context, containerName string, body []byte) (string, error) { + for retry := 0; retry < 2; retry++ { + u := fmt.Sprintf("http://localhost/containers/create?name=%s", url.QueryEscape(containerName)) + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, u, strings.NewReader(string(body))) + if err != nil { + return "", fmt.Errorf("build create request: %w", err) + } + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := d.client.Do(httpReq) + if err != nil { + return "", fmt.Errorf("docker create: %w", err) + } + respBody, _ := io.ReadAll(resp.Body) + resp.Body.Close() + + if resp.StatusCode == http.StatusConflict && retry == 0 { + // Remove existing container and retry once + log.Printf("[Docker] Container %s already exists, removing before recreate", containerName) + // Extract worker name from container name + name := strings.TrimPrefix(containerName, d.containerPrefix) + if err := d.Delete(ctx, name); err != nil { + return "", fmt.Errorf("delete existing container: %w", err) + } + time.Sleep(1 * time.Second) + continue + } + if resp.StatusCode == http.StatusConflict { + return "", fmt.Errorf("%w: container %q", ErrConflict, containerName) + } + if resp.StatusCode != http.StatusCreated { + return "", fmt.Errorf("docker create failed (status %d): %s", resp.StatusCode, string(respBody)) + } + + var createResp struct { + ID string `json:"Id"` + } + if err := json.Unmarshal(respBody, &createResp); err != nil { + return "", fmt.Errorf("parse create response: %w", err) + } + return createResp.ID, nil + } + return "", fmt.Errorf("docker create: exhausted retries") } func (d *DockerBackend) Delete(ctx context.Context, name string) error { @@ -308,6 +375,59 @@ func (d *DockerBackend) List(ctx context.Context) ([]WorkerResult, error) { // --- internal helpers --- +// ensureImage checks if an image exists locally and pulls it if not. 
+func (d *DockerBackend) ensureImage(ctx context.Context, image string) error { + // Check if image exists locally + // Note: Docker Engine API expects unescaped image names in the path + // (e.g. /images/hiclaw/worker-agent:latest/json), not PathEscaped. + u := "http://localhost/images/" + image + "/json" + req, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return fmt.Errorf("build image inspect request: %w", err) + } + resp, err := d.client.Do(req) + if err != nil { + return fmt.Errorf("docker image inspect: %w", err) + } + resp.Body.Close() + + if resp.StatusCode == http.StatusOK { + return nil // image exists + } + + // Pull the image + log.Printf("[Docker] Image not found locally, pulling: %s", image) + pullURL := fmt.Sprintf("http://localhost/images/create?fromImage=%s", url.QueryEscape(image)) + pullReq, err := http.NewRequestWithContext(ctx, http.MethodPost, pullURL, nil) + if err != nil { + return fmt.Errorf("build image pull request: %w", err) + } + pullResp, err := d.client.Do(pullReq) + if err != nil { + return fmt.Errorf("docker image pull: %w", err) + } + // Read full body to wait for pull completion (Docker streams progress JSON) + io.Copy(io.Discard, pullResp.Body) + pullResp.Body.Close() + + // Verify image is now available + verifyReq, err := http.NewRequestWithContext(ctx, http.MethodGet, u, nil) + if err != nil { + return fmt.Errorf("build image verify request: %w", err) + } + verifyResp, err := d.client.Do(verifyReq) + if err != nil { + return fmt.Errorf("docker image verify: %w", err) + } + verifyResp.Body.Close() + + if verifyResp.StatusCode != http.StatusOK { + return fmt.Errorf("failed to pull image %s", image) + } + log.Printf("[Docker] Image pulled successfully: %s", image) + return nil +} + func (d *DockerBackend) startContainer(ctx context.Context, nameOrID string) error { u := fmt.Sprintf("http://localhost/containers/%s/start", url.PathEscape(nameOrID)) req, err := http.NewRequestWithContext(ctx, 
// dockerCreatePayload is the Docker Engine API container create body.
type dockerCreatePayload struct {
	Image      string   `json:"Image"`
	Env        []string `json:"Env,omitempty"`
	WorkingDir string   `json:"WorkingDir,omitempty"`
	// ExposedPorts is keyed by "port/tcp" strings (see buildCreatePayload).
	ExposedPorts map[string]struct{} `json:"ExposedPorts,omitempty"`
	HostConfig   *dockerHostConfig   `json:"HostConfig,omitempty"`
}

// dockerHostConfig mirrors the HostConfig section of the Docker Engine
// container create body: network selection plus host-side port bindings.
type dockerHostConfig struct {
	NetworkMode string   `json:"NetworkMode,omitempty"`
	ExtraHosts  []string `json:"ExtraHosts,omitempty"`
	// PortBindings maps a "port/tcp" container port key to the host ports
	// it should be published on.
	PortBindings map[string][]dockerPortBinding `json:"PortBindings,omitempty"`
}

// dockerPortBinding publishes a container port on a single host port.
type dockerPortBinding struct {
	HostPort string `json:"HostPort"`
}
strconv.Itoa(hostPort)}}, } } + if hc.NetworkMode != "" || len(hc.ExtraHosts) > 0 || len(hc.PortBindings) > 0 { + p.HostConfig = hc + } + return p } diff --git a/orchestrator/backend/docker_test.go b/orchestrator/backend/docker_test.go index 689a1560..672ea054 100644 --- a/orchestrator/backend/docker_test.go +++ b/orchestrator/backend/docker_test.go @@ -16,9 +16,38 @@ func mockDockerAPI(t *testing.T) *httptest.Server { // In-memory container store containers := map[string]map[string]interface{}{} + // In-memory image store (pre-populated with common test images) + images := map[string]bool{ + "hiclaw/worker-agent:latest": true, + "hiclaw/copaw-worker:latest": true, + "img:latest": true, + } mux := http.NewServeMux() + // GET /images/{name}/json — check if image exists + mux.HandleFunc("GET /images/", func(w http.ResponseWriter, r *http.Request) { + // Extract image name from path (strip /images/ prefix and /json suffix) + path := strings.TrimPrefix(r.URL.Path, "/images/") + path = strings.TrimSuffix(path, "/json") + if images[path] { + json.NewEncoder(w).Encode(map[string]string{"Id": "sha256-" + path}) + return + } + w.WriteHeader(http.StatusNotFound) + json.NewEncoder(w).Encode(map[string]string{"message": "not found"}) + }) + + // POST /images/create — pull image + mux.HandleFunc("POST /images/create", func(w http.ResponseWriter, r *http.Request) { + fromImage := r.URL.Query().Get("fromImage") + if fromImage != "" { + images[fromImage] = true + } + w.WriteHeader(http.StatusOK) + w.Write([]byte(`{"status":"Pull complete"}`)) + }) + // POST /containers/create?name=xxx mux.HandleFunc("POST /containers/create", func(w http.ResponseWriter, r *http.Request) { name := r.URL.Query().Get("name") @@ -185,9 +214,31 @@ func TestDockerCreateConflict(t *testing.T) { t.Fatalf("first create failed: %v", err) } - _, err = b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) - if err == nil { - t.Error("expected conflict error on duplicate create") + 
// Second create should succeed — auto-deletes existing container and retries + result, err := b.Create(context.Background(), CreateRequest{Name: "alice", Image: "img:latest"}) + if err != nil { + t.Fatalf("second create should succeed (auto-delete+retry), got: %v", err) + } + if result.Name != "alice" { + t.Errorf("expected name alice, got %s", result.Name) + } +} + +func TestDockerCreatePullsImage(t *testing.T) { + srv := mockDockerAPI(t) + defer srv.Close() + b := newTestDockerBackend(t, srv.URL) + + // Use an image that doesn't exist in the mock store — it should be pulled + result, err := b.Create(context.Background(), CreateRequest{ + Name: "puller", + Image: "custom/image:v2", + }) + if err != nil { + t.Fatalf("Create with image pull failed: %v", err) + } + if result.Status != StatusRunning { + t.Errorf("expected running, got %s", result.Status) } } From 90095d76ce427a91c45ff9b3741831eab8ef3fd6 Mon Sep 17 00:00:00 2001 From: jingze Date: Mon, 30 Mar 2026 13:44:04 +0800 Subject: [PATCH 11/11] =?UTF-8?q?fix(orchestrator):=20address=20review=20f?= =?UTF-8?q?eedback=20=E2=80=94=20readiness=20heartbeat,=20key=20persist=20?= =?UTF-8?q?docs,=20delete=20bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Worker/CoPaw readiness reporters now heartbeat every 60s after initial ready, so orchestrator restarts self-heal without persistence - Add comment documenting persist-outside-lock trade-off in keys.go - Fix _detect_worker_backend call in lifecycle-worker.sh action_delete (function was removed in refactor, replaced with container_api_available) - Add backward-compat env var fallback for HICLAW_INSTALL_DOCKER_PROXY_IMAGE - Update stale comment in copaw-worker-entrypoint.sh Co-Authored-By: Claude Opus 4.6 (1M context) --- copaw/scripts/copaw-worker-entrypoint.sh | 20 ++++++++++++++++--- install/hiclaw-install.sh | 2 ++ .../scripts/lifecycle-worker.sh | 8 +++----- orchestrator/auth/keys.go | 4 ++++ 
worker/scripts/worker-entrypoint.sh | 18 +++++++++++++++-- 5 files changed, 42 insertions(+), 10 deletions(-) diff --git a/copaw/scripts/copaw-worker-entrypoint.sh b/copaw/scripts/copaw-worker-entrypoint.sh index 535c113d..f3de436a 100755 --- a/copaw/scripts/copaw-worker-entrypoint.sh +++ b/copaw/scripts/copaw-worker-entrypoint.sh @@ -7,7 +7,7 @@ # - HICLAW_CONSOLE_PORT set → standard mode (copaw-worker, PyPI CoPaw venv) # - HICLAW_CONSOLE_PORT unset → lite mode (lite-copaw-worker, lite CoPaw venv) # -# Environment variables (set by container_create_worker in container-api.sh): +# Environment variables (set by orchestrator during worker creation): # HICLAW_WORKER_NAME - Worker name (required) # HICLAW_FS_ENDPOINT - MinIO endpoint (required in local mode) # HICLAW_FS_ACCESS_KEY - MinIO access key (required in local mode) @@ -72,6 +72,7 @@ _start_readiness_reporter() { [ -n "${HICLAW_WORKER_API_KEY:-}" ] && auth_header="Authorization: Bearer ${HICLAW_WORKER_API_KEY}" ( + # Phase 1: Wait for initial readiness (with timeout) TIMEOUT=120; ELAPSED=0 CONFIG_FILE="${INSTALL_DIR}/${WORKER_NAME}/.copaw/config.json" while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do @@ -80,7 +81,7 @@ _start_readiness_reporter() { if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ ${auth_header:+-H "${auth_header}"} 2>/dev/null; then log "Reported ready to orchestrator" - exit 0 + break 2 fi sleep 2 done @@ -88,7 +89,20 @@ _start_readiness_reporter() { fi sleep 5; ELAPSED=$((ELAPSED + 5)) done - log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + + if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then + log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + exit 1 + fi + + # Phase 2: Periodic heartbeat (every 60s) — self-heals after orchestrator restart + while true; do + sleep 60 + if [ -f "${CONFIG_FILE}" ] && grep -q '"channels"' "${CONFIG_FILE}" 2>/dev/null; then + curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + 
${auth_header:+-H "${auth_header}"} 2>/dev/null || true + fi + done ) & log "Background readiness reporter started (PID: $!)" } diff --git a/install/hiclaw-install.sh b/install/hiclaw-install.sh index c03dfcd6..38669305 100644 --- a/install/hiclaw-install.sh +++ b/install/hiclaw-install.sh @@ -837,6 +837,8 @@ detect_registry() { } HICLAW_REGISTRY="${HICLAW_REGISTRY:-$(detect_registry)}" +# Backward compatibility: accept old env var names from previous versions +HICLAW_INSTALL_ORCHESTRATOR_IMAGE="${HICLAW_INSTALL_ORCHESTRATOR_IMAGE:-${HICLAW_INSTALL_DOCKER_PROXY_IMAGE:-}}" # Image variables are resolved after version selection in step_version(). # These placeholders allow early code paths to reference them without errors. MANAGER_IMAGE="${HICLAW_INSTALL_MANAGER_IMAGE:-}" diff --git a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh index 958674de..e6957823 100755 --- a/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh +++ b/manager/agent/skills/worker-management/scripts/lifecycle-worker.sh @@ -329,19 +329,17 @@ action_delete() { _init_lifecycle_file _ensure_worker_entry "$worker" - local backend - backend=$(_detect_worker_backend) - if [ "$backend" = "none" ]; then + if ! 
container_api_available 2>/dev/null; then _log "ERROR: No worker backend available" return 1 fi # Stop first (ignore errors — may already be stopped) - _log "Stopping worker $worker before delete (backend=$backend)" + _log "Stopping worker $worker before delete" worker_backend_stop "$worker" 2>/dev/null || true # Delete container - _log "Deleting worker $worker container (backend=$backend)" + _log "Deleting worker $worker container" if worker_backend_delete "$worker"; then _log "Worker $worker container deleted" else diff --git a/orchestrator/auth/keys.go b/orchestrator/auth/keys.go index 40252dba..47a3b88f 100644 --- a/orchestrator/auth/keys.go +++ b/orchestrator/auth/keys.go @@ -76,7 +76,11 @@ func (ks *KeyStore) GenerateWorkerKey(workerName string) string { snapshot := ks.snapshotLocked() ks.mu.Unlock() + // persist outside lock: avoids blocking ValidateKey() readers during network I/O. + // Trade-off: concurrent GenerateWorkerKey calls could persist stale snapshots, + // but key ops are rare and in-memory state is always correct. 
ks.persist(snapshot) + return key } diff --git a/worker/scripts/worker-entrypoint.sh b/worker/scripts/worker-entrypoint.sh index 8e876eaa..06509cde 100755 --- a/worker/scripts/worker-entrypoint.sh +++ b/worker/scripts/worker-entrypoint.sh @@ -281,6 +281,7 @@ if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ]; then AUTH_HEADER="" [ -n "${HICLAW_WORKER_API_KEY:-}" ] && AUTH_HEADER="Authorization: Bearer ${HICLAW_WORKER_API_KEY}" + # Phase 1: Wait for initial readiness (with timeout) TIMEOUT=120; ELAPSED=0 while [ "${ELAPSED}" -lt "${TIMEOUT}" ]; do if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then @@ -288,7 +289,7 @@ if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ]; then if curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ ${AUTH_HEADER:+-H "${AUTH_HEADER}"} 2>/dev/null; then log "Reported ready to orchestrator" - exit 0 + break 2 fi sleep 2 done @@ -296,7 +297,20 @@ if [ -n "${HICLAW_ORCHESTRATOR_URL:-}" ]; then fi sleep 5; ELAPSED=$((ELAPSED + 5)) done - log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + + if [ "${ELAPSED}" -ge "${TIMEOUT}" ]; then + log "WARNING: readiness reporter timed out after ${TIMEOUT}s" + exit 1 + fi + + # Phase 2: Periodic heartbeat (every 60s) — self-heals after orchestrator restart + while true; do + sleep 60 + if openclaw gateway health --json 2>/dev/null | grep -q '"ok"' 2>/dev/null; then + curl -sf -X POST "${HICLAW_ORCHESTRATOR_URL}/workers/${WORKER_NAME}/ready" \ + ${AUTH_HEADER:+-H "${AUTH_HEADER}"} 2>/dev/null || true + fi + done ) & log "Background readiness reporter started (PID: $!)" fi